├── .formatter.exs ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── LICENSE ├── README.md ├── bench └── comparison.exs ├── config └── config.exs ├── lib └── blex.ex ├── mix.exs ├── mix.lock └── test ├── blex_test.exs └── test_helper.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test,bench}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Elixir CI 2 | 3 | on: push 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | container: 11 | image: elixir:1.9.1-slim 12 | 13 | steps: 14 | - uses: actions/checkout@v1 15 | - name: Install Dependencies 16 | run: | 17 | mix local.rebar --force 18 | mix local.hex --force 19 | mix deps.get 20 | - name: Run Tests 21 | run: mix test 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 
23 | blex-*.tar 24 | 25 | .elixir_ls 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 yunsong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blex 2 | 3 | Blex is a fast Bloom filter with **concurrent accessibility**, powered by [`:atomics`](http://erlang.org/doc/man/atomics.html) module. 
4 | 5 | ## Features 6 | 7 | * Fixed size Bloom filter 8 | * Concurrent reads & writes 9 | * Serialization 10 | * Merge multiple Bloom filters into one 11 | * Only one copy of data because data is saved in either `:atomics` or binary (if > 64 bytes) 12 | * Custom hash functions 13 | 14 | ## Example 15 | 16 | ```elixir 17 | iex> b = Blex.new(1000, 0.01) 18 | iex> Task.async(fn -> Blex.put(b, "hello") end) |> Task.await() 19 | iex> Task.async(fn -> Blex.put(b, "world") end) |> Task.await() 20 | iex> Blex.member?(b, "hello") 21 | true 22 | iex> Blex.member?(b, "world") 23 | true 24 | iex> Blex.member?(b, "others") 25 | false 26 | ``` 27 | 28 | ## Installation 29 | 30 | **Note**: it requires OTP-21.2.1 or later. OTP-21.2 is not suitable due to an [issue](https://github.com/erlang/otp/pull/2061). 31 | 32 | It can be installed by adding `blex` to your list of dependencies in `mix.exs`: 33 | 34 | ```elixir 35 | def deps do 36 | [ 37 | {:blex, "~> 0.2"} 38 | ] 39 | end 40 | ``` 41 | 42 | ## Documentation 43 | 44 | Documentation can be found at [hexdocs.pm/blex/Blex.html](https://hexdocs.pm/blex/Blex.html). 45 | 46 | ## Benchmarking 47 | 48 | Compared to an alternative Bloom filter powered by the `:array` module, 49 | 50 | Blex is faster for read operations: 51 | 52 | ``` 53 | Operating System: macOS" 54 | CPU Information: Intel(R) Core(TM) i7-3720QM CPU @ 2.60GHz 55 | Number of Available Cores: 8 56 | Available memory: 16 GB 57 | Elixir 1.7.4 58 | Erlang 21.2.2 59 | 60 | Benchmark suite executing with the following configuration: 61 | warmup: 2 s 62 | time: 5 s 63 | memory time: 0 μs 64 | parallel: 1 65 | inputs: none specified 66 | Estimated total run time: 21 s 67 | 68 | 69 | Benchmarking Blex.members?... 70 | Benchmarking Blex.members? with binary format... 71 | Benchmarking Bloomex.members?... 72 | 73 | Name ips average deviation median 99th % 74 | Blex.members? with binary format 0.69 1.44 s ±0.23% 1.44 s 1.44 s 75 | Blex.members? 
0.63 1.58 s ±0.61% 1.58 s 1.58 s 76 | Bloomex.members? 0.40 2.51 s ±0.00% 2.51 s 2.51 s 77 | 78 | Comparison: 79 | Blex.members? with binary format 0.69 80 | Blex.members? 0.63 - 1.09x slower 81 | Bloomex.members? 0.40 - 1.74x slower 82 | ``` 83 | 84 | Blex is much faster for write operations: 85 | 86 | ``` 87 | Operating System: macOS" 88 | CPU Information: Intel(R) Core(TM) i7-3720QM CPU @ 2.60GHz 89 | Number of Available Cores: 8 90 | Available memory: 16 GB 91 | Elixir 1.7.4 92 | Erlang 21.2.2 93 | 94 | Benchmark suite executing with the following configuration: 95 | warmup: 2 s 96 | time: 10 s 97 | memory time: 0 μs 98 | parallel: 1 99 | inputs: none specified 100 | Estimated total run time: 24 s 101 | 102 | 103 | Benchmarking Blex.put... 104 | Benchmarking Bloomex.add... 105 | 106 | Name ips average deviation median 99th % 107 | Blex.put 0.44 2.25 s ±3.98% 2.30 s 2.33 s 108 | Bloomex.add 0.126 7.91 s ±0.22% 7.91 s 7.92 s 109 | 110 | Comparison: 111 | Blex.put 0.44 112 | Bloomex.add 0.126 - 3.51x slower 113 | ``` 114 | 115 | The benchmarking script above is available at `bench/comparison.exs`. 116 | 117 | ## Implementation 118 | 119 | Instead of a traditional Bloom filter, a partitioned Bloom filter (a Bloom filter variant described in section 3 of 120 | [the paper](http://gsd.di.uminho.pt/members/cbm/ps/dbloom.pdf)) is used for performance benefits. The partitioned 121 | Bloom filter partitions the bit array into **k** parts, where **k** is the number of hash functions. Each hash function 122 | only reads & writes bits from its own partition. This brings the following benefits: 123 | 124 | * Reduce hash function (`:erlang.phash2`) calls in some cases. 125 | * Speed up `Blex.estimate_size` by scanning only part of the bits. 
126 | 127 | ## License 128 | 129 | MIT 130 | -------------------------------------------------------------------------------- /bench/comparison.exs: -------------------------------------------------------------------------------- 1 | defmodule Blex.Bench.Comparison do 2 | def loop(0, x, _), do: x 3 | 4 | def loop(n, x, f) do 5 | loop(n - 1, f.(x), f) 6 | end 7 | 8 | @capacity 1_000_000 9 | @false_positive_probability 0.01 10 | @n 1_000_000 11 | 12 | def run_estimation() do 13 | Benchee.run(%{ 14 | "Blex.estimate_size?" => 15 | {fn b -> 16 | Blex.estimate_size(b) 17 | end, 18 | before_each: fn _ -> 19 | b = Blex.new(@capacity, @false_positive_probability) 20 | 21 | loop(@n, b, fn b -> 22 | Blex.put(b, :rand.uniform(1000_000)) 23 | b 24 | end) 25 | 26 | b 27 | end}, 28 | "Blex.estimate_size? with binary format" => 29 | {fn b -> 30 | Blex.estimate_size(b) 31 | end, 32 | before_each: fn _ -> 33 | b = Blex.new(@capacity, @false_positive_probability) 34 | 35 | loop(@n, b, fn b -> 36 | Blex.put(b, :rand.uniform(1000_000)) 37 | b 38 | end) 39 | 40 | Blex.encode(b) 41 | end} 42 | }) 43 | end 44 | 45 | def run_serialization() do 46 | Benchee.run(%{ 47 | "Blex.encode" => 48 | {fn b -> 49 | Blex.encode(b) 50 | end, 51 | before_each: fn _ -> 52 | Blex.new(@capacity, @false_positive_probability) 53 | end}, 54 | "Blex.decode" => 55 | {fn encoded -> 56 | Blex.decode(encoded) 57 | end, 58 | before_each: fn _ -> 59 | Blex.new(@capacity, @false_positive_probability) 60 | |> Blex.encode() 61 | end}, 62 | "Blex.merge" => 63 | {fn list -> 64 | Blex.merge(list) 65 | end, 66 | before_each: fn _ -> 67 | b1 = Blex.new(@capacity, @false_positive_probability) 68 | b2 = Blex.new(@capacity, @false_positive_probability) 69 | [b1, b2] 70 | end}, 71 | "Blex.merge_encode" => 72 | {fn list -> 73 | Blex.merge_encode(list) 74 | end, 75 | before_each: fn _ -> 76 | b1 = Blex.new(@capacity, @false_positive_probability) 77 | b2 = Blex.new(@capacity, @false_positive_probability) 78 | [b1, b2] 79 | end} 80 
| }) 81 | end 82 | 83 | def run_read_operation() do 84 | Benchee.run(%{ 85 | "Bloomex.members?" => 86 | {fn b -> 87 | loop(@n, b, fn b -> 88 | Bloomex.member?(b, :rand.uniform(1000_000)) 89 | b 90 | end) 91 | end, 92 | before_each: fn _ -> 93 | b = Bloomex.plain(@capacity, @false_positive_probability) 94 | 95 | loop(@n, b, fn b -> 96 | Bloomex.add(b, :rand.uniform(1000_000)) 97 | end) 98 | end}, 99 | "Blex.members?" => 100 | {fn b -> 101 | loop(@n, b, fn b -> 102 | Blex.member?(b, :rand.uniform(1000_000)) 103 | b 104 | end) 105 | end, 106 | before_each: fn _ -> 107 | b = Blex.new(@capacity, @false_positive_probability) 108 | 109 | loop(@n, b, fn b -> 110 | Blex.put(b, :rand.uniform(1000_000)) 111 | b 112 | end) 113 | 114 | b 115 | end}, 116 | "Blex.members? with binary format" => 117 | {fn b -> 118 | loop(@n, b, fn b -> 119 | Blex.member?(b, :rand.uniform(1000_000)) 120 | b 121 | end) 122 | end, 123 | before_each: fn _ -> 124 | b = Blex.new(@capacity, @false_positive_probability) 125 | 126 | loop(@n, b, fn b -> 127 | Blex.put(b, :rand.uniform(1000_000)) 128 | b 129 | end) 130 | 131 | Blex.encode(b) 132 | end} 133 | }) 134 | end 135 | 136 | def run_write_operation() do 137 | Benchee.run( 138 | %{ 139 | "Bloomex.add" => 140 | {fn b -> 141 | loop(@n, b, fn b -> 142 | Bloomex.add(b, :rand.uniform(1000_000)) 143 | end) 144 | end, 145 | before_each: fn _ -> 146 | Bloomex.plain(@capacity, @false_positive_probability) 147 | end}, 148 | "Blex.put" => 149 | {fn b -> 150 | loop(@n, b, fn b -> 151 | Blex.put(b, :rand.uniform(1000_000)) 152 | b 153 | end) 154 | end, 155 | before_each: fn _ -> 156 | Blex.new(@capacity, @false_positive_probability) 157 | end} 158 | }, 159 | time: 10 160 | ) 161 | end 162 | end 163 | 164 | Blex.Bench.Comparison.run_estimation() 165 | IO.puts("\n=====================================\n") 166 | Blex.Bench.Comparison.run_serialization() 167 | IO.puts("\n=====================================\n") 168 | Blex.Bench.Comparison.run_read_operation() 169 | 
IO.puts("\n=====================================\n") 170 | Blex.Bench.Comparison.run_write_operation() 171 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | # This file is responsible for configuring your application 2 | # and its dependencies with the aid of the Mix.Config module. 3 | use Mix.Config 4 | 5 | # This configuration is loaded before any dependency and is restricted 6 | # to this project. If another project depends on this project, this 7 | # file won't be loaded nor affect the parent project. For this reason, 8 | # if you want to provide default values for your application for 9 | # 3rd-party users, it should be done in your "mix.exs" file. 10 | 11 | # You can configure your application as: 12 | # 13 | # config :blex, key: :value 14 | # 15 | # and access this configuration in your application as: 16 | # 17 | # Application.get_env(:blex, :key) 18 | # 19 | # You can also configure a 3rd-party app: 20 | # 21 | # config :logger, level: :info 22 | # 23 | 24 | # It is also possible to import configuration files, relative to this 25 | # directory. For example, you can emulate configuration per environment 26 | # by uncommenting the line below and defining dev.exs, test.exs and such. 27 | # Configuration from the imported file will override the ones defined 28 | # here (which is why it is important to import them last). 29 | # 30 | # import_config "#{Mix.env()}.exs" 31 | -------------------------------------------------------------------------------- /lib/blex.ex: -------------------------------------------------------------------------------- 1 | defmodule Blex do 2 | @moduledoc """ 3 | 4 | Blex is a fast Bloom filter with **concurrent accessibility**, powered by [`:atomics`](http://erlang.org/doc/man/atomics.html) module. 
5 | 6 | ## Features 7 | 8 | * Fixed size Bloom filter 9 | * Concurrent reads & writes 10 | * Serialization 11 | * Merge multiple Bloom filters into one 12 | * Only one copy of data because data is saved in either `:atomics` or binary (if > 64 bytes) 13 | * Custom hash functions 14 | 15 | ## Example 16 | 17 | iex> b = Blex.new(1000, 0.01) 18 | iex> Task.async(fn -> Blex.put(b, "hello") end) |> Task.await() 19 | iex> Task.async(fn -> Blex.put(b, "world") end) |> Task.await() 20 | iex> Blex.member?(b, "hello") 21 | true 22 | iex> Blex.member?(b, "world") 23 | true 24 | iex> Blex.member?(b, "others") 25 | false 26 | 27 | ## Blex struct and Blex binary 28 | 29 | A Blex struct is the struct that contains Blex info, created via `Blex.new/2`, `Blex.new/3`, 30 | `Blex.decode/1` and `Blex.merge/1`. Data is saved in an atomics array. 31 | 32 | A Blex binary is the binary encoded from a Blex struct via `Blex.encode/1` or `Blex.merge_encode/1`. 33 | It supports most operations (e.g. `Blex.member?/2`) except `Blex.put/2` (obviously, we cannot mutate a binary). 34 | This is useful when we collect Bloom filters from other nodes: we can avoid deserialization 35 | if we do not need to add more members to it. 36 | 37 | ## How to start? 38 | 39 | Check out `Blex.new/2`, `Blex.put/2` and `Blex.member?/2`. 40 | 41 | ## How to do serialization? 42 | 43 | Check out `Blex.encode/1`, `Blex.decode/1`, `Blex.merge/1` and `Blex.merge_encode/1`. 44 | 45 | ## How to merge multiple Bloom filters? 46 | 47 | Check out `Blex.merge/1` and `Blex.merge_encode/1`. 48 | 49 | ## How to use custom hash functions? 50 | 51 | Check out `Blex.register/2` and `Blex.new/3`. 52 | 53 | ## How to collect meta info? 54 | 55 | Check out `Blex.estimate_size/1`, `Blex.estimate_memory/1` and `Blex.estimate_capacity/1`. 
56 | 57 | """ 58 | 59 | use Bitwise 60 | 61 | defstruct [ 62 | :a, 63 | :k, 64 | :b, 65 | :m, 66 | :hash_id, 67 | :hash_fn 68 | ] 69 | 70 | @type hash_id :: non_neg_integer() 71 | 72 | @type hash_function :: (non_neg_integer(), any() -> {non_neg_integer(), any()}) 73 | 74 | @type t :: %__MODULE__{ 75 | a: :atomics.atomics_ref(), 76 | k: pos_integer(), 77 | b: pos_integer(), 78 | m: pos_integer(), 79 | hash_id: hash_id(), 80 | hash_fn: hash_function() 81 | } 82 | 83 | @doc """ 84 | 85 | Create a Bloom filter with default hash function. It returns a Blex struct. 86 | 87 | `capacity` should be a positive integer. 88 | 89 | `false_positive_probability` should be a float that greater than 0 and smaller than 1. 90 | 91 | ## Example 92 | 93 | To create a Bloom filter with 1000 capacity and 1% false positive probability, we can do: 94 | 95 | iex> b = Blex.new(1000, 0.01) 96 | iex> Blex.put(b, "hello") 97 | :ok 98 | iex> Blex.member?(b, "hello") 99 | true 100 | iex> Blex.member?(b, "others") 101 | false 102 | 103 | """ 104 | 105 | @spec new(pos_integer(), float()) :: t() 106 | 107 | def new(capacity, false_positive_probability) 108 | when is_integer(capacity) and capacity > 0 and false_positive_probability > 0 and 109 | false_positive_probability < 1 do 110 | k = compute_optimal_k(false_positive_probability) 111 | b = compute_optimal_b(capacity, false_positive_probability, k) 112 | 113 | hash_id = 114 | cond do 115 | b <= 16 -> 201 116 | b <= 32 -> 202 117 | b <= 48 -> 203 118 | true -> raise ArgumentError, "unsupported capacity" 119 | end 120 | 121 | create_instance(hash_id, k, b) 122 | end 123 | 124 | defp compute_optimal_k(false_positive_probability) do 125 | -:math.log2(false_positive_probability) |> ceil() 126 | end 127 | 128 | defp compute_optimal_b(n, false_positive_probability, k) do 129 | p = :math.pow(false_positive_probability, 1 / k) 130 | 131 | # From Scalable Bloom Filter paper, we have p = 1 - (1 - 1 / m)^n 132 | # Therefore, m = 1 / (1 - (1 - p)^(1 / n)) 133 
| m = 1 / (1 - :math.pow(1 - p, 1 / n)) 134 | 135 | # grow in power of 2 to make hash coding easier 136 | b = :math.log2(m) |> ceil() 137 | 138 | # it needs to be at least 6 bits to fit :atomics 64 bits unsigned integer 139 | max(b, 6) 140 | end 141 | 142 | defp create_instance(hash_id, k, b) do 143 | m = 1 <<< b 144 | atomics_size = div(k * m, 64) 145 | hash_fn = get_hash_fn(hash_id) 146 | 147 | # Require OTP-21.2.1 or later for a bug fix 148 | a = :atomics.new(atomics_size, signed: false) 149 | 150 | %__MODULE__{ 151 | a: a, 152 | k: k, 153 | b: b, 154 | m: m, 155 | hash_id: hash_id, 156 | hash_fn: hash_fn 157 | } 158 | end 159 | 160 | # hash_id coding range: 161 | # 0 ~ 200 custom hash functions 162 | # 201 ~ 203 default hash functions 163 | # 204 ~ 255 reserved for future extension 164 | 165 | @range 1 <<< 32 166 | 167 | @spec get_hash_fn(hash_id()) :: hash_function() 168 | 169 | # for b <= 16, it requires one :erlang.phash2 call 170 | defp get_hash_fn(201) do 171 | fn 172 | 0, {item, b, m} -> 173 | hash = :erlang.phash2(item, @range) 174 | <> = <> 175 | {h1, {m, h1, h2}} 176 | 177 | i, {m, h1, h2} = acc when is_integer(h1) and is_integer(h2) -> 178 | {rem(h1 + i * h2, m), acc} 179 | end 180 | end 181 | 182 | # for 16 < b <= 32, it requires two :erlang.phash2 calls 183 | defp get_hash_fn(202) do 184 | fn 185 | 0, {item, _b, m} -> 186 | h1 = :erlang.phash2(item, m) 187 | {h1, {item, m, h1}} 188 | 189 | 1, {item, m, h1} when is_integer(h1) -> 190 | h2 = :erlang.phash2([item], m) 191 | {rem(h1 + h2, m), {h1, h2, m}} 192 | 193 | i, {h1, h2, m} = acc when is_integer(h1) and is_integer(h2) -> 194 | {rem(h1 + i * h2, m), acc} 195 | end 196 | end 197 | 198 | # for 32 < b <= 48, it requires three :erlang.phash2 calls 199 | defp get_hash_fn(203) do 200 | fn 201 | 0, {item, b, m} -> 202 | first = :erlang.phash2(item, @range) 203 | second = :erlang.phash2([item], @range) 204 | <> = bin = <> 205 | {h1, {item, b, m, bin}} 206 | 207 | 1, {item, b, m, bin} -> 208 | third = 
:erlang.phash2({item}, @range) 209 | <> = <> 210 | {rem(h1 + h2, m), {h1, h2, m}} 211 | 212 | i, {h1, h2, m} = acc when is_integer(h1) and is_integer(h2) -> 213 | {rem(h1 + i * h2, m), acc} 214 | end 215 | end 216 | 217 | # custom hash functions 218 | defp get_hash_fn(custom_hash_id) do 219 | :persistent_term.get({__MODULE__, custom_hash_id}) 220 | end 221 | 222 | @doc """ 223 | 224 | Create a Bloom filter with custom hash id. It returns a Blex struct. 225 | 226 | `capacity` should be a positive integer. 227 | 228 | `false_positive_probability` should be a float that greater than 0 and smaller than 1. 229 | 230 | Before we use a custom hash id, we need to do `Blex.register/2` to register it first. 231 | 232 | ## Example 233 | 234 | To create a Bloom filter with custom hash function, 1000 capacity and 1% false positive probability, we can do: 235 | 236 | iex> custom_hash_id = 1 237 | iex> Blex.register(custom_hash_id, fn 238 | ...> 0, {item, b, range} -> 239 | ...> <> = <> 240 | ...> {h1, {range, h1, h2}} 241 | ...> 242 | ...> i, {range, h1, h2} = acc -> 243 | ...> {rem(h1 + i * h2, range), acc} 244 | ...> end) 245 | :ok 246 | iex> b = Blex.new(1000, 0.01, custom_hash_id) 247 | iex> Blex.put(b, "hello") 248 | :ok 249 | iex> Blex.member?(b, "hello") 250 | true 251 | iex> Blex.member?(b, "others") 252 | false 253 | 254 | """ 255 | 256 | @spec new(pos_integer(), float(), hash_id()) :: t() 257 | 258 | def new(capacity, false_positive_probability, custom_hash_id) 259 | when is_integer(capacity) and 0 < capacity and 0 < false_positive_probability and 260 | false_positive_probability < 1 and custom_hash_id in 0..200 do 261 | k = compute_optimal_k(false_positive_probability) 262 | b = compute_optimal_b(capacity, false_positive_probability, k) 263 | create_instance(custom_hash_id, k, b) 264 | end 265 | 266 | @doc """ 267 | 268 | Register a custom function with given id. 269 | 270 | Custom hash id must be integer and within range from 0 to 200. 
271 | So we can have max 201 custom hash functions. Adding more 272 | custom hash functions is possible but not supported yet 273 | because 201 custom functions should be enough in practice. 274 | 275 | The signature of hash function is similar to `fun` from `Enum.map_reduce/3`. 276 | The spec of hash function is `(non_neg_integer(), any() -> {non_neg_integer(), any()})`. 277 | The hash function would be invoked k time if Bloom filter has k hash functions. 278 | The first parameter is integer from `0` to `k-1`. 279 | The second parameter is the accumulator. The first accumulator is `{item, b, range}` where 280 | `item` is the value to hash. `range` indicates that returned position should be in 281 | range from `0` to `range-1`. `b` is bit size of range and we have `(1 <<< b) == range`. 282 | The returned value is a tuple of two element. The first element is the position of the bit. 283 | The second element is the accumulator that would be passed to next interation. 284 | 285 | The hash id and hash function pair would be saved in `:persistent_term`. We should only 286 | register it once at the beginning. 287 | 288 | ## Example 289 | 290 | iex> custom_hash_id = 1 291 | iex> Blex.register(custom_hash_id, fn 292 | ...> 0, {item, b, range} -> 293 | ...> <> = <> 294 | ...> {h1, {range, h1, h2}} 295 | ...> 296 | ...> i, {range, h1, h2} = acc -> 297 | ...> {rem(h1 + i * h2, range), acc} 298 | ...> end) 299 | :ok 300 | 301 | """ 302 | 303 | @spec register(hash_id(), hash_function()) :: :ok 304 | 305 | def register(custom_hash_id, hash_function) when custom_hash_id in 0..200 do 306 | :persistent_term.put({__MODULE__, custom_hash_id}, hash_function) 307 | end 308 | 309 | @doc """ 310 | 311 | Put item into Bloom filter (Blex struct). 
312 | 313 | ## Example 314 | 315 | iex> b = Blex.new(1000, 0.01) 316 | iex> Blex.member?(b, "hello") 317 | false 318 | iex> Blex.put(b, "hello") 319 | :ok 320 | iex> Blex.member?(b, "hello") 321 | true 322 | 323 | """ 324 | 325 | @spec put(t(), any()) :: :ok 326 | 327 | def put(%__MODULE__{a: a, k: k, b: b, m: m, hash_fn: hash_fn} = _blex_struct, item) do 328 | # base starts with 64 because :atomics array is one-indexed with 64 bits integer. 329 | set_all(0, k, a, {item, b, m}, hash_fn, 64, m) 330 | end 331 | 332 | @spec set_all( 333 | integer(), 334 | integer(), 335 | :atomics.atomics_ref(), 336 | {any(), integer(), integer()}, 337 | hash_function(), 338 | integer(), 339 | integer() 340 | ) :: :ok 341 | 342 | defp set_all(k, k, _, _, _, _, _), do: :ok 343 | 344 | defp set_all(i, k, a, acc, f, base, m) do 345 | {position, new_acc} = f.(i, acc) 346 | index = div(position + base, 64) 347 | bits = 1 <<< rem(position, 64) 348 | set(a, index, bits, :atomics.get(a, index)) 349 | set_all(i + 1, k, a, new_acc, f, base + m, m) 350 | end 351 | 352 | @spec set(:atomics.atomics_ref(), integer(), integer(), integer()) :: :ok 353 | 354 | defp set(a, index, bits, origin) do 355 | case origin ||| bits do 356 | ^origin -> 357 | :ok 358 | 359 | result -> 360 | case :atomics.compare_exchange(a, index, origin, result) do 361 | :ok -> 362 | :ok 363 | 364 | actual -> 365 | set(a, index, bits, actual) 366 | end 367 | end 368 | end 369 | 370 | @doc """ 371 | 372 | Check if item is member of Blex struct or Blex binary. 
373 | 374 | ## Example 375 | 376 | iex> b = Blex.new(1000, 0.01) 377 | iex> Blex.member?(b, "hello") 378 | false 379 | iex> Blex.put(b, "hello") 380 | :ok 381 | iex> Blex.member?(b, "hello") 382 | true 383 | iex> encoded = Blex.encode(b) 384 | iex> Blex.member?(encoded, "hello") 385 | true 386 | 387 | """ 388 | 389 | @spec member?(t() | binary(), any()) :: boolean() 390 | 391 | def member?(%__MODULE__{a: a, k: k, b: b, m: m, hash_fn: hash_fn} = _blex, item) do 392 | check_member(0, k, a, {item, b, m}, hash_fn, 64, m) 393 | end 394 | 395 | def member?(<> = bin, item) do 396 | m = 1 <<< b 397 | f = get_hash_fn(code) 398 | # max = m * k + 8 * 3 - 1 399 | max = m * k + 23 400 | check_member_for_binary(0, k, bin, {item, b, m}, f, max, m) 401 | end 402 | 403 | @spec check_member( 404 | integer(), 405 | integer(), 406 | :atomics.atomics_ref(), 407 | {any(), integer(), integer()}, 408 | hash_function(), 409 | integer(), 410 | integer() 411 | ) :: boolean() 412 | 413 | defp check_member(k, k, _, _, _, _, _), do: true 414 | 415 | defp check_member(i, k, a, acc, f, base, m) do 416 | {position, new_acc} = f.(i, acc) 417 | index = div(position + base, 64) 418 | bits = 1 <<< rem(position, 64) 419 | 420 | case :atomics.get(a, index) &&& bits do 421 | ^bits -> 422 | check_member(i + 1, k, a, new_acc, f, base + m, m) 423 | 424 | _ -> 425 | false 426 | end 427 | end 428 | 429 | @spec check_member_for_binary( 430 | integer(), 431 | integer(), 432 | binary(), 433 | {any(), integer(), integer()}, 434 | hash_function(), 435 | integer(), 436 | integer() 437 | ) :: boolean() 438 | 439 | defp check_member_for_binary(k, k, _, _, _, _, _), do: true 440 | 441 | defp check_member_for_binary(i, k, bin, acc, f, max, m) do 442 | {position, new_acc} = f.(i, acc) 443 | index = div(max - position, 8) 444 | bits = 1 <<< rem(position, 8) 445 | 446 | case :binary.at(bin, index) &&& bits do 447 | ^bits -> 448 | check_member_for_binary(i + 1, k, bin, new_acc, f, max - m, m) 449 | 450 | _ -> 451 | false 
452 | end 453 | end 454 | 455 | @doc """ 456 | 457 | Estimate actual size of unique items that Blex struct or Blex binary contains. 458 | 459 | ## Example 460 | 461 | iex> b = Blex.new(1000, 0.01) 462 | iex> Blex.estimate_size(b) 463 | 0 464 | iex> Blex.put(b, "hello") 465 | :ok 466 | iex> Blex.estimate_size(b) 467 | 1 468 | iex> Blex.put(b, "world") 469 | :ok 470 | iex> Blex.estimate_size(b) 471 | 2 472 | iex> encoded = Blex.encode(b) 473 | iex> Blex.estimate_size(encoded) 474 | 2 475 | 476 | """ 477 | 478 | @spec estimate_size(t() | binary()) :: non_neg_integer() 479 | 480 | def estimate_size(%__MODULE__{a: a, m: m} = _blex) do 481 | 1..div(m, 64) 482 | |> Enum.reduce(0, fn i, acc -> 483 | bits = <<:atomics.get(a, i)::64>> 484 | acc + count_64_bits(bits) 485 | end) 486 | |> compute_estimated_size(m) 487 | end 488 | 489 | def estimate_size(<<_, k, b, _::bits>> = bin) do 490 | m = 1 <<< b 491 | prefix = 24 + m * (k - 1) 492 | 493 | <<_::bits-size(prefix), target::bits>> = bin 494 | 495 | count_bits_for_bin(target, 0) 496 | |> compute_estimated_size(m) 497 | end 498 | 499 | defp compute_estimated_size(x, m) when x < m do 500 | round(-m * :math.log(1 - x / m)) 501 | end 502 | 503 | # when x == m, 1 - x/m would be 0.0, :math.log(0.0) would raise error 504 | defp compute_estimated_size(m, m) do 505 | round(-m * :math.log(1 / m)) 506 | end 507 | 508 | @spec count_bits_for_bin(binary(), integer()) :: integer() 509 | 510 | defp count_bits_for_bin(<>, acc) do 511 | count_bits_for_bin(rest, acc + count_64_bits(x)) 512 | end 513 | 514 | defp count_bits_for_bin(<<>>, acc), do: acc 515 | 516 | defp count_64_bits( 517 | <> 525 | ) do 526 | b_01 + b_02 + b_03 + b_04 + b_05 + b_06 + b_07 + b_08 + b_09 + b_10 + b_11 + b_12 + b_13 + 527 | b_14 + b_15 + b_16 + b_17 + b_18 + b_19 + b_20 + b_21 + b_22 + b_23 + b_24 + b_25 + b_26 + 528 | b_27 + b_28 + b_29 + b_30 + b_31 + b_32 + b_33 + b_34 + b_35 + b_36 + b_37 + b_38 + b_39 + 529 | b_40 + b_41 + b_42 + b_43 + b_44 + b_45 + b_46 + b_47 
+ b_48 + b_49 + b_50 + b_51 + b_52 + 530 | b_53 + b_54 + b_55 + b_56 + b_57 + b_58 + b_59 + b_60 + b_61 + b_62 + b_63 + b_64 531 | end 532 | 533 | @doc """ 534 | 535 | Estimate memory consumption in bytes for Blex struct or Blex binary. 536 | 537 | ## Example 538 | 539 | iex> b = Blex.new(1000, 0.01) 540 | iex> Blex.estimate_memory(b) 541 | 1832 542 | iex> encoded = Blex.encode(b) 543 | iex> Blex.estimate_memory(encoded) 544 | 1795 545 | 546 | """ 547 | 548 | @spec estimate_memory(t() | binary()) :: non_neg_integer() 549 | 550 | def estimate_memory(%__MODULE__{a: a} = _blex) do 551 | :atomics.info(a).memory 552 | end 553 | 554 | def estimate_memory(bin) when is_binary(bin) do 555 | byte_size(bin) 556 | end 557 | 558 | @doc """ 559 | 560 | Estimate actual capacity of Blex struct or Blex binary. 561 | 562 | Capacity grows in power of 2. Sometimes, the actual capacity 563 | could be bigger than specified capacity in `Blex.new/2` and `Blex.new/3`. 564 | 565 | It's estimated value and it could be slightly smaller than specified 566 | capacity. 567 | 568 | ## Example 569 | 570 | iex> b = Blex.new(1000, 0.01) 571 | iex> Blex.estimate_capacity(b) 572 | 1419 573 | iex> encoded = Blex.encode(b) 574 | iex> Blex.estimate_capacity(encoded) 575 | 1419 576 | 577 | """ 578 | 579 | @spec estimate_capacity(t() | binary()) :: non_neg_integer() 580 | 581 | def estimate_capacity(%__MODULE__{m: m}) do 582 | compute_estimated_capacity(m) 583 | end 584 | 585 | def estimate_capacity(<<_, _, b, _::bits>> = _blex) do 586 | compute_estimated_capacity(1 <<< b) 587 | end 588 | 589 | defp compute_estimated_capacity(m) do 590 | # derived from Scalable Bloom filter paper that: 591 | # p = 1/2 = 1 - (1 - 1/m)^n 592 | round(:math.log(0.5) / :math.log(1 - 1 / m)) 593 | end 594 | 595 | @doc """ 596 | 597 | Encode Blex struct to Blex binary. 
598 | 599 | ## Example 600 | 601 | iex> b = Blex.new(40, 0.5) 602 | iex> Blex.encode(b) 603 | <<201, 1, 6, 0, 0, 0, 0, 0, 0, 0, 0>> 604 | iex> Blex.put(b, "hello") 605 | iex> Blex.encode(b) 606 | <<201, 1, 6, 0, 0, 0, 0, 1, 0, 0, 0>> 607 | 608 | """ 609 | 610 | @spec encode(t()) :: binary() 611 | 612 | def encode(%__MODULE__{a: a, k: k, b: b, m: m, hash_id: hash_id} = _blex_struct) do 613 | size = div(m, 64) * k 614 | 615 | data = 616 | Enum.reduce(1..size, [], fn i, acc -> 617 | [<<:atomics.get(a, i)::integer-unsigned-64>> | acc] 618 | end) 619 | 620 | IO.iodata_to_binary([hash_id, k, b | data]) 621 | end 622 | 623 | @doc """ 624 | 625 | Decode Blex binary to Blex struct. 626 | 627 | ## Example 628 | 629 | iex> b = Blex.new(40, 0.5) 630 | iex> Blex.put(b, "hello") 631 | :ok 632 | iex> encoded = Blex.encode(b) 633 | <<201, 1, 6, 0, 0, 0, 0, 1, 0, 0, 0>> 634 | iex> decoded = Blex.decode(encoded) 635 | iex> Blex.member?(decoded, "hello") 636 | true 637 | 638 | """ 639 | 640 | @spec decode(binary()) :: t() 641 | 642 | def decode(<> = _blex_binary) do 643 | blex = create_instance(hash_id, k, b) 644 | size = div(k * blex.m, 64) 645 | copy_data(rest, blex.a, size) 646 | blex 647 | end 648 | 649 | @spec copy_data(binary(), :atomics.atomics_ref(), integer()) :: :ok 650 | 651 | defp copy_data(<>, a, i) do 652 | :atomics.put(a, i, x) 653 | copy_data(rest, a, i - 1) 654 | end 655 | 656 | defp copy_data(<<>>, _, 0), do: :ok 657 | 658 | @doc """ 659 | 660 | Merge multiple Blex struct or Blex binary into one Blex struct. 
661 | 662 | ## Example 663 | 664 | iex> b1 = Blex.new(1000, 0.01) 665 | iex> b2 = Blex.new(1000, 0.01) 666 | iex> b3 = Blex.new(1000, 0.01) 667 | iex> Blex.put(b1, "hello") 668 | :ok 669 | iex> Blex.put(b2, "world") 670 | :ok 671 | iex> Blex.put(b3, "okk") 672 | :ok 673 | iex> encoded_b3 = Blex.encode(b3) 674 | iex> merged = Blex.merge([b1, b2, encoded_b3]) 675 | iex> Blex.member?(merged, "hello") 676 | true 677 | iex> Blex.member?(merged, "world") 678 | true 679 | iex> Blex.member?(merged, "okk") 680 | true 681 | iex> Blex.member?(merged, "others") 682 | false 683 | 684 | """ 685 | 686 | @spec merge([t() | binary()]) :: t() 687 | 688 | def merge([first | rest]) do 689 | {hash_id, k, b, f_first} = transform(first) 690 | 691 | f_rest = 692 | Enum.map(rest, fn it -> 693 | {^hash_id, ^k, ^b, f} = transform(it) 694 | f 695 | end) 696 | 697 | dest = create_instance(hash_id, k, b) 698 | a = dest.a 699 | m = dest.m 700 | size = div(m * k, 64) 701 | 702 | Enum.each(1..size, fn i -> 703 | result = 704 | Enum.reduce(f_rest, f_first.(i), fn f, acc -> 705 | f.(i) ||| acc 706 | end) 707 | 708 | :atomics.put(a, i, result) 709 | end) 710 | 711 | dest 712 | end 713 | 714 | @doc """ 715 | 716 | Merge multiple Blex struct or Blex binary into given Blex struct. 
717 | 718 | ## Example 719 | 720 | iex> b1 = Blex.new(1000, 0.01) 721 | iex> b2 = Blex.new(1000, 0.01) 722 | iex> b3 = Blex.new(1000, 0.01) 723 | iex> Blex.put(b1, "hello") 724 | :ok 725 | iex> Blex.put(b2, "world") 726 | :ok 727 | iex> Blex.put(b3, "okk") 728 | :ok 729 | iex> encoded_b3 = Blex.encode(b3) 730 | iex> Blex.merge_into([b2, encoded_b3], b1) 731 | :ok 732 | iex> Blex.member?(b1, "hello") 733 | true 734 | iex> Blex.member?(b1, "world") 735 | true 736 | iex> Blex.member?(b1, "okk") 737 | true 738 | iex> Blex.member?(b1, "others") 739 | false 740 | 741 | """ 742 | @doc since: "0.2.0" 743 | 744 | @spec merge_into([t() | binary()], t()) :: :ok 745 | 746 | def merge_into(blexes, %__MODULE__{a: a, k: k, b: b, m: m, hash_id: hash_id} = _blex_struct) do 747 | f_blexes = 748 | Enum.reduce(blexes, [], fn it, acc -> 749 | {^hash_id, ^k, ^b, f} = transform(it) 750 | [f | acc] 751 | end) 752 | 753 | size = div(m * k, 64) 754 | 755 | Enum.each(1..size, fn i -> 756 | bits = 757 | Enum.reduce(f_blexes, 0, fn f, acc -> 758 | f.(i) ||| acc 759 | end) 760 | 761 | set(a, i, bits, :atomics.get(a, i)) 762 | end) 763 | end 764 | 765 | defp transform(%__MODULE__{a: a, k: k, b: b, hash_id: hash_id}) do 766 | f = fn i -> 767 | :atomics.get(a, i) 768 | end 769 | 770 | {hash_id, k, b, f} 771 | end 772 | 773 | defp transform(<> = bin) do 774 | size = (1 <<< b) * k + 24 775 | 776 | f = fn i -> 777 | prefix_size = size - i * 64 778 | <<_::size(prefix_size), x::integer-unsigned-64, _::bits>> = bin 779 | x 780 | end 781 | 782 | {hash_id, k, b, f} 783 | end 784 | 785 | @doc """ 786 | 787 | Merge multiple Blex struct or Blex binary into one Blex binary. 788 | 789 | It does `list |> Blex.merge() |> Blex.encode()` without an intermediate step.
790 | 791 | ## Example 792 | 793 | iex> b1 = Blex.new(1000, 0.01) 794 | iex> b2 = Blex.new(1000, 0.01) 795 | iex> b3 = Blex.new(1000, 0.01) 796 | iex> Blex.put(b1, "hello") 797 | :ok 798 | iex> Blex.put(b2, "world") 799 | :ok 800 | iex> Blex.put(b3, "okk") 801 | :ok 802 | iex> encoded_b3 = Blex.encode(b3) 803 | iex> merged = Blex.merge_encode([b1, b2, encoded_b3]) 804 | iex> is_binary(merged) 805 | true 806 | iex> Blex.member?(merged, "hello") 807 | true 808 | iex> Blex.member?(merged, "world") 809 | true 810 | iex> Blex.member?(merged, "okk") 811 | true 812 | iex> Blex.member?(merged, "others") 813 | false 814 | 815 | """ 816 | 817 | @spec merge_encode([t() | binary()]) :: binary() 818 | 819 | def merge_encode([first | rest]) do 820 | {hash_id, k, b, f_first} = transform(first) 821 | 822 | f_rest = 823 | Enum.map(rest, fn it -> 824 | {^hash_id, ^k, ^b, f} = transform(it) 825 | f 826 | end) 827 | 828 | size = div((1 <<< b) * k, 64) 829 | 830 | data = 831 | Enum.reduce(1..size, [], fn i, acc -> 832 | result = 833 | Enum.reduce(f_rest, f_first.(i), fn f, acc -> 834 | f.(i) ||| acc 835 | end) 836 | 837 | [<> | acc] 838 | end) 839 | 840 | [hash_id, k, b | data] 841 | |> IO.iodata_to_binary() 842 | end 843 | end 844 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Blex.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :blex, 7 | version: "0.2.1", 8 | description: 9 | "A fast Bloom filter with concurrent accessibility, powered by :atomics module", 10 | elixir: "~> 1.8", 11 | start_permanent: Mix.env() == :prod, 12 | package: package(), 13 | deps: deps(), 14 | name: "Blex", 15 | source_url: "https://github.com/gyson/blex" 16 | ] 17 | end 18 | 19 | # Run "mix help compile.app" to learn about applications. 
20 | def application do 21 | [ 22 | extra_applications: [:logger] 23 | ] 24 | end 25 | 26 | # Run "mix help deps" to learn about dependencies. 27 | defp deps do 28 | [ 29 | {:bloomex, "~> 1.0", only: :dev}, 30 | {:benchee, "~> 0.13", only: :dev}, 31 | {:murmur, "~> 1.0", only: [:dev, :test]}, 32 | {:stream_data, "~> 0.4", only: [:dev, :test]}, 33 | {:ex_doc, "~> 0.19", only: :dev, runtime: false}, 34 | {:ex_type, "~> 0.4.0", only: :dev, runtime: true}, 35 | {:dialyxir, "~> 1.0.0-rc.4", only: :dev, runtime: false} 36 | ] 37 | end 38 | 39 | def package do 40 | %{ 41 | licenses: ["MIT"], 42 | links: %{"GitHub" => "https://github.com/gyson/blex"} 43 | } 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "benchee": {:hex, :benchee, "0.13.2", "30cd4ff5f593fdd218a9b26f3c24d580274f297d88ad43383afe525b1543b165", [:mix], [{:deep_merge, "~> 0.1", [hex: :deep_merge, repo: "hexpm", optional: false]}], "hexpm", "d8b3f1720073413c36a21e56a1d1112a4d67a9ad0ec900437efed08b39e515b2"}, 3 | "bloomex": {:hex, :bloomex, "1.0.5", "4d4b366412270e1fdb0c93f9f376f93f360b1b18514ac97915b43f9f9c6025f3", [:mix], [], "hexpm", "ff1446c5f3e6c2c560b1cf8ef849cab78a743a0e103cde7575c0cc62a250dfcf"}, 4 | "deep_merge": {:hex, :deep_merge, "0.2.0", "c1050fa2edf4848b9f556fba1b75afc66608a4219659e3311d9c9427b5b680b3", [:mix], [], "hexpm", "e3bf435a54ed27b0ba3a01eb117ae017988804e136edcbe8a6a14c310daa966e"}, 5 | "dialyxir": {:hex, :dialyxir, "1.0.0-rc.4", "71b42f5ee1b7628f3e3a6565f4617dfb02d127a0499ab3e72750455e986df001", [:mix], [{:erlex, "~> 0.1", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "4bba10c6f267a0dd127d687d1295f6a11af6a7f160cc0e261c46f1962a98d7d8"}, 6 | "earmark": {:hex, :earmark, "1.3.1", "73812f447f7a42358d3ba79283cfa3075a7580a3a2ed457616d6517ac3738cb9", [:mix], [], "hexpm", 
"000aaeff08919e95e7aea13e4af7b2b9734577b3e6a7c50ee31ee88cab6ec4fb"}, 7 | "erlex": {:hex, :erlex, "0.1.6", "c01c889363168d3fdd23f4211647d8a34c0f9a21ec726762312e08e083f3d47e", [:mix], [], "hexpm", "f9388f7d1a668bee6ebddc040422ed6340af74aced153e492330da4c39516d92"}, 8 | "ex_doc": {:hex, :ex_doc, "0.19.2", "6f4081ccd9ed081b6dc0bd5af97a41e87f5554de469e7d76025fba535180565f", [:mix], [{:earmark, "~> 1.2", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.10", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "4eae888633d2937e0a8839ae6002536d459c22976743c9dc98dd05941a06c016"}, 9 | "ex_type": {:hex, :ex_type, "0.4.0", "7c93a0e64956192d0a7319ad7431471331b33377c7476b4ce61d173d474b1fa0", [:mix], [{:ex_type_runtime, "~> 0.1.0", [hex: :ex_type_runtime, repo: "hexpm", optional: false]}], "hexpm", "86e1f72eec57566138578f9db3adf91fbb8cca7b32d1033d99e15a95cfba7a78"}, 10 | "ex_type_runtime": {:hex, :ex_type_runtime, "0.1.0", "c7557490c969aacaaada89b30213d3572b615b5ef29c6617f1b588b1213a5127", [:mix], [], "hexpm", "341deab4814cba23cdd0c912c87573fd48bdc3ceb7cd328f63406f3dab2448cc"}, 11 | "makeup": {:hex, :makeup, "0.8.0", "9cf32aea71c7fe0a4b2e9246c2c4978f9070257e5c9ce6d4a28ec450a839b55f", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5fbc8e549aa9afeea2847c0769e3970537ed302f93a23ac612602e805d9d1e7f"}, 12 | "makeup_elixir": {:hex, :makeup_elixir, "0.13.0", "be7a477997dcac2e48a9d695ec730b2d22418292675c75aa2d34ba0909dcdeda", [:mix], [{:makeup, "~> 0.8", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "adf0218695e22caeda2820eaba703fa46c91820d53813a2223413da3ef4ba515"}, 13 | "murmur": {:hex, :murmur, "1.0.1", "a6e6bced2dd0d666090a9cf3e73699f3b9176bbcf32d35b0f022f137667608e3", [:mix], [], "hexpm", "1303e80b1a5514a8afe0baf4241903ae31e3ba8b7e9e7c2219a668c065a691a6"}, 14 | "nimble_parsec": {:hex, :nimble_parsec, "0.5.0", "90e2eca3d0266e5c53f8fbe0079694740b9c91b6747f2b7e3c5d21966bba8300", 
[:mix], [], "hexpm", "5c040b8469c1ff1b10093d3186e2e10dbe483cd73d79ec017993fb3985b8a9b3"}, 15 | "stream_data": {:hex, :stream_data, "0.4.2", "fa86b78c88ec4eaa482c0891350fcc23f19a79059a687760ddcf8680aac2799b", [:mix], [], "hexpm", "54d6bf6f1e5e27fbf4a7784a2bffbb993446d0efd079debca0f27bf859c0d1cf"}, 16 | } 17 | -------------------------------------------------------------------------------- /test/blex_test.exs: -------------------------------------------------------------------------------- 1 | defmodule BlexTest do 2 | use ExUnit.Case 3 | doctest Blex 4 | 5 | test "it should works" do 6 | b = Blex.new(1024, 0.01) 7 | 8 | Blex.put(b, "hello") 9 | 10 | assert Blex.member?(b, "hello") == true 11 | 12 | assert Blex.member?(b, "ok") == false 13 | end 14 | 15 | test "serialization" do 16 | b = Blex.new(1000, 0.02) 17 | 18 | Blex.put(b, "hello") 19 | Blex.put(b, "world") 20 | 21 | bin = Blex.encode(b) 22 | 23 | assert Blex.member?(bin, "hello") == true 24 | assert Blex.member?(bin, "world") == true 25 | assert Blex.member?(bin, "abcde") == false 26 | assert Blex.member?(bin, "okkkk") == false 27 | 28 | b2 = Blex.decode(bin) 29 | 30 | assert Blex.member?(b2, "hello") == true 31 | assert Blex.member?(b2, "world") == true 32 | assert Blex.member?(b2, "abcde") == false 33 | assert Blex.member?(b2, "okkkk") == false 34 | end 35 | 36 | test "serialization with StreamData" do 37 | StreamData.binary() 38 | |> Enum.take(1000) 39 | |> Enum.each(fn data -> 40 | b = Blex.new(100, 0.02) 41 | 42 | Blex.put(b, data) 43 | 44 | bin = Blex.encode(b) 45 | 46 | assert Blex.member?(bin, data) == true 47 | 48 | b2 = Blex.decode(bin) 49 | 50 | assert Blex.member?(b2, data) == true 51 | end) 52 | end 53 | 54 | test "merge" do 55 | b1 = Blex.new(1000, 0.05) 56 | b2 = Blex.new(1000, 0.05) 57 | 58 | Blex.put(b1, "hello") 59 | Blex.put(b2, "world") 60 | 61 | b3 = Blex.merge([b1, b2]) 62 | 63 | assert Blex.member?(b3, "hello") == true 64 | assert Blex.member?(b3, "world") == true 65 | assert 
Blex.member?(b3, "abcde") == false 66 | assert Blex.member?(b3, "okkkk") == false 67 | end 68 | 69 | test "merge_into" do 70 | b1 = Blex.new(1000, 0.05) 71 | b2 = Blex.new(1000, 0.05) 72 | b3 = Blex.new(1000, 0.05) 73 | 74 | Blex.put(b1, "hello") 75 | Blex.put(b2, "world") 76 | Blex.put(b3, "third") 77 | 78 | b3_encoded = Blex.encode(b3) 79 | Blex.merge_into([b2, b3_encoded], b1) 80 | 81 | assert Blex.member?(b1, "hello") == true 82 | assert Blex.member?(b1, "world") == true 83 | assert Blex.member?(b1, "third") == true 84 | assert Blex.member?(b1, "abcde") == false 85 | assert Blex.member?(b1, "okkkk") == false 86 | end 87 | 88 | test "merge should be comptaible with merge_into" do 89 | b1 = Blex.new(1000, 0.05) 90 | b2 = Blex.new(1000, 0.05) 91 | b3 = Blex.new(1000, 0.05) 92 | b4 = Blex.new(1000, 0.05) 93 | 94 | Blex.put(b1, "cool") 95 | Blex.put(b2, "cool") 96 | Blex.put(b3, "okkk") 97 | Blex.put(b4, "nooo") 98 | 99 | b4_encoded = Blex.encode(b4) 100 | 101 | merge_encoded = 102 | Blex.merge([b1, b3, b4_encoded]) 103 | |> Blex.encode() 104 | 105 | Blex.merge_into([b3, b4_encoded], b2) 106 | 107 | merge_into_encoded = Blex.encode(b2) 108 | 109 | assert merge_encoded == merge_into_encoded 110 | end 111 | 112 | test "merge_encode" do 113 | b1 = Blex.new(1000, 0.05) 114 | b2 = Blex.new(1000, 0.05) 115 | 116 | Blex.put(b1, "hello") 117 | Blex.put(b2, "world") 118 | 119 | b3 = Blex.merge_encode([b1, b2]) 120 | 121 | assert Blex.member?(b3, "hello") == true 122 | assert Blex.member?(b3, "world") == true 123 | assert Blex.member?(b3, "abcde") == false 124 | assert Blex.member?(b3, "okkkk") == false 125 | end 126 | 127 | test "Blex.estimate_size" do 128 | b = Blex.new(1000, 0.01) 129 | assert Blex.estimate_size(b) == 0 130 | assert Blex.estimate_size(Blex.encode(b)) == 0 131 | Blex.put(b, 1) 132 | assert Blex.estimate_size(b) == 1 133 | assert Blex.estimate_size(Blex.encode(b)) == 1 134 | Blex.put(b, 2) 135 | assert Blex.estimate_size(b) == 2 136 | assert 
Blex.estimate_size(Blex.encode(b)) == 2 137 | Blex.put(b, 3) 138 | assert Blex.estimate_size(b) == 3 139 | assert Blex.estimate_size(Blex.encode(b)) == 3 140 | Blex.put(b, 4) 141 | assert Blex.estimate_size(b) == 4 142 | assert Blex.estimate_size(Blex.encode(b)) == 4 143 | Blex.put(b, 5) 144 | assert Blex.estimate_size(b) == 5 145 | assert Blex.estimate_size(Blex.encode(b)) == 5 146 | Blex.put(b, 6) 147 | assert Blex.estimate_size(b) == 6 148 | assert Blex.estimate_size(Blex.encode(b)) == 6 149 | 150 | for i <- 1..1000 do 151 | Blex.put(b, i) 152 | end 153 | 154 | estimated_size = Blex.estimate_size(b) 155 | assert estimated_size > 950 156 | assert estimated_size < 1050 157 | 158 | estimated_size_via_binary = Blex.estimate_size(Blex.encode(b)) 159 | assert estimated_size == estimated_size_via_binary 160 | end 161 | 162 | test "Blex.estimate_capacity" do 163 | b = Blex.new(1400, 0.01) 164 | cap = Blex.estimate_capacity(b) 165 | assert cap > 1350 166 | assert cap < 1450 167 | end 168 | 169 | test "definitely not in" do 170 | b = Blex.new(1_000_000, 0.01) 171 | 172 | for i <- 1..1_000_000, rem(i, 2) == 0 do 173 | Blex.put(b, i) 174 | end 175 | 176 | for i <- 1..1_000_000, not Blex.member?(b, i) do 177 | assert rem(i, 2) == 1 178 | end 179 | end 180 | 181 | test "may be in" do 182 | b = Blex.new(1_000_000, 0.01) 183 | 184 | for i <- 1..1_000_000 do 185 | Blex.put(b, i) 186 | end 187 | 188 | result = 189 | Enum.reduce(1_000_001..10_000_000, 0, fn i, acc -> 190 | if Blex.member?(b, i) do 191 | acc + 1 192 | else 193 | acc 194 | end 195 | end) 196 | 197 | assert result < 9_000_000 * 0.01 198 | end 199 | 200 | @tag timeout: 3_000_000 201 | test "very large bloom filter" do 202 | # this takes about 6 GB memory, note that github actions host currently has 7 GB memory limit. 
203 | b = Blex.new(3_200_000_000, 0.02) 204 | 205 | Blex.put(b, "hello") 206 | Blex.put(b, "world") 207 | 208 | assert Blex.member?(b, "hello") == true 209 | assert Blex.member?(b, "world") == true 210 | assert Blex.member?(b, "no exist") == false 211 | 212 | # this test takes a few minutes 213 | assert Blex.estimate_size(b) == 2 214 | end 215 | end 216 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | --------------------------------------------------------------------------------