├── .formatter.exs ├── .github └── CODEOWNERS ├── .gitignore ├── .travis.yml ├── LICENSE.md ├── README.md ├── bench.exs ├── config └── config.exs ├── lib ├── bitmacro.ex └── ex_lsh.ex ├── logo.svg ├── mix.exs ├── mix.lock └── test ├── ex_lsh_test.exs └── test_helper.exs /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["mix.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # This is a comment. 2 | # Each line is a file pattern followed by one or more owners. 3 | 4 | # These owners will be the default owners for everything in the repo. 5 | # Unless a later match takes precedence, @global-owner1 and @global-owner2 6 | # will be requested for review when someone opens a pull request. 7 | * @jann @stiangrindvoll 8 | 9 | # Order is important; the last matching pattern takes the most precedence. 10 | # When someone opens a pull request that only modifies JS files, only @js-owner 11 | # and not the global owner(s) will be requested for a review. 12 | #*.js @js-owner 13 | 14 | # You can also use email addresses if you prefer. They'll be used to look up 15 | # users just like we do for commit author emails. 16 | #docs/* docs@example.com 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # The directory Mix will write compiled artifacts to. 2 | /_build/ 3 | 4 | # If you run "mix test --cover", coverage assets end up here. 5 | /cover/ 6 | 7 | # The directory Mix downloads your dependencies sources to. 8 | /deps/ 9 | 10 | # Where 3rd-party dependencies like ExDoc output generated docs. 11 | /doc/ 12 | 13 | # Ignore .fetch files in case you like to edit your project deps locally. 14 | /.fetch 15 | 16 | # If the VM crashes, it generates a dump, let's ignore it too. 17 | erl_crash.dump 18 | 19 | # Also ignore archive artifacts (built via "mix archive.build"). 20 | *.ez 21 | 22 | # Ignore package tarball (built via "mix hex.build"). 23 | ex_lsh-*.tar 24 | 25 | /*.txt 26 | /*.csv 27 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: elixir 2 | elixir: 3 | - 1.6 4 | - 1.7 5 | - 1.8 6 | otp_release: 7 | - 19.3 8 | - 20.3 9 | - 21.2 10 | matrix: 11 | exclude: 12 | - elixir: 1.8 13 | otp_release: 19.3 14 | branches: 15 | only: 16 | - master 17 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Meltwater 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![ExLSH logo](logo.svg)
2 | 
3 | ## ExLSH
4 | 
5 | [![Build Status](https://travis-ci.com/meltwater/ex_lsh.svg?token=ydrd7j6fwuq6xzD4yQkt&branch=master)](https://travis-ci.com/meltwater/ex_lsh)
6 | 
7 | Calculates a locality sensitive hash for text.
8 | 
9 | [Locality-sensitive hashing](https://en.wikipedia.org/wiki/Locality-sensitive_hashing)
10 | is a technique for dimensionality reduction. Its properties guarantee similar
11 | output vectors for similar inputs. It can be used for clustering and
12 | [near-duplicate detection](https://moz.com/devblog/near-duplicate-detection/).
13 | This implementation is targeted at natural language input. It takes a
14 | `String` of arbitrary length and outputs a vector encoded as `:binary`.
15 | 
16 | If you want to learn more about why and how we built ExLSH, read our blog post [Locality-sensitive Hashing in Elixir](
17 | https://underthehood.meltwater.com/blog/2019/02/25/locality-sensitive-hashing-in-elixir/).
18 | 
19 | ## Installation
20 | 
21 | Add `ex_lsh` to your list of dependencies in `mix.exs`:
22 | 
23 | ```elixir
24 | def deps do
25 |   [
26 |     {:ex_lsh, "~> 0.4"}
27 |   ]
28 | end
29 | ```
30 | 
31 | ## Usage
32 | 
33 | ```elixir
34 | "Lorem ipsum dolor sit amet"
35 | |> ExLSH.lsh()
36 | |> Base.encode64()
37 | ```
38 | 
39 | ## Docs
40 | 
41 | See [hexdocs.pm/ex_lsh](https://hexdocs.pm/ex_lsh).
42 | 
43 | ## Contributions
44 | 
45 | Please fork the project and submit a PR.
46 | 
47 | ## Credits
48 | 
49 | - [SimHash](https://github.com/UniversalAvenue/simhash-ex) is a very similar,
50 |   but less versatile implementation that is focused on short strings only.
51 |   ExLSH is approximately [7 times faster](#benchmark) and supports arbitrary
52 |   tokenization, shingling and hash functions.
53 | - [SpiritFingers](https://github.com/holsee/spirit_fingers) is roughly [27 times
54 |   faster](#benchmark) than ExLSH but relies on a NIF that needs the full Rust
55 |   toolchain to compile. SpiritFingers doesn't support customization of the
56 |   algorithm; it uses SipHash by default.
57 | - [Resemblance](https://github.com/matpalm/resemblance) explores simhash and
58 |   sketching in Ruby. The author has documented his findings in a series of
59 |   articles. You may want to make yourself familiar with
60 |   [Part 3: The SimHash Algorithm](http://matpalm.com/resemblance/simhash/).
61 | - [Near-duplicate detection](https://moz.com/devblog/near-duplicate-detection/)
62 |   is a very helpful article by Moz. It explains core concepts such as
63 |   tokenization, shingling, MinHash, SimHash, etc.
64 | 
65 | ## Benchmark
66 | 
67 | Benchmarked against [SimHash](https://hex.pm/packages/simhash) and SpiritFingers,
68 | run with [Benchee](https://hex.pm/packages/benchee). See the setup on the [benchmark
69 | branch](https://github.com/meltwater/ex_lsh/tree/benchmark).
70 | 
71 | ```
72 | Operating System: macOS
73 | CPU Information: Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
74 | Number of Available Cores: 8
75 | Available memory: 16 GB
76 | Elixir 1.8.1
77 | Erlang 21.2.5
78 | 
79 | Benchmark suite executing with the following configuration:
80 | warmup: 2 s
81 | time: 5 s
82 | memory time: 0 ns
83 | parallel: 1
84 | inputs: none specified
85 | Estimated total run time: 21 s
86 | 
87 | 
88 | Benchmarking ExLSH...
89 | Benchmarking Simhash...
90 | Benchmarking SpiritFingers...
91 | 92 | Name ips average deviation median 99th % 93 | SpiritFingers 8556.15 0.117 ms ±13.37% 0.111 ms 0.183 ms 94 | ExLSH 309.61 3.23 ms ±5.88% 3.19 ms 3.81 ms 95 | Simhash 43.19 23.15 ms ±12.57% 22.08 ms 30.54 ms 96 | 97 | Comparison: 98 | SpiritFingers 8556.15 99 | ExLSH 309.61 - 27.64x slower 100 | Simhash 43.19 - 198.11x slower 101 | ``` 102 | -------------------------------------------------------------------------------- /bench.exs: -------------------------------------------------------------------------------- 1 | clean = [ 2 | "qui", 3 | "eos", 4 | "ad", 5 | "qui", 6 | "ab", 7 | "laboriosam", 8 | "ut", 9 | "consequatur", 10 | "dolor", 11 | "est", 12 | "fugit", 13 | "voluptatem", 14 | "impedit", 15 | "aliquam", 16 | "excepturi", 17 | "recusandae", 18 | "facere", 19 | "alias", 20 | "omnis", 21 | "fuga", 22 | "enim", 23 | "tempore", 24 | "quia", 25 | "soluta", 26 | "maiores", 27 | "pariatur", 28 | "et", 29 | "cum", 30 | "non", 31 | "quibusdam", 32 | "veritatis", 33 | "nemo", 34 | "eius", 35 | "iusto", 36 | "voluptates", 37 | "neque", 38 | "quas", 39 | "quasi", 40 | "et", 41 | "nihil", 42 | "ipsam", 43 | "explicabo", 44 | "dicta", 45 | "quo", 46 | "quod", 47 | "veniam", 48 | "natus", 49 | "laboriosam", 50 | "exercitationem", 51 | "deleniti", 52 | "sint", 53 | "sapiente", 54 | "qui", 55 | "nostrum", 56 | "delectus", 57 | "voluptatem", 58 | "aspernatur", 59 | "eum", 60 | "voluptatem", 61 | "labore", 62 | "ut", 63 | "dolorum", 64 | "et", 65 | "nostrum", 66 | "sequi", 67 | "reiciendis", 68 | "nisi", 69 | "atque", 70 | "nulla", 71 | "nisi", 72 | "doloremque", 73 | "eaque", 74 | "est", 75 | "ad", 76 | "dolore", 77 | "perspiciatis", 78 | "veniam", 79 | "necessitatibus", 80 | "in", 81 | "sed", 82 | "laborum", 83 | "dolores", 84 | "quidem", 85 | "est", 86 | "et", 87 | "commodi", 88 | "vel", 89 | "tempore", 90 | "magni", 91 | "saepe", 92 | "hic", 93 | "rem", 94 | "qui", 95 | "dolorem", 96 | "et", 97 | "sed", 98 | "molestiae", 99 | "sed", 100 | "quae", 101 | "sint", 102 | "dolores", 103 | "rerum", 104 | "placeat", 105 | "et", 106 | "necessitatibus", 107 | "ratione", 108 | "odit", 109 | "omnis", 110 | "minima", 111 | "est", 112 | "delectus", 113 | "inventore", 114 | "itaque", 115 | "est", 116 | "et", 117 | "debitis", 118 | "laudantium", 119 | "sit", 120 | "at", 121 | "sit", 122 | "enim", 123 | "eius", 124 | "commodi", 125 | "omnis", 126 | "non", 127 | "ea", 128 | "quo", 129 | "rerum", 130 | "libero", 131 | "occaecati", 132 | "aut", 133 | "aperiam", 134 | "pariatur", 135 | "voluptate", 136 | "a", 137 | "officia", 138 | "optio", 139 | "quibusdam", 140 | "magnam", 141 | "neque", 142 | "eaque", 143 | "molestiae", 144 | "consectetur", 145 | "consectetur", 146 | "voluptatum", 147 | "nobis", 148 | "nam", 149 | "iste", 150 | "voluptatibus", 151 | "fugiat", 152 | "aliquid", 153 | "totam", 154 | "iure", 155 | "et", 156 | "repellat", 157 | "quis", 158 | "est", 159 | "quae", 160 | "fugiat", 161 | "minus", 162 | "asperiores", 163 | "dolorem", 164 | "magni", 165 | "nemo", 166 | "qui", 167 | "aliquam", 168 | "officia", 169 | "soluta", 170 | "distinctio", 171 | "consequuntur", 172 | "libero", 173 | "error", 174 | "quasi", 175 | "adipisci", 176 | "ullam", 177 | "reprehenderit", 178 | "laborum", 179 | "illo", 180 | "animi", 181 | "repellendus", 182 | "perferendis", 183 | "in", 184 | "fuga", 185 | "numquam", 186 | "ullam", 187 | "voluptatem", 188 | "facilis", 189 | "eum", 190 | "voluptas", 191 | "illo", 192 | "praesentium", 193 | "doloremque", 194 | "qui", 195 | "odio", 196 | "a", 197 | "dolores", 198 | "eum", 
199 | "aliquid", 200 | "sit", 201 | "et", 202 | "consequatur", 203 | "et", 204 | "aspernatur", 205 | "error", 206 | "voluptas", 207 | "impedit", 208 | "numquam", 209 | "quia", 210 | "nesciunt", 211 | "et", 212 | "sed", 213 | "eveniet", 214 | "et", 215 | "voluptas", 216 | "laudantium", 217 | "consequuntur", 218 | "consequatur", 219 | "rerum", 220 | "aut", 221 | "aperiam", 222 | "accusamus", 223 | "praesentium", 224 | "similique", 225 | "vitae", 226 | "odio", 227 | "debitis", 228 | "dolor", 229 | "tempora", 230 | "quaerat", 231 | "sint", 232 | "voluptatum", 233 | "est", 234 | "qui", 235 | "unde", 236 | "eos", 237 | "non", 238 | "ut", 239 | "non", 240 | "dolore", 241 | "vitae", 242 | "est", 243 | "qui", 244 | "ut", 245 | "id", 246 | "accusantium", 247 | "iusto", 248 | "accusantium", 249 | "mollitia", 250 | "velit", 251 | "expedita", 252 | "consequatur", 253 | "et", 254 | "ipsam", 255 | "repudiandae", 256 | "sit", 257 | "vel", 258 | "non", 259 | "et", 260 | "beatae", 261 | "voluptatem", 262 | "aut", 263 | "provident", 264 | "ea", 265 | "sit", 266 | "natus", 267 | "odit", 268 | "sed", 269 | "facilis", 270 | "cum", 271 | "in", 272 | "excepturi", 273 | "amet", 274 | "assumenda", 275 | "inventore", 276 | "eos", 277 | "temporibus", 278 | "deserunt", 279 | "earum", 280 | "ducimus", 281 | "consequatur", 282 | "modi", 283 | "nam", 284 | "voluptate", 285 | "aut", 286 | "quia", 287 | "sunt", 288 | "qui", 289 | "possimus", 290 | "voluptatem", 291 | "ratione", 292 | "aut", 293 | "quia", 294 | "voluptas", 295 | "autem", 296 | "perspiciatis", 297 | "repellendus", 298 | "voluptates", 299 | "corporis", 300 | "quisquam", 301 | "nobis", 302 | "quas", 303 | "facere", 304 | "quo", 305 | "dolorem", 306 | "voluptatem", 307 | "sed", 308 | "ipsum", 309 | "explicabo", 310 | "exercitationem", 311 | "harum", 312 | "ut", 313 | "omnis", 314 | "molestias", 315 | "ipsa", 316 | "vel", 317 | "quis", 318 | "esse", 319 | "culpa", 320 | "tenetur", 321 | "quia", 322 | "temporibus", 323 | "eos", 324 | "totam", 325 | "similique", 326 | "sunt", 327 | "quia", 328 | "maxime", 329 | "consequatur", 330 | "unde", 331 | "suscipit", 332 | "blanditiis", 333 | "molestiae", 334 | "modi", 335 | "optio", 336 | "illum", 337 | "ex", 338 | "labore", 339 | "quisquam", 340 | "placeat", 341 | "eum", 342 | "cumque", 343 | "sapiente", 344 | "aut", 345 | "hic", 346 | "recusandae", 347 | "voluptatibus", 348 | "deserunt", 349 | "et", 350 | "minima", 351 | "sunt", 352 | "distinctio", 353 | "eligendi", 354 | "enim", 355 | "nesciunt", 356 | "beatae", 357 | "molestias", 358 | "officiis", 359 | "esse", 360 | "corrupti", 361 | "quia", 362 | "amet", 363 | "enim", 364 | "porro", 365 | "alias", 366 | "velit", 367 | "ipsa", 368 | "iure", 369 | "at", 370 | "nulla", 371 | "dicta", 372 | "quos", 373 | "provident", 374 | "fugit", 375 | "sint", 376 | "et", 377 | "quis", 378 | "blanditiis", 379 | "ea", 380 | "velit", 381 | "maxime", 382 | "nihil", 383 | "culpa", 384 | "architecto", 385 | "suscipit", 386 | "illum", 387 | "dolorem", 388 | "autem", 389 | "rem", 390 | "doloribus", 391 | "atque", 392 | "aut", 393 | "reiciendis", 394 | "ut", 395 | "quam", 396 | "ducimus", 397 | "autem", 398 | "dolor", 399 | "quo", 400 | "est", 401 | "autem" 402 | ] 403 | 404 | ExLSH.lsh(clean, 3, &ExLSH.default_hash/1, & &1, & &1) 405 | -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | use Mix.Config 2 | 
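A note on the bench.exs script above: `clean` is already a list of lowercased tokens, so the script passes `& &1` (the identity function) as both normalizer and tokenizer and exercises only shingling, hashing and vector aggregation. It also performs a single `ExLSH.lsh` call; the actual timing harness lives on the project's benchmark branch. Below is a minimal sketch of how such a run could be wired into Benchee, assuming a `:benchee` dev dependency is added. The job names, options and corpus construction are illustrative, not the real benchmark setup.

```elixir
# Hypothetical Benchee harness, illustrative only; the real setup lives on the
# project's benchmark branch. Assumes a dev dependency such as
# {:benchee, "~> 0.14", only: :dev} in mix.exs.

clean =
  "qui eos ad qui ab laboriosam ut consequatur dolor est fugit voluptatem "
  |> String.duplicate(50)
  |> String.split()

Benchee.run(
  %{
    # pre-tokenized input with identity normalizer/tokenizer, as in bench.exs
    "ExLSH (pre-tokenized)" => fn ->
      ExLSH.lsh(clean, 3, &ExLSH.default_hash/1, & &1, & &1)
    end,
    # full pipeline starting from a raw string
    "ExLSH (raw string)" => fn -> clean |> Enum.join(" ") |> ExLSH.lsh() end
  },
  warmup: 2,
  time: 5
)
```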
--------------------------------------------------------------------------------
/lib/bitmacro.ex:
--------------------------------------------------------------------------------
1 | defmodule ExLSH.BitMacro do
2 |   @moduledoc false
3 | 
4 |   # The following code aggregates hashes of shingles onto a list of ints.
5 |   # To do this efficiently, we generate a set of functions that pattern-match
6 |   # on the individual bits as wide as possible. An example implementation for 2
7 |   # bits looks like this:
8 |   #
9 |   # def vector_reducer(
10 |   #       <<b0::1, b1::1, b_rest::bitstring>>,
11 |   #       [a0, a1 | a_rest]
12 |   #     ) do
13 |   #   [
14 |   #     a0 + b0 * 2 - 1,
15 |   #     a1 + b1 * 2 - 1 | vector_reducer(b_rest, a_rest)
16 |   #   ]
17 |   # end
18 |   # def vector_reducer(<<>>, []), do: []
19 |   #
20 |   # This would result in 64 recursions per shingle for a 128-bit hash. To speed
21 |   # things up, we try to match for as many bits as possible, and keep the
22 |   # recursion number low. Obviously, writing this out for more than 16 bits is
23 |   # unfeasible, so we have built a macro.
24 | 
25 |   @doc false
26 |   defmacro vector_reducer(bits) do
27 |     # match individual bits of the hash bitstring
28 |     bit_matches =
29 |       quote bind_quoted: [bits: bits] do
30 |         for i <- 0..(bits - 1) do
31 |           var = Macro.var(:"b#{i}", nil)
32 | 
33 |           quote do
34 |             <<unquote(var)::1>>
35 |           end
36 |         end
37 |       end
38 | 
39 |     # match individual elements of the accumulator list
40 |     acc_matches =
41 |       quote bind_quoted: [bits: bits] do
42 |         for i <- 0..(bits - 1) do
43 |           Macro.var(:"a#{i}", nil)
44 |         end
45 |       end
46 | 
47 |     # bitwise operation: adds 1 if the bit is 1, subtracts 1 if the bit is 0
48 |     addition =
49 |       quote bind_quoted: [bits: bits] do
50 |         for i <- 0..(bits - 1) do
51 |           bitvar = Macro.var(:"b#{i}", nil)
52 |           accvar = Macro.var(:"a#{i}", nil)
53 | 
54 |           quote do
55 |             unquote(accvar) + 2 * unquote(bitvar) - 1
56 |           end
57 |         end
58 |       end
59 | 
60 |     quote bind_quoted: [
61 |             bit_matches: bit_matches,
62 |             acc_matches: acc_matches,
63 |             addition: addition
64 |           ] do
65 |       defp vector_reducer(
66 |              <<unquote_splicing(bit_matches), bit_rest::bitstring>>,
67 |              [unquote_splicing(acc_matches) | acc_rest]
68 |            ) do
69 |         [unquote_splicing(addition) | vector_reducer(bit_rest, acc_rest)]
70 |       end
71 |     end
72 |   end
73 | end
74 | 
--------------------------------------------------------------------------------
/lib/ex_lsh.ex:
--------------------------------------------------------------------------------
1 | defmodule ExLSH do
2 |   @moduledoc """
3 |   Calculates a locality sensitive hash for text.
4 | 
5 |   ## Examples:
6 | 
7 |       iex> "Lorem ipsum dolor sit amet"
8 |       ...> |> ExLSH.lsh()
9 |       ...> |> Base.encode64()
10 |       "uX05itKaghA0gQHCwDCIFg=="
11 | 
12 |       iex> "Lorem ipsum dolor sit amet"
13 |       ...> |> ExLSH.lsh(2, &:crypto.hash(:sha, &1))
14 |       ...> |> Base.encode64()
15 |       "VhW06EEJyWQA1gKIAAlQgI4NHUE="
16 | 
17 |   """
18 | 
19 |   require ExLSH.BitMacro
20 | 
21 |   @spec lsh(
22 |           String.t(),
23 |           pos_integer,
24 |           (iodata() -> binary()),
25 |           (String.t() -> String.t()),
26 |           (String.t() -> list(String.t())),
27 |           (list(String.t()) -> list(String.t()))
28 |         ) :: binary
29 | 
30 |   @doc ~S"""
31 |   Compute an LSH/SimHash for a given text.
32 | 
33 |   Returns a non-printable `:binary` of the hash.
34 | 
35 |   ## The following parameters are configurable:
36 |   - `shingle_width`: if given 1, it will use the "bag of words" approach.
37 |     Given an int > 1, it will compute hashes of n-grams of the given width.
38 |   - `hasher`: a function that takes an iolist and returns its hash in a
39 |     `:binary`. LSH computation is significantly faster on shorter hashes. See
40 |     :crypto.supports()[:hashs] for all available hash functions on your
41 |     platform.
42 |   - `normalizer`: a function that takes a string and returns a normalized string
43 |   - `tokenizer`: a function that takes a normalized string and returns
44 |     tokens, e.g. graphemes or words
45 |   - `filter`: a function that filters a list of tokens, e.g. removes
46 |     stop-words, non-ASCII chars, etc.
47 | 
48 |   ## Examples:
49 | 
50 |       iex> "Lorem ipsum dolor sit amet"
51 |       ...> |> ExLSH.lsh()
52 |       ...> |> Base.encode64()
53 |       "uX05itKaghA0gQHCwDCIFg=="
54 | 
55 |       iex> "Lorem ipsum dolor sit amet"
56 |       ...> |> ExLSH.lsh(2, &:crypto.hash(:sha, &1))
57 |       ...> |> Base.encode64()
58 |       "VhW06EEJyWQA1gKIAAlQgI4NHUE="
59 | 
60 |   """
61 |   def lsh(
62 |         text,
63 |         shingle_width \\ 3,
64 |         hasher \\ &default_hash/1,
65 |         normalizer \\ &normalize/1,
66 |         tokenizer \\ &tokenize_words/1,
67 |         filter \\ &filter/1
68 |       ) do
69 |     hash_width = bit_size(hasher.("foo"))
70 | 
71 |     text
72 |     |> normalizer.()
73 |     |> tokenizer.()
74 |     |> filter.()
75 |     |> shingle(shingle_width)
76 |     |> Enum.map(hasher)
77 |     |> add_vectors(hash_width)
78 |     |> ints_to_bits()
79 |     |> bits_to_binary()
80 |   end
81 | 
82 |   @doc """
83 |   Compute an LSH for a piece of text, e.g. a document.
84 |   """
85 |   def wordwise_lsh(text, shingle_width \\ 3) do
86 |     lsh(text, shingle_width)
87 |   end
88 | 
89 |   @doc """
90 |   Compute an LSH for a short string, e.g. a username or email.
91 |   """
92 |   def charwise_lsh(text, shingle_width \\ 3) do
93 |     lsh(text, shingle_width, &default_hash/1, &normalize/1, &tokenize_chars/1)
94 |   end
95 | 
96 |   @doc """
97 |   Default text normalizer: unicode normalization, lower case, replace all
98 |   non-word chars with space, reduce consecutive spaces to one.
99 |   """
100 |   def normalize(text) do
101 |     text
102 |     |> String.normalize(:nfc)
103 |     |> String.downcase()
104 |     |> String.replace(~r/\W/, " ")
105 |     |> String.replace(~r/\W+/, " ")
106 |   end
107 | 
108 |   @doc """
109 |   Split a string into its Unicode graphemes.
110 |   """
111 |   def tokenize_chars(text), do: text |> String.graphemes()
112 | 
113 |   @doc """
114 |   Split a string into words.
115 |   """
116 |   def tokenize_words(text), do: text |> String.split()
117 | 
118 |   @doc """
119 |   A noop filter.
120 |   """
121 |   @spec filter(list(String.t())) :: list(String.t())
122 |   def filter(words), do: words
123 | 
124 |   # Converts a list of tokens into a list of overlapping lists.
125 |   defp shingle(words, n) do
126 |     Enum.chunk_every(words, n, 1, :discard)
127 |   end
128 | 
129 |   @doc """
130 |   Default hash: MD5, via `:erlang.md5/1`.
131 |   """
132 |   def default_hash(message) do
133 |     :erlang.md5(message)
134 |   end
135 | 
136 |   # Aggregate a list of binaries using a SimHash algorithm.
137 |   defp add_vectors(vectors, hash_width) do
138 |     acc = List.duplicate(0, hash_width)
139 |     Enum.reduce(vectors, acc, &vector_reducer/2)
140 |   end
141 | 
142 |   # Convert a list of ints to bits: positive ints become a 1, others: 0.
143 |   defp ints_to_bits([head | tail]) when head > 0, do: [1 | ints_to_bits(tail)]
144 |   defp ints_to_bits([_head | tail]), do: [0 | ints_to_bits(tail)]
145 |   defp ints_to_bits([]), do: []
146 | 
147 |   # Convert a list of bits represented by integers to a binary.
148 |   defp bits_to_binary(bits) do
149 |     bits
150 |     |> Enum.chunk_every(8)
151 |     |> Enum.map(&Integer.undigits(&1, 2))
152 |     |> :binary.list_to_bin()
153 |   end
154 | 
155 |   # Recursion base case
156 |   defp vector_reducer(<<>>, []), do: []
157 | 
158 |   # The following code aggregates hashes of shingles onto a list of ints.
159 |   # To do this efficiently, we generate a set of functions that pattern-match
160 |   # on the individual bits as wide as possible. An example implementation for 2
161 |   # bits looks like this:
162 |   #
163 |   # def vector_reducer(
164 |   #       <<b0::1, b1::1, b_rest::bitstring>>,
165 |   #       [a0, a1 | a_rest]
166 |   #     ) do
167 |   #   [
168 |   #     a0 + b0 * 2 - 1,
169 |   #     a1 + b1 * 2 - 1 | vector_reducer(b_rest, a_rest)
170 |   #   ]
171 |   # end
172 |   # def vector_reducer(<<>>, []), do: []
173 |   #
174 |   # This would result in 64 recursions per shingle for a 128-bit hash. To speed
175 |   # things up, we try to match for as many bits as possible, and keep the
176 |   # recursion number low. Obviously, writing this out for more than 16 bits is
177 |   # unfeasible, so we have built a macro, see `bitmacro.ex`.
178 |   for i <- [256, 128, 64, 32, 8] do
179 |     ExLSH.BitMacro.vector_reducer(i)
180 |   end
181 | end
182 | 
--------------------------------------------------------------------------------
/logo.svg:
--------------------------------------------------------------------------------
[ExLSH logo: SVG vector image (image/svg+xml); markup omitted from this dump]
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
1 | defmodule ExLSH.MixProject do
2 |   use Mix.Project
3 | 
4 |   def project do
5 |     [
6 |       app: :ex_lsh,
7 |       aliases: aliases(),
8 |       description: description(),
9 |       version: "0.5.0",
10 |       elixir: "~> 1.6",
11 |       start_permanent: Mix.env() == :prod,
12 |       deps: deps(),
13 |       name: "ExLSH",
14 |       source_url: "https://github.com/meltwater/ex_lsh",
15 |       homepage_url: "https://hexdocs.pm/ex_lsh",
16 |       docs: [
17 |         main: "ExLSH",
18 |         logo: "logo.svg",
19 |         extras: ["README.md"]
20 |       ],
21 |       package: package()
22 |     ]
23 |   end
24 | 
25 |   def application do
26 |     []
27 |   end
28 | 
29 |   defp deps do
30 |     [
31 |       {:ex_doc, "~> 0.18", only: :dev, runtime: false},
32 |       {:credo, "~> 1.0.0", only: [:dev, :test], runtime: false}
33 |     ]
34 |   end
35 | 
36 |   defp description() do
37 |     "ExLSH calculates a locality sensitive hash for text. It can be used for near-duplicate detection of text."
38 |   end
39 | 
40 |   defp package() do
41 |     [
42 |       licenses: ["Apache 2.0"],
43 |       links: %{"GitHub" => "https://github.com/meltwater/ex_lsh"}
44 |     ]
45 |   end
46 | 
47 |   defp aliases do
48 |     [
49 |       test: [
50 |         "format --check-formatted",
51 |         "credo --strict",
52 |         "test"
53 |       ]
54 |     ]
55 |   end
56 | end
57 | 
--------------------------------------------------------------------------------
/mix.lock:
--------------------------------------------------------------------------------
1 | %{
2 |   "bunt": {:hex, :bunt, "0.2.0", "951c6e801e8b1d2cbe58ebbd3e616a869061ddadcc4863d0a2182541acae9a38", [:mix], [], "hexpm"},
3 |   "credo": {:hex, :credo, "1.0.2", "88bc918f215168bf6ce7070610a6173c45c82f32baa08bdfc80bf58df2d103b6", [:mix], [{:bunt, "~> 0.2.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"},
4 |   "earmark": {:hex, :earmark, "1.3.1", "73812f447f7a42358d3ba79283cfa3075a7580a3a2ed457616d6517ac3738cb9", [:mix], [], "hexpm"},
5 |   "ex_doc": {:hex, :ex_doc, "0.19.3", "3c7b0f02851f5fc13b040e8e925051452e41248f685e40250d7e40b07b9f8c10", [:mix], [{:earmark, "~> 1.2", [hex: :earmark, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.10", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm"},
6 |   "jason": {:hex, :jason, "1.1.2", "b03dedea67a99223a2eaf9f1264ce37154564de899fd3d8b9a21b1a6fd64afe7", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm"},
7 |   "makeup": {:hex, :makeup, "0.8.0", "9cf32aea71c7fe0a4b2e9246c2c4978f9070257e5c9ce6d4a28ec450a839b55f", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm"},
8 |   "makeup_elixir": {:hex, :makeup_elixir, "0.13.0", "be7a477997dcac2e48a9d695ec730b2d22418292675c75aa2d34ba0909dcdeda", [:mix], [{:makeup, "~> 0.8", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm"},
9 |   "nimble_parsec": {:hex, :nimble_parsec, "0.5.0", "90e2eca3d0266e5c53f8fbe0079694740b9c91b6747f2b7e3c5d21966bba8300", [:mix], [], "hexpm"},
10 | }
11 | 
--------------------------------------------------------------------------------
/test/ex_lsh_test.exs:
--------------------------------------------------------------------------------
1 | defmodule ExLSHTest do
2 |   use ExUnit.Case
3 |   doctest ExLSH
4 |   require IEx
5 | 
6 |   test "Bag of words is independent of word order" do
7 |     assert ExLSH.wordwise_lsh("foo bar baz", 1) == ExLSH.wordwise_lsh("foo baz bar", 1)
8 |   end
9 | 
10 |   test "Repeating the phrase doesn't affect the hash" do
11 |     s1 = repeat("foo bar baz", 100)
12 |     s2 = repeat("foo bar baz", 200)
13 |     assert ExLSH.wordwise_lsh(s1) == ExLSH.wordwise_lsh(s2)
14 |   end
15 | 
16 |   test "Works with CRC32" do
17 |     s1 = repeat("foo bar baz", 100)
18 |     s2 = repeat("foo bar baz", 200)
19 |     crc32 = fn s -> s |> :erlang.crc32() |> :binary.encode_unsigned() end
20 |     assert ExLSH.lsh(s1, 3, crc32) == ExLSH.lsh(s2, 3, crc32)
21 |   end
22 | 
23 |   def similarity(hash1, hash2) do
24 |     1.0 - hamming_distance(hash_to_bin(hash1), hash_to_bin(hash2)) / length(hash_to_bin(hash1))
25 |   end
26 | 
27 |   def hamming_distance([bit1 | rest1], [bit2 | rest2]) do
28 |     if(bit1 == bit2, do: 0, else: 1) + hamming_distance(rest1, rest2)
29 |   end
30 | 
31 |   def hamming_distance([], []), do: 0
32 | 
33 |   def hash_to_bin(
34 |         <<b0::size(1), b1::size(1), b2::size(1), b3::size(1), b4::size(1), b5::size(1),
35 |           b6::size(1), b7::size(1), rest::bitstring>>
36 |       ) do
37 |     [b0, b1, b2, b3, b4, b5, b6, b7] ++ hash_to_bin(rest)
38 |   end
39 | 
40 |   def hash_to_bin(<<>>), do: []
41 | 
42 |   def repeat(s, times) do
43 |     s |> List.duplicate(times) |> Enum.join(" ")
44 |   end
45 | end
46 | 
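The `similarity/2`, `hamming_distance/2` and `hash_to_bin/1` helpers defined in ExLSHTest above are not exercised by any of its tests. A sketch of how they could back assertions follows; these are hypothetical additions to the module, not part of the original suite, and the second assertion encodes an expectation that SimHash does not strictly guarantee.

```elixir
# Hypothetical tests inside ExLSHTest, reusing the helpers defined above.
test "similarity/2 scores hashes of equivalent inputs as 1.0" do
  h1 = ExLSH.wordwise_lsh(repeat("foo bar baz", 100))
  h2 = ExLSH.wordwise_lsh(repeat("foo bar baz", 200))
  # same shingle set, so identical hashes and a Hamming distance of 0
  assert similarity(h1, h2) == 1.0
end

test "near-duplicates score higher than unrelated text (illustrative)" do
  base = ExLSH.lsh(repeat("the quick brown fox jumps over the lazy dog", 10))
  near = ExLSH.lsh(repeat("the quick brown fox jumps over the lazy cat", 10))
  far = ExLSH.lsh(repeat("an entirely different sentence about other topics", 10))
  # expected for inputs like these, though not a hard guarantee of SimHash
  assert similarity(base, near) > similarity(base, far)
end
```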
-------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | --------------------------------------------------------------------------------
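The `lsh/6` docs in lib/ex_lsh.ex describe the hasher, normalizer, tokenizer and filter as pluggable. A sketch of application code that swaps in SHA-1 and a naive stop-word filter is shown below; the module name, function name and stop-word list are illustrative assumptions, not part of ExLSH.

```elixir
defmodule MyApp.Dedup do
  # Illustrative wrapper around ExLSH.lsh/6. The stop-word list and the module
  # and function names here are hypothetical examples, not part of the library.
  @stop_words ~w(a an and are as at be by for from in is it of on or the to)

  @doc "160-bit SHA-1 based LSH with a simple stop-word filter."
  def fingerprint(text) do
    ExLSH.lsh(
      text,
      3,
      &:crypto.hash(:sha, &1),
      &ExLSH.normalize/1,
      &ExLSH.tokenize_words/1,
      fn tokens -> Enum.reject(tokens, &(&1 in @stop_words)) end
    )
  end
end

# MyApp.Dedup.fingerprint("Lorem ipsum dolor sit amet") |> Base.encode64()
```

Since the aggregation step walks one accumulator entry per hash bit, a shorter hash such as the CRC32 variant used in the test suite trades hash quality for speed, as the `hasher` documentation in lib/ex_lsh.ex points out.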