├── config └── config.exs ├── native └── simhash │ ├── .gitignore │ ├── README.md │ ├── .cargo │ └── config.toml │ ├── Cargo.toml │ ├── src │ ├── lib.rs │ └── simhash_algo.rs │ └── Cargo.lock ├── test ├── test_helper.exs └── simhash_test.exs ├── .tool-versions ├── logo.png ├── .DS_Store ├── lib ├── spirit_fingers.ex ├── mix │ └── tasks │ │ └── test.rust.ex └── simhash.ex ├── logo ├── Spirit Fingers - Logo.ai ├── Spirit Fingers - Logo.eps ├── Spirit Fingers - Logo.jpg ├── Spirit Fingers - Logo.png └── Spirit Fingers - Logo-01.eps ├── .formatter.exs ├── .dialyzer_ignore.exs ├── .gitignore ├── LICENSE ├── README.md ├── mix.exs ├── mix.lock └── .github └── workflows └── ci.yml /config/config.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | -------------------------------------------------------------------------------- /native/simhash/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /test/test_helper.exs: -------------------------------------------------------------------------------- 1 | ExUnit.start() 2 | -------------------------------------------------------------------------------- /.tool-versions: -------------------------------------------------------------------------------- 1 | rust 1.90.0 2 | elixir 1.19.1-otp-28 3 | erlang 28.1.1 4 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo.png -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/.DS_Store 
-------------------------------------------------------------------------------- /lib/spirit_fingers.ex: -------------------------------------------------------------------------------- 1 | defmodule SpiritFingers do 2 | @moduledoc "See: `SpiritFingers.SimHash`." 3 | end 4 | -------------------------------------------------------------------------------- /logo/Spirit Fingers - Logo.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.ai -------------------------------------------------------------------------------- /logo/Spirit Fingers - Logo.eps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.eps -------------------------------------------------------------------------------- /logo/Spirit Fingers - Logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.jpg -------------------------------------------------------------------------------- /logo/Spirit Fingers - Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.png -------------------------------------------------------------------------------- /logo/Spirit Fingers - Logo-01.eps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo-01.eps -------------------------------------------------------------------------------- /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 
4 | ] 5 | -------------------------------------------------------------------------------- /native/simhash/README.md: -------------------------------------------------------------------------------- 1 | # NIF for Elixir.SpiritFingers.SimHash 2 | 3 | Rust NIF wrapping: https://github.com/holsee/simhash-rs 4 | -------------------------------------------------------------------------------- /test/simhash_test.exs: -------------------------------------------------------------------------------- 1 | defmodule SpiritFingers.SimHashTest do 2 | use ExUnit.Case 3 | doctest SpiritFingers.SimHash 4 | end 5 | -------------------------------------------------------------------------------- /.dialyzer_ignore.exs: -------------------------------------------------------------------------------- 1 | [ 2 | # Mix tasks are build-time only and don't need runtime type checking 3 | ~r/lib\/mix\/tasks\// 4 | ] 5 | -------------------------------------------------------------------------------- /native/simhash/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.'cfg(target_os = "macos")'] 2 | rustflags = [ 3 | "-C", "link-arg=-undefined", 4 | "-C", "link-arg=dynamic_lookup", 5 | ] 6 | -------------------------------------------------------------------------------- /native/simhash/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spirit_fingers_simhash" 3 | version = "0.4.1" 4 | authors = [] 5 | edition = "2021" 6 | 7 | [lib] 8 | name = "spirit_fingers_simhash" 9 | path = "src/lib.rs" 10 | crate-type = ["cdylib"] 11 | 12 | [dependencies] 13 | rustler = { version = "0.37", default-features = true } 14 | siphasher = "0.3" 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /native/simhash/target/ 2 | 3 | # The directory Mix will write 
compiled artifacts to. 4 | /_build/ 5 | 6 | # If you run "mix test --cover", coverage assets end up here. 7 | /cover/ 8 | 9 | # The directory Mix downloads your dependencies sources to. 10 | /deps/ 11 | 12 | # Where 3rd-party dependencies like ExDoc output generated docs. 13 | /doc/ 14 | 15 | # Ignore .fetch files in case you like to edit your project deps locally. 16 | /.fetch 17 | 18 | # If the VM crashes, it generates a dump, let's ignore it too. 19 | erl_crash.dump 20 | 21 | # Also ignore archive artifacts (built via "mix archive.build"). 22 | *.ez 23 | 24 | # Ignore package tarball (built via "mix hex.build"). 25 | spirit_fingers-*.tar 26 | 27 | # Ignore rust artifacts 28 | **/*.so 29 | -------------------------------------------------------------------------------- /lib/mix/tasks/test.rust.ex: -------------------------------------------------------------------------------- 1 | defmodule Mix.Tasks.Test.Rust do 2 | @moduledoc """ 3 | Runs Rust unit tests for the native simhash crate. 4 | 5 | ## Usage 6 | 7 | mix test.rust 8 | 9 | This task runs `cargo test` in the native/simhash directory. 10 | """ 11 | @shortdoc "Run Rust unit tests" 12 | 13 | use Mix.Task 14 | 15 | @impl Mix.Task 16 | def run(_args) do 17 | crate_path = Path.join([File.cwd!(), "native", "simhash"]) 18 | 19 | if File.dir?(crate_path) do 20 | Mix.shell().info("Running Rust tests in #{crate_path}...") 21 | 22 | case System.cmd("cargo", ["test"], cd: crate_path, into: IO.stream(:stdio, :line)) do 23 | {_, 0} -> 24 | Mix.shell().info("\nRust tests passed! 
✓") 25 | :ok 26 | 27 | {_, exit_code} -> 28 | Mix.shell().error("\nRust tests failed with exit code #{exit_code}") 29 | Mix.raise("Rust tests failed") 30 | end 31 | else 32 | Mix.raise("Rust crate directory not found at #{crate_path}") 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2018 Steven Holdsworth 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SpiritFingers 2 | 3 | [![Hex.pm Version](https://img.shields.io/hexpm/v/spirit_fingers.svg)](https://hex.pm/packages/spirit_fingers) 4 | [![Hex.pm Downloads](https://img.shields.io/hexpm/dt/spirit_fingers.svg)](https://hex.pm/packages/spirit_fingers) 5 | [![Documentation](https://img.shields.io/badge/docs-hexpm-blue.svg)](https://hexdocs.pm/spirit_fingers) 6 | [![CI](https://github.com/holsee/spirit_fingers/actions/workflows/ci.yml/badge.svg)](https://github.com/holsee/spirit_fingers/actions/workflows/ci.yml) 7 | [![License](https://img.shields.io/hexpm/l/spirit_fingers.svg)](https://github.com/holsee/spirit_fingers/blob/master/LICENSE) 8 | 9 | "Fast SimHash NIFs written in Rust 🐇💨 as Erlang/Elixir versions were too slow 🐢" 10 | 11 |

12 | logo 13 |

14 | 15 | * [Hex Package](https://hex.pm/packages/spirit_fingers). 16 | * [Documentation](https://hexdocs.pm/spirit_fingers). 17 | * [Simhash Benchmarks](https://github.com/holsee/simhash_benchmarks) TL;DR 400-900x faster, orders of magnitude more memory efficient and handles large binaries where others cannot. 18 | 19 | ## Build 20 | 21 | ``` 22 | mix compile 23 | ``` 24 | 25 | ## Test 26 | 27 | ``` 28 | mix test 29 | ``` 30 | 31 | ## Versions 32 | 33 | * Elixir ~> 1.15 34 | * OTP 28 35 | * Rust 2021 ~> 1.90.0 36 | * Rustler 0.37 37 | 38 | ## Installation 39 | 40 | Add `spirit_fingers` to your list of dependencies in `mix.exs`: 41 | 42 | ```elixir 43 | def deps do 44 | [ 45 | {:spirit_fingers, "~> 0.5"} 46 | ] 47 | end 48 | ``` 49 | 50 | ## Usage 51 | 52 | Generate SimHash: 53 | ``` elixir 54 | SpiritFingers.SimHash.similarity_hash("wow") 55 | {:ok, 17399923637769257768} 56 | ``` 57 | 58 | Similarity between strings: 59 | ``` elixir 60 | SpiritFingers.SimHash.similarity("Hocus pocus", "Hocus pocus pilatus pas") 61 | {:ok, 0.9375} 62 | ``` 63 | 64 | Hamming Distance between hashes: 65 | ``` elixir 66 | SpiritFingers.SimHash.hamming_distance(17399923637769257768, 17399923637769257768) 67 | {:ok, 0} 68 | ``` 69 | 70 | Similarity between hashes: 71 | ``` elixir 72 | SpiritFingers.SimHash.hash_similarity(17399923637769257768, 17399923637769257768) 73 | {:ok, 1.0} 74 | ``` 75 | 76 | -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule SpiritFingers.MixProject do 2 | use Mix.Project 3 | 4 | def project do 5 | [ 6 | app: :spirit_fingers, 7 | version: "0.5.1", 8 | elixir: "~> 1.15", 9 | start_permanent: Mix.env() == :prod, 10 | deps: deps(), 11 | aliases: aliases(), 12 | dialyzer: dialyzer(), 13 | name: "SpiritFingers", 14 | source_url: "https://github.com/holsee/spirit_fingers", 15 | homepage_url: 
"https://hex.pm/packages/spirit_fingers", 16 | docs: [main: "SpiritFingers", logo: "logo.png", extras: ["README.md"]], 17 | package: package(), 18 | description: description() 19 | ] 20 | end 21 | 22 | # Run "mix help compile.app" to learn about applications. 23 | def application do 24 | [ 25 | extra_applications: [:logger] 26 | ] 27 | end 28 | 29 | # Run "mix help deps" to learn about dependencies. 30 | defp deps do 31 | [ 32 | {:rustler, "~> 0.37.1"}, 33 | {:ex_doc, "~> 0.34", only: :dev, runtime: false}, 34 | {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false}, 35 | {:credo, "~> 1.7", only: [:dev, :test], runtime: false} 36 | ] 37 | end 38 | 39 | defp package() do 40 | [ 41 | name: "spirit_fingers", 42 | files: ~w( 43 | config 44 | lib 45 | native/simhash/.cargo 46 | native/simhash/src 47 | native/simhash/Cargo.* 48 | native/simhash/README.md 49 | mix.exs 50 | README.md 51 | LICENSE 52 | logo.png 53 | ), 54 | links: %{ 55 | "GitHub" => "https://github.com/holsee/spirit_fingers" 56 | }, 57 | maintainers: ["Steven Holdsworth (@holsee)"], 58 | licenses: ["MIT"] 59 | ] 60 | end 61 | 62 | defp description() do 63 | "Fast SimHash NIFs written in Rust 🐇💨 as Erlang/Elixir versions were too slow 🐢" 64 | end 65 | 66 | defp dialyzer do 67 | [ 68 | plt_add_apps: [:mix], 69 | ignore_warnings: ".dialyzer_ignore.exs" 70 | ] 71 | end 72 | 73 | defp aliases do 74 | [ 75 | "test.all": ["test", "test.rust"] 76 | ] 77 | end 78 | 79 | def cli do 80 | [ 81 | preferred_envs: [ 82 | "test.all": :test, 83 | "test.rust": :test 84 | ] 85 | ] 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /native/simhash/src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2023 Steven Holdsworth (@holsee) 2 | // 3 | // Licensed under the MIT License 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining a copy 6 | // of this software and associated 
documentation files (the "Software"), to deal 7 | // in the Software without restriction, including without limitation the rights 8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | // copies of the Software, and to permit persons to whom the Software is 10 | // furnished to do so, subject to the following conditions: 11 | // 12 | // The above copyright notice and this permission notice shall be included in all 13 | // copies or substantial portions of the Software. 14 | // 15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | // SOFTWARE. 22 | 23 | use rustler::{Atom, Error}; 24 | 25 | mod simhash_algo; 26 | 27 | mod atoms { 28 | rustler::atoms! 
{ 29 | ok, 30 | error 31 | } 32 | } 33 | 34 | #[rustler::nif] 35 | fn similarity_hash(text: &str) -> Result<(Atom, u64), Error> { 36 | let hash: u64 = simhash_algo::simhash(text); 37 | Ok((atoms::ok(), hash)) 38 | } 39 | 40 | #[rustler::nif] 41 | fn hamming_distance(hash0: u64, hash1: u64) -> Result<(Atom, u32), Error> { 42 | let ham_dist: u32 = simhash_algo::hamming_distance(hash0, hash1); 43 | Ok((atoms::ok(), ham_dist)) 44 | } 45 | 46 | #[rustler::nif] 47 | fn hash_similarity(hash0: u64, hash1: u64) -> Result<(Atom, f64), Error> { 48 | let hash_similarity = simhash_algo::hash_similarity(hash0, hash1); 49 | Ok((atoms::ok(), hash_similarity)) 50 | } 51 | 52 | #[rustler::nif] 53 | fn similarity(text0: &str, text1: &str) -> Result<(Atom, f64), Error> { 54 | let similarity: f64 = simhash_algo::similarity(text0, text1); 55 | Ok((atoms::ok(), similarity)) 56 | } 57 | 58 | rustler::init!("Elixir.SpiritFingers.SimHash"); 59 | -------------------------------------------------------------------------------- /lib/simhash.ex: -------------------------------------------------------------------------------- 1 | defmodule SpiritFingers.SimHash do 2 | @moduledoc """ 3 | SimHash Module which delegates to Rust NIFs which will 4 | perform the hashing, similarity and distance calculations. 5 | """ 6 | use Rustler, 7 | otp_app: :spirit_fingers, 8 | crate: "spirit_fingers_simhash", 9 | path: "native/simhash", 10 | mode: :release 11 | 12 | @typedoc "unsigned 64 bit integer representation of simhash" 13 | @type t :: pos_integer() 14 | 15 | @typedoc """ 16 | Similarity between two `SimHash.t`, represented as a value 17 | between 0.0 and 1.0. 18 | * `0.0` means no similarity, 19 | * `1.0` means identical. 20 | """ 21 | @type similarity :: float() 22 | 23 | @typedoc """ 24 | 64 bit floating point representation of the 25 | [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance) 26 | between 2 `SimHash.t`. 
27 | """ 28 | @type distance :: float() 29 | 30 | @doc """ 31 | Calculate `SimHash.t` split by whitespace. 32 | 33 | ## Examples 34 | 35 | iex> SpiritFingers.SimHash.similarity_hash("The cat sat on the mat") 36 | {:ok, 2595200813813010837} 37 | 38 | iex> SpiritFingers.SimHash.similarity_hash("The cat sat under the mat") 39 | {:ok, 2595269945604666783} 40 | 41 | iex> SpiritFingers.SimHash.similarity_hash("Why the lucky stiff") 42 | {:ok, 1155526875459215761} 43 | """ 44 | @spec similarity_hash(binary()) :: {:ok, t()} 45 | def similarity_hash(_bin), do: :erlang.nif_error(:nif_not_loaded) 46 | 47 | @doc """ 48 | Bitwise hamming distance of two `SimHash.t` hashes 49 | 50 | ## Examples 51 | 52 | iex> SpiritFingers.SimHash.hamming_distance(0, 0) 53 | {:ok, 0} 54 | 55 | iex> SpiritFingers.SimHash.hamming_distance(0b1111111, 0b0000000) 56 | {:ok, 7} 57 | 58 | iex> SpiritFingers.SimHash.hamming_distance(0b0100101, 0b1100110) 59 | {:ok, 3} 60 | """ 61 | @spec hamming_distance(t(), t()) :: {:ok, distance()} 62 | def hamming_distance(_hash0, _hash1), do: :erlang.nif_error(:nif_not_loaded) 63 | 64 | @doc """ 65 | Calculate similarity as `SimHash.similarity` of two hashes. 66 | `0.0` means no similarity, `1.0` means identical. 67 | 68 | ## Examples 69 | 70 | iex> SpiritFingers.SimHash.hash_similarity(0, 0) 71 | {:ok, 1.0} 72 | 73 | iex> SpiritFingers.SimHash.hash_similarity(0xFFFFFFFFFFFFFFFF, 0) 74 | {:ok, 0.0} 75 | 76 | iex> SpiritFingers.SimHash.hash_similarity(0xFFFFFFFF, 0) 77 | {:ok, 0.5} 78 | """ 79 | @spec hash_similarity(t(), t()) :: {:ok, similarity()} 80 | def hash_similarity(_hash0, _hash1), do: :erlang.nif_error(:nif_not_loaded) 81 | 82 | @doc """ 83 | Calculate similarity `SimHash.similarity` of two string slices split by whitespace by simhash. 
84 | 85 | ## Examples 86 | 87 | iex> SpiritFingers.SimHash.similarity("Stop hammertime", "Stop hammertime") 88 | {:ok, 1.0} 89 | 90 | iex> SpiritFingers.SimHash.similarity("Hocus pocus", "Hocus pocus pilatus pas") 91 | {:ok, 0.9375} 92 | 93 | iex> SpiritFingers.SimHash.similarity("Peanut butter", "Strawberry cocktail") 94 | {:ok, 0.59375} 95 | """ 96 | @spec similarity(binary(), binary()) :: {:ok, similarity()} 97 | def similarity(_text0, _text1), do: :erlang.nif_error(:nif_not_loaded) 98 | end 99 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"}, 3 | "credo": {:hex, :credo, "1.7.13", "126a0697df6b7b71cd18c81bc92335297839a806b6f62b61d417500d1070ff4e", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "47641e6d2bbff1e241e87695b29f617f1a8f912adea34296fb10ecc3d7e9e84f"}, 4 | "dialyxir": {:hex, :dialyxir, "1.4.6", "7cca478334bf8307e968664343cbdb432ee95b4b68a9cba95bdabb0ad5bdfd9a", [:mix], [{:erlex, ">= 0.2.7", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "8cf5615c5cd4c2da6c501faae642839c8405b49f8aa057ad4ae401cb808ef64d"}, 5 | "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, 6 | "erlex": {:hex, :erlex, "0.2.7", "810e8725f96ab74d17aac676e748627a07bc87eb950d2b83acd29dc047a30595", [:mix], [], "hexpm", "3ed95f79d1a844c3f6bf0cea61e0d5612a42ce56da9c03f01df538685365efb0"}, 7 | "ex_doc": 
{:hex, :ex_doc, "0.38.4", "ab48dff7a8af84226bf23baddcdda329f467255d924380a0cf0cee97bb9a9ede", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "f7b62346408a83911c2580154e35613eb314e0278aeea72ed7fedef9c1f165b2"}, 8 | "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"}, 9 | "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, 10 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, 11 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, 12 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", 
"af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, 13 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, 14 | "rustler": {:hex, :rustler, "0.37.1", "721434020c7f6f8e1cdc57f44f75c490435b01de96384f8ccb96043f12e8a7e0", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "24547e9b8640cf00e6a2071acb710f3e12ce0346692e45098d84d45cdb54fd79"}, 15 | } 16 | -------------------------------------------------------------------------------- /native/simhash/Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "cfg-if" 7 | version = "1.0.4" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" 10 | 11 | [[package]] 12 | name = "heck" 13 | version = "0.5.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 16 | 17 | [[package]] 18 | name = "inventory" 19 | version = "0.3.21" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e" 22 | dependencies = [ 23 | "rustversion", 24 | ] 25 | 26 | [[package]] 27 | name = "libloading" 28 | version = "0.8.9" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" 31 | dependencies = [ 32 | "cfg-if", 33 | "windows-link", 34 | ] 35 | 36 | [[package]] 37 | name = "proc-macro2" 38 | version = "1.0.101" 39 | source = "registry+https://github.com/rust-lang/crates.io-index" 40 
| checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" 41 | dependencies = [ 42 | "unicode-ident", 43 | ] 44 | 45 | [[package]] 46 | name = "quote" 47 | version = "1.0.41" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1" 50 | dependencies = [ 51 | "proc-macro2", 52 | ] 53 | 54 | [[package]] 55 | name = "regex-lite" 56 | version = "0.1.8" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" 59 | 60 | [[package]] 61 | name = "rustler" 62 | version = "0.37.0" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "fb867bb35b291ef105abbe0a0d04bd4d7af372e023d08845698687bc254f222b" 65 | dependencies = [ 66 | "inventory", 67 | "libloading", 68 | "regex-lite", 69 | "rustler_codegen", 70 | ] 71 | 72 | [[package]] 73 | name = "rustler_codegen" 74 | version = "0.37.0" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "90993223c5ac0fb580ff966fb9477289c4e8a610a2f4639912a2639c5e7b5095" 77 | dependencies = [ 78 | "heck", 79 | "inventory", 80 | "proc-macro2", 81 | "quote", 82 | "syn", 83 | ] 84 | 85 | [[package]] 86 | name = "rustversion" 87 | version = "1.0.22" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" 90 | 91 | [[package]] 92 | name = "siphasher" 93 | version = "0.3.11" 94 | source = "registry+https://github.com/rust-lang/crates.io-index" 95 | checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" 96 | 97 | [[package]] 98 | name = "spirit_fingers_simhash" 99 | version = "0.4.1" 100 | dependencies = [ 101 | "rustler", 102 | "siphasher", 103 | ] 104 | 105 | [[package]] 106 | name = "syn" 107 | version = "2.0.107" 108 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 109 | checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b" 110 | dependencies = [ 111 | "proc-macro2", 112 | "quote", 113 | "unicode-ident", 114 | ] 115 | 116 | [[package]] 117 | name = "unicode-ident" 118 | version = "1.0.20" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06" 121 | 122 | [[package]] 123 | name = "windows-link" 124 | version = "0.2.1" 125 | source = "registry+https://github.com/rust-lang/crates.io-index" 126 | checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" 127 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | env: 10 | MIX_ENV: test 11 | 12 | jobs: 13 | test: 14 | name: Test (Elixir ${{matrix.elixir}} | OTP ${{matrix.otp}}) 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | elixir: ['1.19.1'] 20 | otp: ['28.1.1'] 21 | 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v4 25 | 26 | - name: Set up Elixir 27 | uses: erlef/setup-beam@v1 28 | with: 29 | elixir-version: ${{matrix.elixir}} 30 | otp-version: ${{matrix.otp}} 31 | 32 | - name: Set up Rust 33 | uses: dtolnay/rust-toolchain@1.90.0 34 | 35 | - name: Cache Mix dependencies 36 | uses: actions/cache@v4 37 | with: 38 | path: deps 39 | key: ${{ runner.os }}-mix-deps-${{ hashFiles('**/mix.lock') }} 40 | restore-keys: | 41 | ${{ runner.os }}-mix-deps- 42 | 43 | - name: Cache Cargo 44 | uses: actions/cache@v4 45 | with: 46 | path: | 47 | ~/.cargo/bin/ 48 | ~/.cargo/registry/index/ 49 | ~/.cargo/registry/cache/ 50 | ~/.cargo/git/db/ 51 | native/simhash/target/ 52 | key: ${{ runner.os }}-cargo-${{ 
hashFiles('**/Cargo.lock') }} 53 | restore-keys: | 54 | ${{ runner.os }}-cargo- 55 | 56 | - name: Install dependencies 57 | run: mix deps.get 58 | 59 | - name: Compile dependencies 60 | run: mix deps.compile 61 | 62 | - name: Compile project (including Rust NIF) 63 | run: mix compile --warnings-as-errors 64 | 65 | - name: Run Elixir tests 66 | run: mix test 67 | 68 | - name: Run Rust tests 69 | run: mix test.rust 70 | 71 | quality: 72 | name: Code Quality 73 | runs-on: ubuntu-latest 74 | 75 | strategy: 76 | matrix: 77 | elixir: ['1.19.1'] 78 | otp: ['28.1.1'] 79 | 80 | steps: 81 | - name: Checkout code 82 | uses: actions/checkout@v4 83 | 84 | - name: Set up Elixir 85 | uses: erlef/setup-beam@v1 86 | with: 87 | elixir-version: ${{matrix.elixir}} 88 | otp-version: ${{matrix.otp}} 89 | 90 | - name: Set up Rust 91 | uses: dtolnay/rust-toolchain@1.90.0 92 | 93 | - name: Cache Mix dependencies 94 | uses: actions/cache@v4 95 | with: 96 | path: deps 97 | key: ${{ runner.os }}-mix-deps-${{ hashFiles('**/mix.lock') }} 98 | restore-keys: | 99 | ${{ runner.os }}-mix-deps- 100 | 101 | - name: Cache Cargo 102 | uses: actions/cache@v4 103 | with: 104 | path: | 105 | ~/.cargo/bin/ 106 | ~/.cargo/registry/index/ 107 | ~/.cargo/registry/cache/ 108 | ~/.cargo/git/db/ 109 | native/simhash/target/ 110 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 111 | restore-keys: | 112 | ${{ runner.os }}-cargo- 113 | 114 | - name: Cache PLT 115 | id: plt_cache 116 | uses: actions/cache@v4 117 | with: 118 | path: _build/test/dialyxir*.plt 119 | key: ${{ runner.os }}-plt-${{ matrix.otp }}-${{ matrix.elixir }}-${{ hashFiles('**/mix.lock') }} 120 | restore-keys: | 121 | ${{ runner.os }}-plt-${{ matrix.otp }}-${{ matrix.elixir }}- 122 | 123 | - name: Install dependencies 124 | run: mix deps.get 125 | 126 | - name: Compile 127 | run: mix compile 128 | 129 | - name: Create PLTs 130 | if: steps.plt_cache.outputs.cache-hit != 'true' 131 | run: mix dialyzer --plt 132 | 133 | - name: Run 
Credo 134 | run: mix credo --strict 135 | 136 | - name: Run Dialyzer 137 | run: mix dialyzer --format github 138 | 139 | rust-quality: 140 | name: Rust Quality 141 | runs-on: ubuntu-latest 142 | 143 | steps: 144 | - name: Checkout code 145 | uses: actions/checkout@v4 146 | 147 | - name: Set up Rust 148 | uses: dtolnay/rust-toolchain@1.90.0 149 | with: 150 | components: clippy, rustfmt 151 | 152 | - name: Cache Cargo 153 | uses: actions/cache@v4 154 | with: 155 | path: | 156 | ~/.cargo/bin/ 157 | ~/.cargo/registry/index/ 158 | ~/.cargo/registry/cache/ 159 | ~/.cargo/git/db/ 160 | native/simhash/target/ 161 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 162 | 163 | - name: Run Clippy 164 | run: cd native/simhash && cargo clippy -- -D warnings 165 | 166 | - name: Check formatting 167 | run: cd native/simhash && cargo fmt -- --check 168 | -------------------------------------------------------------------------------- /native/simhash/src/simhash_algo.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Bart Olsthoorn 2 | // Copyright (c) 2017 Jakub Pastuszek 3 | // 4 | // Licensed under the MIT License 5 | // 6 | // Permission is hereby granted, free of charge, to any person obtaining a copy 7 | // of this software and associated documentation files (the "Software"), to deal 8 | // in the Software without restriction, including without limitation the rights 9 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | // copies of the Software, and to permit persons to whom the Software is 11 | // furnished to do so, subject to the following conditions: 12 | // 13 | // The above copyright notice and this permission notice shall be included in all 14 | // copies or substantial portions of the Software. 
15 | // 16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | // SOFTWARE. 23 | 24 | //! Rust Simhash implementation 25 | //! 26 | //! Originally implemented by Bart Olsthoorn on 12/08/2014 27 | //! Ported to Rust 1.16.0 by Jakub Pastuszek on 29/05/2017 28 | //! With the help of http://matpalm.com/resemblance/simhash/ 29 | //! 30 | //! Vendored from https://github.com/bartolsthoorn/simhash-rs into spirit_fingers project 31 | //! Enhanced with performance improvements and idiomatic Rust patterns by @holsee 32 | 33 | use siphasher::sip::SipHasher; 34 | use std::hash::{Hash, Hasher}; 35 | 36 | /// Number of bits in the hash (u64) 37 | const HASH_BITS: usize = 64; 38 | 39 | fn hash_feature(t: &T) -> u64 { 40 | let mut s = SipHasher::default(); 41 | t.hash(&mut s); 42 | s.finish() 43 | } 44 | 45 | /// Calculate `u64` simhash from stream of `&str` words 46 | /// 47 | /// # Returns 48 | /// Returns `0` for empty input streams 49 | pub fn simhash_stream<'w, W>(words: W) -> u64 50 | where 51 | W: Iterator, 52 | { 53 | let mut v = [0i32; HASH_BITS]; 54 | let mut simhash: u64 = 0; 55 | 56 | for feature in words { 57 | let feature_hash: u64 = hash_feature(&feature); 58 | 59 | // Update weights for each bit position 60 | for (i, weight) in v.iter_mut().enumerate() { 61 | let bit = (feature_hash >> i) & 1; 62 | if bit == 1 { 63 | *weight = weight.saturating_add(1); 64 | } else { 65 | *weight = weight.saturating_sub(1); 66 | } 67 | } 68 | } 69 | 70 | // Build final hash from positive weights 71 | for (i, &weight) in v.iter().enumerate() { 
72 | if weight > 0 { 73 | simhash |= 1 << i; 74 | } 75 | } 76 | simhash 77 | } 78 | 79 | /// Calculate `u64` simhash from `&str` split by whitespace 80 | /// 81 | /// # Examples 82 | /// ``` 83 | /// # use spirit_fingers_simhash::simhash_algo::simhash; 84 | /// let hash = simhash("The cat sat on the mat"); 85 | /// assert_eq!(hash, 2595200813813010837); 86 | /// ``` 87 | /// 88 | /// # Returns 89 | /// Returns `0` for empty or whitespace-only strings 90 | pub fn simhash(text: &str) -> u64 { 91 | simhash_stream(text.split_whitespace()) 92 | } 93 | 94 | /// Bitwise hamming distance of two `u64` hashes 95 | pub fn hamming_distance(x: u64, y: u64) -> u32 { 96 | (x ^ y).count_ones() 97 | } 98 | 99 | /// Calculate similarity as `f64` of two hashes 100 | /// 0.0 means no similarity, 1.0 means identical 101 | pub fn hash_similarity(hash1: u64, hash2: u64) -> f64 { 102 | let distance: f64 = hamming_distance(hash1, hash2) as f64; 103 | 1.0 - (distance / HASH_BITS as f64) 104 | } 105 | 106 | /// Calculate similarity of two streams of string slices by simhash 107 | pub fn similarity_streams<'w1, 'w2, W1, W2>(words1: W1, words2: W2) -> f64 108 | where 109 | W1: Iterator, 110 | W2: Iterator, 111 | { 112 | hash_similarity(simhash_stream(words1), simhash_stream(words2)) 113 | } 114 | 115 | /// Calculate similarity of two string slices split by whitespace by simhash 116 | pub fn similarity(text1: &str, text2: &str) -> f64 { 117 | similarity_streams(text1.split_whitespace(), text2.split_whitespace()) 118 | } 119 | 120 | #[cfg(test)] 121 | mod tests { 122 | use super::*; 123 | 124 | #[test] 125 | fn simhash_test() { 126 | assert_eq!(simhash("The cat sat on the mat"), 2595200813813010837); 127 | assert_eq!(simhash("The cat sat under the mat"), 2595269945604666783); 128 | assert_eq!(simhash("Why the lucky stiff"), 1155526875459215761); 129 | } 130 | 131 | #[test] 132 | fn hamming_distance_test() { 133 | assert_eq!(hamming_distance(0b0000000u64, 0b0000000u64), 0); 134 | 
assert_eq!(hamming_distance(0b1111111u64, 0b0000000u64), 7); 135 | assert_eq!(hamming_distance(0b0100101u64, 0b1100110u64), 3); 136 | } 137 | 138 | #[test] 139 | fn hash_similarity_test() { 140 | assert_eq!(hash_similarity(0u64, 0u64), 1.0); 141 | assert_eq!(hash_similarity(!0u64, 0u64), 0.0); 142 | assert_eq!(hash_similarity(!0u32 as u64, 0u64), 0.5); 143 | } 144 | 145 | #[test] 146 | fn similarity_test() { 147 | assert_eq!(similarity("Stop hammertime", "Stop hammertime"), 1.0); 148 | assert!(similarity("Hocus pocus", "Hocus pocus pilatus pas") > 0.7); 149 | assert!(similarity("Peanut butter", "Strawberry cocktail") < 0.6); 150 | } 151 | } 152 | --------------------------------------------------------------------------------