├── config
└── config.exs
├── native
└── simhash
│ ├── .gitignore
│ ├── README.md
│ ├── .cargo
│ └── config.toml
│ ├── Cargo.toml
│ ├── src
│ ├── lib.rs
│ └── simhash_algo.rs
│ └── Cargo.lock
├── test
├── test_helper.exs
└── simhash_test.exs
├── .tool-versions
├── logo.png
├── .DS_Store
├── lib
├── spirit_fingers.ex
├── mix
│ └── tasks
│ │ └── test.rust.ex
└── simhash.ex
├── logo
├── Spirit Fingers - Logo.ai
├── Spirit Fingers - Logo.eps
├── Spirit Fingers - Logo.jpg
├── Spirit Fingers - Logo.png
└── Spirit Fingers - Logo-01.eps
├── .formatter.exs
├── .dialyzer_ignore.exs
├── .gitignore
├── LICENSE
├── README.md
├── mix.exs
├── mix.lock
└── .github
└── workflows
└── ci.yml
/config/config.exs:
--------------------------------------------------------------------------------
1 | import Config
2 |
--------------------------------------------------------------------------------
/native/simhash/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/test/test_helper.exs:
--------------------------------------------------------------------------------
1 | ExUnit.start()
2 |
--------------------------------------------------------------------------------
/.tool-versions:
--------------------------------------------------------------------------------
1 | rust 1.90.0
2 | elixir 1.19.1-otp-28
3 | erlang 28.1.1
4 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo.png
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/.DS_Store
--------------------------------------------------------------------------------
/lib/spirit_fingers.ex:
--------------------------------------------------------------------------------
1 | defmodule SpiritFingers do
2 | @moduledoc "See: `SpiritFingers.SimHash`."
3 | end
4 |
--------------------------------------------------------------------------------
/logo/Spirit Fingers - Logo.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.ai
--------------------------------------------------------------------------------
/logo/Spirit Fingers - Logo.eps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.eps
--------------------------------------------------------------------------------
/logo/Spirit Fingers - Logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.jpg
--------------------------------------------------------------------------------
/logo/Spirit Fingers - Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo.png
--------------------------------------------------------------------------------
/logo/Spirit Fingers - Logo-01.eps:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/holsee/spirit_fingers/HEAD/logo/Spirit Fingers - Logo-01.eps
--------------------------------------------------------------------------------
/.formatter.exs:
--------------------------------------------------------------------------------
1 | # Used by "mix format"
2 | [
3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
4 | ]
5 |
--------------------------------------------------------------------------------
/native/simhash/README.md:
--------------------------------------------------------------------------------
1 | # NIF for Elixir.SpiritFingers.SimHash
2 |
3 | Rust NIF wrapping: https://github.com/holsee/simhash-rs
4 |
--------------------------------------------------------------------------------
/test/simhash_test.exs:
--------------------------------------------------------------------------------
1 | defmodule SpiritFingers.SimHashTest do
2 | use ExUnit.Case
3 | doctest SpiritFingers.SimHash
4 | end
5 |
--------------------------------------------------------------------------------
/.dialyzer_ignore.exs:
--------------------------------------------------------------------------------
1 | [
2 | # Mix tasks are build-time only and don't need runtime type checking
3 | ~r/lib\/mix\/tasks\//
4 | ]
5 |
--------------------------------------------------------------------------------
/native/simhash/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [target.'cfg(target_os = "macos")']
2 | rustflags = [
3 | "-C", "link-arg=-undefined",
4 | "-C", "link-arg=dynamic_lookup",
5 | ]
6 |
--------------------------------------------------------------------------------
/native/simhash/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "spirit_fingers_simhash"
3 | version = "0.4.1"
4 | authors = []
5 | edition = "2021"
6 |
7 | [lib]
8 | name = "spirit_fingers_simhash"
9 | path = "src/lib.rs"
10 | crate-type = ["cdylib"]
11 |
12 | [dependencies]
13 | rustler = { version = "0.37", default-features = true }
14 | siphasher = "0.3"
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /native/simhash/target/
2 |
3 | # The directory Mix will write compiled artifacts to.
4 | /_build/
5 |
6 | # If you run "mix test --cover", coverage assets end up here.
7 | /cover/
8 |
9 | # The directory Mix downloads your dependencies sources to.
10 | /deps/
11 |
12 | # Where 3rd-party dependencies like ExDoc output generated docs.
13 | /doc/
14 |
15 | # Ignore .fetch files in case you like to edit your project deps locally.
16 | /.fetch
17 |
18 | # If the VM crashes, it generates a dump, let's ignore it too.
19 | erl_crash.dump
20 |
21 | # Also ignore archive artifacts (built via "mix archive.build").
22 | *.ez
23 |
24 | # Ignore package tarball (built via "mix hex.build").
25 | spirit_fingers-*.tar
26 |
27 | # Ignore rust artifacts
28 | **/*.so
29 |
--------------------------------------------------------------------------------
/lib/mix/tasks/test.rust.ex:
--------------------------------------------------------------------------------
1 | defmodule Mix.Tasks.Test.Rust do
2 |   @moduledoc """
3 |   Runs Rust unit tests for the native simhash crate.
4 |
5 |   ## Usage
6 |
7 |       mix test.rust
8 |
9 |   This task runs `cargo test` in the native/simhash directory (resolved
10 |   relative to the current working directory) and raises a Mix error when
11 |   the crate directory is missing, `cargo` is not on the `PATH`, or the
12 |   Rust tests fail.
13 |   """
14 |   @shortdoc "Run Rust unit tests"
15 |
16 |   use Mix.Task
17 |
18 |   @impl Mix.Task
19 |   def run(_args) do
20 |     crate_path = Path.join([File.cwd!(), "native", "simhash"])
21 |
22 |     cond do
23 |       not File.dir?(crate_path) ->
24 |         Mix.raise("Rust crate directory not found at #{crate_path}")
25 |
26 |       # System.cmd/3 raises a bare ErlangError (:enoent) when the executable
27 |       # cannot be found, so check first and fail with an actionable message.
28 |       is_nil(System.find_executable("cargo")) ->
29 |         Mix.raise("cargo executable not found on PATH; install Rust to run the Rust tests")
30 |
31 |       true ->
32 |         run_cargo_test(crate_path)
33 |     end
34 |   end
35 |
36 |   # Streams `cargo test` output to stdio and turns a non-zero exit status
37 |   # into a Mix error; returns :ok when the tests pass.
38 |   defp run_cargo_test(crate_path) do
39 |     Mix.shell().info("Running Rust tests in #{crate_path}...")
40 |
41 |     case System.cmd("cargo", ["test"], cd: crate_path, into: IO.stream(:stdio, :line)) do
42 |       {_, 0} ->
43 |         Mix.shell().info("\nRust tests passed! ✓")
44 |         :ok
45 |
46 |       {_, exit_code} ->
47 |         Mix.shell().error("\nRust tests failed with exit code #{exit_code}")
48 |         Mix.raise("Rust tests failed")
49 |     end
50 |   end
51 | end
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright © 2018 Steven Holdsworth
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SpiritFingers
2 |
3 | [](https://hex.pm/packages/spirit_fingers)
4 | [](https://hex.pm/packages/spirit_fingers)
5 | [](https://hexdocs.pm/spirit_fingers)
6 | [](https://github.com/holsee/spirit_fingers/actions/workflows/ci.yml)
7 | [](https://github.com/holsee/spirit_fingers/blob/master/LICENSE)
8 |
9 | "Fast SimHash NIFs written in Rust 🐇💨 as Erlang/Elixir versions were too slow 🐢"
10 |
11 |
12 |
13 |
14 |
15 | * [Hex Package](https://hex.pm/packages/spirit_fingers).
16 | * [Documentation](https://hexdocs.pm/spirit_fingers).
17 | * [Simhash Benchmarks](https://github.com/holsee/simhash_benchmarks) TL;DR 400-900x faster, orders of magnitude more memory efficient and handles large binaries where others cannot.
18 |
19 | ## Build
20 |
21 | ```
22 | mix compile
23 | ```
24 |
25 | ## Test
26 |
27 | ```
28 | mix test
29 | ```
30 |
31 | ## Versions
32 |
33 | * Elixir ~> 1.15 (CI runs 1.19.1)
34 | * OTP 28
35 | * Rust 2021 edition (developed with 1.90.0)
36 | * Rustler 0.37
37 |
38 | ## Installation
39 |
40 | Add `spirit_fingers` to your list of dependencies in `mix.exs`:
41 |
42 | ```elixir
43 | def deps do
44 | [
45 | {:spirit_fingers, "~> 0.5"}
46 | ]
47 | end
48 | ```
49 |
50 | ## Usage
51 |
52 | Generate SimHash:
53 | ``` elixir
54 | SpiritFingers.SimHash.similarity_hash("wow")
55 | {:ok, 17399923637769257768}
56 | ```
57 |
58 | Similarity between strings:
59 | ``` elixir
60 | SpiritFingers.SimHash.similarity("Hocus pocus", "Hocus pocus pilatus pas")
61 | {:ok, 0.9375}
62 | ```
63 |
64 | Hamming Distance between hashes:
65 | ``` elixir
66 | SpiritFingers.SimHash.hamming_distance(17399923637769257768, 17399923637769257768)
67 | {:ok, 0}
68 | ```
69 |
70 | Similarity between hashes:
71 | ``` elixir
72 | SpiritFingers.SimHash.hash_similarity(17399923637769257768, 17399923637769257768)
73 | {:ok, 1.0}
74 | ```
75 |
76 |
--------------------------------------------------------------------------------
/mix.exs:
--------------------------------------------------------------------------------
1 | defmodule SpiritFingers.MixProject do
2 |   use Mix.Project
3 |
4 |   def project do
5 |     [
6 |       app: :spirit_fingers,
7 |       version: "0.5.1",
8 |       elixir: "~> 1.15",
9 |       start_permanent: Mix.env() == :prod,
10 |       deps: deps(),
11 |       aliases: aliases(),
12 |       dialyzer: dialyzer(),
13 |       name: "SpiritFingers",
14 |       source_url: "https://github.com/holsee/spirit_fingers",
15 |       homepage_url: "https://hex.pm/packages/spirit_fingers",
16 |       docs: [main: "SpiritFingers", logo: "logo.png", extras: ["README.md"]],
17 |       package: package(),
18 |       description: description()
19 |     ]
20 |   end
21 |
22 |   # Run "mix help compile.app" to learn about applications.
23 |   def application do
24 |     [
25 |       extra_applications: [:logger]
26 |     ]
27 |   end
28 |
29 |   # Run "mix help deps" to learn about dependencies.
30 |   defp deps do
31 |     [
32 |       {:rustler, "~> 0.37.1"},
33 |       {:ex_doc, "~> 0.34", only: :dev, runtime: false},
34 |       {:dialyxir, "~> 1.4", only: [:dev, :test], runtime: false},
35 |       {:credo, "~> 1.7", only: [:dev, :test], runtime: false}
36 |     ]
37 |   end
38 |
39 |   # Hex package metadata. `files` lists what is shipped in the package,
40 |   # including the Rust crate sources and Cargo manifests under native/simhash.
41 |   defp package do
42 |     [
43 |       name: "spirit_fingers",
44 |       files: ~w(
45 |         config
46 |         lib
47 |         native/simhash/.cargo
48 |         native/simhash/src
49 |         native/simhash/Cargo.*
50 |         native/simhash/README.md
51 |         mix.exs
52 |         README.md
53 |         LICENSE
54 |         logo.png
55 |       ),
56 |       links: %{
57 |         "GitHub" => "https://github.com/holsee/spirit_fingers"
58 |       },
59 |       maintainers: ["Steven Holdsworth (@holsee)"],
60 |       licenses: ["MIT"]
61 |     ]
62 |   end
63 |
64 |   # One-line project description used for the `:description` project key.
65 |   defp description do
66 |     "Fast SimHash NIFs written in Rust 🐇💨 as Erlang/Elixir versions were too slow 🐢"
67 |   end
68 |
69 |   # Dialyxir options: add :mix to the PLT (the project defines a Mix task)
70 |   # and read ignored warnings from .dialyzer_ignore.exs.
71 |   defp dialyzer do
72 |     [
73 |       plt_add_apps: [:mix],
74 |       ignore_warnings: ".dialyzer_ignore.exs"
75 |     ]
76 |   end
77 |
78 |   # `mix test.all` runs the Elixir tests followed by the Rust tests.
79 |   defp aliases do
80 |     [
81 |       "test.all": ["test", "test.rust"]
82 |     ]
83 |   end
84 |
85 |   # Run the custom test tasks in the :test environment.
86 |   def cli do
87 |     [
88 |       preferred_envs: [
89 |         "test.all": :test,
90 |         "test.rust": :test
91 |       ]
92 |     ]
93 |   end
94 | end
95 |
--------------------------------------------------------------------------------
/native/simhash/src/lib.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2023 Steven Holdsworth (@holsee)
2 | //
3 | // Licensed under the MIT License
4 | //
5 | // Permission is hereby granted, free of charge, to any person obtaining a copy
6 | // of this software and associated documentation files (the "Software"), to deal
7 | // in the Software without restriction, including without limitation the rights
8 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | // copies of the Software, and to permit persons to whom the Software is
10 | // furnished to do so, subject to the following conditions:
11 | //
12 | // The above copyright notice and this permission notice shall be included in all
13 | // copies or substantial portions of the Software.
14 | //
15 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | // SOFTWARE.
22 |
23 | use rustler::{Atom, Error};
24 |
25 | mod simhash_algo;
26 |
27 | mod atoms {
28 | rustler::atoms! {
29 | ok,
30 | error
31 | }
32 | }
33 |
34 | /// NIF: computes the 64-bit simhash of `text`, returned as `(ok, hash)`.
35 | #[rustler::nif]
36 | fn similarity_hash(text: &str) -> Result<(Atom, u64), Error> {
37 |     Ok((atoms::ok(), simhash_algo::simhash(text)))
38 | }
39 |
40 | /// NIF: Hamming distance between two hashes, returned as `(ok, distance)`.
41 | #[rustler::nif]
42 | fn hamming_distance(hash0: u64, hash1: u64) -> Result<(Atom, u32), Error> {
43 |     Ok((atoms::ok(), simhash_algo::hamming_distance(hash0, hash1)))
44 | }
45 |
46 | /// NIF: similarity of two hashes as an `f64`, returned as `(ok, similarity)`.
47 | #[rustler::nif]
48 | fn hash_similarity(hash0: u64, hash1: u64) -> Result<(Atom, f64), Error> {
49 |     Ok((atoms::ok(), simhash_algo::hash_similarity(hash0, hash1)))
50 | }
51 |
52 | /// NIF: simhash-based similarity of two texts, returned as `(ok, similarity)`.
53 | #[rustler::nif]
54 | fn similarity(text0: &str, text1: &str) -> Result<(Atom, f64), Error> {
55 |     Ok((atoms::ok(), simhash_algo::similarity(text0, text1)))
56 | }
57 |
58 | rustler::init!("Elixir.SpiritFingers.SimHash");
59 |
--------------------------------------------------------------------------------
/lib/simhash.ex:
--------------------------------------------------------------------------------
1 | defmodule SpiritFingers.SimHash do
2 |   @moduledoc """
3 |   SimHash Module which delegates to Rust NIFs which will
4 |   perform the hashing, similarity and distance calculations.
5 |   """
6 |   use Rustler,
7 |     otp_app: :spirit_fingers,
8 |     crate: "spirit_fingers_simhash",
9 |     path: "native/simhash",
10 |     mode: :release
11 |
12 |   @typedoc "unsigned 64 bit integer representation of simhash (`0` is a valid hash)"
13 |   @type t :: non_neg_integer()
14 |
15 |   @typedoc """
16 |   Similarity between two `SimHash.t`, represented as a value
17 |   between 0.0 and 1.0.
18 |   * `0.0` means no similarity,
19 |   * `1.0` means identical.
20 |   """
21 |   @type similarity :: float()
22 |
23 |   @typedoc """
24 |   Non-negative integer representation of the
25 |   [Hamming Distance](https://en.wikipedia.org/wiki/Hamming_distance)
26 |   (number of differing bits) between 2 `SimHash.t`.
27 |   """
28 |   @type distance :: non_neg_integer()
29 |
30 |   @doc """
31 |   Calculate the `SimHash.t` of a binary, split by whitespace.
32 |
33 |   ## Examples
34 |
35 |       iex> SpiritFingers.SimHash.similarity_hash("The cat sat on the mat")
36 |       {:ok, 2595200813813010837}
37 |
38 |       iex> SpiritFingers.SimHash.similarity_hash("The cat sat under the mat")
39 |       {:ok, 2595269945604666783}
40 |
41 |       iex> SpiritFingers.SimHash.similarity_hash("Why the lucky stiff")
42 |       {:ok, 1155526875459215761}
43 |   """
44 |   @spec similarity_hash(binary()) :: {:ok, t()}
45 |   def similarity_hash(_bin), do: :erlang.nif_error(:nif_not_loaded)
46 |
47 |   @doc """
48 |   Bitwise hamming distance of two `SimHash.t` hashes, as an integer bit count.
49 |
50 |   ## Examples
51 |
52 |       iex> SpiritFingers.SimHash.hamming_distance(0, 0)
53 |       {:ok, 0}
54 |
55 |       iex> SpiritFingers.SimHash.hamming_distance(0b1111111, 0b0000000)
56 |       {:ok, 7}
57 |
58 |       iex> SpiritFingers.SimHash.hamming_distance(0b0100101, 0b1100110)
59 |       {:ok, 3}
60 |   """
61 |   @spec hamming_distance(t(), t()) :: {:ok, distance()}
62 |   def hamming_distance(_hash0, _hash1), do: :erlang.nif_error(:nif_not_loaded)
63 |
64 |   @doc """
65 |   Calculate similarity as `SimHash.similarity` of two hashes.
66 |   `0.0` means no similarity, `1.0` means identical.
67 |
68 |   ## Examples
69 |
70 |       iex> SpiritFingers.SimHash.hash_similarity(0, 0)
71 |       {:ok, 1.0}
72 |
73 |       iex> SpiritFingers.SimHash.hash_similarity(0xFFFFFFFFFFFFFFFF, 0)
74 |       {:ok, 0.0}
75 |
76 |       iex> SpiritFingers.SimHash.hash_similarity(0xFFFFFFFF, 0)
77 |       {:ok, 0.5}
78 |   """
79 |   @spec hash_similarity(t(), t()) :: {:ok, similarity()}
80 |   def hash_similarity(_hash0, _hash1), do: :erlang.nif_error(:nif_not_loaded)
81 |
82 |   @doc """
83 |   Calculate the `SimHash.similarity` of two strings, each split by whitespace and simhashed.
84 |
85 |   ## Examples
86 |
87 |       iex> SpiritFingers.SimHash.similarity("Stop hammertime", "Stop hammertime")
88 |       {:ok, 1.0}
89 |
90 |       iex> SpiritFingers.SimHash.similarity("Hocus pocus", "Hocus pocus pilatus pas")
91 |       {:ok, 0.9375}
92 |
93 |       iex> SpiritFingers.SimHash.similarity("Peanut butter", "Strawberry cocktail")
94 |       {:ok, 0.59375}
95 |   """
96 |   @spec similarity(binary(), binary()) :: {:ok, similarity()}
97 |   def similarity(_text0, _text1), do: :erlang.nif_error(:nif_not_loaded)
98 | end
99 |
--------------------------------------------------------------------------------
/mix.lock:
--------------------------------------------------------------------------------
1 | %{
2 | "bunt": {:hex, :bunt, "1.0.0", "081c2c665f086849e6d57900292b3a161727ab40431219529f13c4ddcf3e7a44", [:mix], [], "hexpm", "dc5f86aa08a5f6fa6b8096f0735c4e76d54ae5c9fa2c143e5a1fc7c1cd9bb6b5"},
3 | "credo": {:hex, :credo, "1.7.13", "126a0697df6b7b71cd18c81bc92335297839a806b6f62b61d417500d1070ff4e", [:mix], [{:bunt, "~> 0.2.1 or ~> 1.0", [hex: :bunt, repo: "hexpm", optional: false]}, {:file_system, "~> 0.2 or ~> 1.0", [hex: :file_system, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "47641e6d2bbff1e241e87695b29f617f1a8f912adea34296fb10ecc3d7e9e84f"},
4 | "dialyxir": {:hex, :dialyxir, "1.4.6", "7cca478334bf8307e968664343cbdb432ee95b4b68a9cba95bdabb0ad5bdfd9a", [:mix], [{:erlex, ">= 0.2.7", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "8cf5615c5cd4c2da6c501faae642839c8405b49f8aa057ad4ae401cb808ef64d"},
5 | "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"},
6 | "erlex": {:hex, :erlex, "0.2.7", "810e8725f96ab74d17aac676e748627a07bc87eb950d2b83acd29dc047a30595", [:mix], [], "hexpm", "3ed95f79d1a844c3f6bf0cea61e0d5612a42ce56da9c03f01df538685365efb0"},
7 | "ex_doc": {:hex, :ex_doc, "0.38.4", "ab48dff7a8af84226bf23baddcdda329f467255d924380a0cf0cee97bb9a9ede", [:mix], [{:earmark_parser, "~> 1.4.44", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: :makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "f7b62346408a83911c2580154e35613eb314e0278aeea72ed7fedef9c1f165b2"},
8 | "file_system": {:hex, :file_system, "1.1.1", "31864f4685b0148f25bd3fbef2b1228457c0c89024ad67f7a81a3ffbc0bbad3a", [:mix], [], "hexpm", "7a15ff97dfe526aeefb090a7a9d3d03aa907e100e262a0f8f7746b78f8f87a5d"},
9 | "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
10 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"},
11 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"},
12 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"},
13 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"},
14 | "rustler": {:hex, :rustler, "0.37.1", "721434020c7f6f8e1cdc57f44f75c490435b01de96384f8ccb96043f12e8a7e0", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm", "24547e9b8640cf00e6a2071acb710f3e12ce0346692e45098d84d45cdb54fd79"},
15 | }
16 |
--------------------------------------------------------------------------------
/native/simhash/Cargo.lock:
--------------------------------------------------------------------------------
1 | # This file is automatically @generated by Cargo.
2 | # It is not intended for manual editing.
3 | version = 4
4 |
5 | [[package]]
6 | name = "cfg-if"
7 | version = "1.0.4"
8 | source = "registry+https://github.com/rust-lang/crates.io-index"
9 | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
10 |
11 | [[package]]
12 | name = "heck"
13 | version = "0.5.0"
14 | source = "registry+https://github.com/rust-lang/crates.io-index"
15 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
16 |
17 | [[package]]
18 | name = "inventory"
19 | version = "0.3.21"
20 | source = "registry+https://github.com/rust-lang/crates.io-index"
21 | checksum = "bc61209c082fbeb19919bee74b176221b27223e27b65d781eb91af24eb1fb46e"
22 | dependencies = [
23 | "rustversion",
24 | ]
25 |
26 | [[package]]
27 | name = "libloading"
28 | version = "0.8.9"
29 | source = "registry+https://github.com/rust-lang/crates.io-index"
30 | checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55"
31 | dependencies = [
32 | "cfg-if",
33 | "windows-link",
34 | ]
35 |
36 | [[package]]
37 | name = "proc-macro2"
38 | version = "1.0.101"
39 | source = "registry+https://github.com/rust-lang/crates.io-index"
40 | checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de"
41 | dependencies = [
42 | "unicode-ident",
43 | ]
44 |
45 | [[package]]
46 | name = "quote"
47 | version = "1.0.41"
48 | source = "registry+https://github.com/rust-lang/crates.io-index"
49 | checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
50 | dependencies = [
51 | "proc-macro2",
52 | ]
53 |
54 | [[package]]
55 | name = "regex-lite"
56 | version = "0.1.8"
57 | source = "registry+https://github.com/rust-lang/crates.io-index"
58 | checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da"
59 |
60 | [[package]]
61 | name = "rustler"
62 | version = "0.37.0"
63 | source = "registry+https://github.com/rust-lang/crates.io-index"
64 | checksum = "fb867bb35b291ef105abbe0a0d04bd4d7af372e023d08845698687bc254f222b"
65 | dependencies = [
66 | "inventory",
67 | "libloading",
68 | "regex-lite",
69 | "rustler_codegen",
70 | ]
71 |
72 | [[package]]
73 | name = "rustler_codegen"
74 | version = "0.37.0"
75 | source = "registry+https://github.com/rust-lang/crates.io-index"
76 | checksum = "90993223c5ac0fb580ff966fb9477289c4e8a610a2f4639912a2639c5e7b5095"
77 | dependencies = [
78 | "heck",
79 | "inventory",
80 | "proc-macro2",
81 | "quote",
82 | "syn",
83 | ]
84 |
85 | [[package]]
86 | name = "rustversion"
87 | version = "1.0.22"
88 | source = "registry+https://github.com/rust-lang/crates.io-index"
89 | checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
90 |
91 | [[package]]
92 | name = "siphasher"
93 | version = "0.3.11"
94 | source = "registry+https://github.com/rust-lang/crates.io-index"
95 | checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
96 |
97 | [[package]]
98 | name = "spirit_fingers_simhash"
99 | version = "0.4.1"
100 | dependencies = [
101 | "rustler",
102 | "siphasher",
103 | ]
104 |
105 | [[package]]
106 | name = "syn"
107 | version = "2.0.107"
108 | source = "registry+https://github.com/rust-lang/crates.io-index"
109 | checksum = "2a26dbd934e5451d21ef060c018dae56fc073894c5a7896f882928a76e6d081b"
110 | dependencies = [
111 | "proc-macro2",
112 | "quote",
113 | "unicode-ident",
114 | ]
115 |
116 | [[package]]
117 | name = "unicode-ident"
118 | version = "1.0.20"
119 | source = "registry+https://github.com/rust-lang/crates.io-index"
120 | checksum = "462eeb75aeb73aea900253ce739c8e18a67423fadf006037cd3ff27e82748a06"
121 |
122 | [[package]]
123 | name = "windows-link"
124 | version = "0.2.1"
125 | source = "registry+https://github.com/rust-lang/crates.io-index"
126 | checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
127 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches: [main, master]
6 | pull_request:
7 | branches: [main, master]
8 |
9 | env:
10 | MIX_ENV: test
11 |
12 | jobs:
13 | test:
14 | name: Test (Elixir ${{matrix.elixir}} | OTP ${{matrix.otp}})
15 | runs-on: ubuntu-latest
16 |
17 | strategy:
18 | matrix:
19 | elixir: ['1.19.1']
20 | otp: ['28.1.1']
21 |
22 | steps:
23 | - name: Checkout code
24 | uses: actions/checkout@v4
25 |
26 | - name: Set up Elixir
27 | uses: erlef/setup-beam@v1
28 | with:
29 | elixir-version: ${{matrix.elixir}}
30 | otp-version: ${{matrix.otp}}
31 |
32 | - name: Set up Rust
33 | uses: dtolnay/rust-toolchain@1.90.0
34 |
35 | - name: Cache Mix dependencies
36 | uses: actions/cache@v4
37 | with:
38 | path: deps
39 | key: ${{ runner.os }}-mix-deps-${{ hashFiles('**/mix.lock') }}
40 | restore-keys: |
41 | ${{ runner.os }}-mix-deps-
42 |
43 | - name: Cache Cargo
44 | uses: actions/cache@v4
45 | with:
46 | path: |
47 | ~/.cargo/bin/
48 | ~/.cargo/registry/index/
49 | ~/.cargo/registry/cache/
50 | ~/.cargo/git/db/
51 | native/simhash/target/
52 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
53 | restore-keys: |
54 | ${{ runner.os }}-cargo-
55 |
56 | - name: Install dependencies
57 | run: mix deps.get
58 |
59 | - name: Compile dependencies
60 | run: mix deps.compile
61 |
62 | - name: Compile project (including Rust NIF)
63 | run: mix compile --warnings-as-errors
64 |
65 | - name: Run Elixir tests
66 | run: mix test
67 |
68 | - name: Run Rust tests
69 | run: mix test.rust
70 |
71 | quality:
72 | name: Code Quality
73 | runs-on: ubuntu-latest
74 |
75 | strategy:
76 | matrix:
77 | elixir: ['1.19.1']
78 | otp: ['28.1.1']
79 |
80 | steps:
81 | - name: Checkout code
82 | uses: actions/checkout@v4
83 |
84 | - name: Set up Elixir
85 | uses: erlef/setup-beam@v1
86 | with:
87 | elixir-version: ${{matrix.elixir}}
88 | otp-version: ${{matrix.otp}}
89 |
90 | - name: Set up Rust
91 | uses: dtolnay/rust-toolchain@1.90.0
92 |
93 | - name: Cache Mix dependencies
94 | uses: actions/cache@v4
95 | with:
96 | path: deps
97 | key: ${{ runner.os }}-mix-deps-${{ hashFiles('**/mix.lock') }}
98 | restore-keys: |
99 | ${{ runner.os }}-mix-deps-
100 |
101 | - name: Cache Cargo
102 | uses: actions/cache@v4
103 | with:
104 | path: |
105 | ~/.cargo/bin/
106 | ~/.cargo/registry/index/
107 | ~/.cargo/registry/cache/
108 | ~/.cargo/git/db/
109 | native/simhash/target/
110 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
111 | restore-keys: |
112 | ${{ runner.os }}-cargo-
113 |
114 | - name: Cache PLT
115 | id: plt_cache
116 | uses: actions/cache@v4
117 | with:
118 | path: _build/test/dialyxir*.plt
119 | key: ${{ runner.os }}-plt-${{ matrix.otp }}-${{ matrix.elixir }}-${{ hashFiles('**/mix.lock') }}
120 | restore-keys: |
121 | ${{ runner.os }}-plt-${{ matrix.otp }}-${{ matrix.elixir }}-
122 |
123 | - name: Install dependencies
124 | run: mix deps.get
125 |
126 | - name: Compile
127 | run: mix compile
128 |
129 | - name: Create PLTs
130 | if: steps.plt_cache.outputs.cache-hit != 'true'
131 | run: mix dialyzer --plt
132 |
133 | - name: Run Credo
134 | run: mix credo --strict
135 |
136 | - name: Run Dialyzer
137 | run: mix dialyzer --format github
138 |
139 | rust-quality:
140 | name: Rust Quality
141 | runs-on: ubuntu-latest
142 |
143 | steps:
144 | - name: Checkout code
145 | uses: actions/checkout@v4
146 |
147 | - name: Set up Rust
148 | uses: dtolnay/rust-toolchain@1.90.0
149 | with:
150 | components: clippy, rustfmt
151 |
152 | - name: Cache Cargo
153 | uses: actions/cache@v4
154 | with:
155 | path: |
156 | ~/.cargo/bin/
157 | ~/.cargo/registry/index/
158 | ~/.cargo/registry/cache/
159 | ~/.cargo/git/db/
160 | native/simhash/target/
161 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
162 |
163 | - name: Run Clippy
164 | run: cd native/simhash && cargo clippy -- -D warnings
165 |
166 | - name: Check formatting
167 | run: cd native/simhash && cargo fmt -- --check
168 |
--------------------------------------------------------------------------------
/native/simhash/src/simhash_algo.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2014 Bart Olsthoorn
2 | // Copyright (c) 2017 Jakub Pastuszek
3 | //
4 | // Licensed under the MIT License
5 | //
6 | // Permission is hereby granted, free of charge, to any person obtaining a copy
7 | // of this software and associated documentation files (the "Software"), to deal
8 | // in the Software without restriction, including without limitation the rights
9 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | // copies of the Software, and to permit persons to whom the Software is
11 | // furnished to do so, subject to the following conditions:
12 | //
13 | // The above copyright notice and this permission notice shall be included in all
14 | // copies or substantial portions of the Software.
15 | //
16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | // SOFTWARE.
23 |
24 | //! Rust Simhash implementation
25 | //!
26 | //! Originally implemented by Bart Olsthoorn on 12/08/2014
27 | //! Ported to Rust 1.16.0 by Jakub Pastuszek on 29/05/2017
28 | //! With the help of http://matpalm.com/resemblance/simhash/
29 | //!
30 | //! Vendored from https://github.com/bartolsthoorn/simhash-rs into spirit_fingers project
31 | //! Enhanced with performance improvements and idiomatic Rust patterns by @holsee
32 |
33 | use siphasher::sip::SipHasher;
34 | use std::hash::{Hash, Hasher};
35 |
/// Width of the simhash fingerprint in bits, taken from the `u64` that stores it
const HASH_BITS: usize = u64::BITS as usize;
38 |
/// Hash a single feature down to a 64-bit value.
///
/// A fresh, default-constructed `SipHasher` is used for every call, so no state
/// carries over between features. The known-answer tests in this module rely on
/// the same input always producing the same output.
///
/// `T` may be unsized, so both `&str` and `&&str` are accepted.
fn hash_feature<T: Hash + ?Sized>(t: &T) -> u64 {
    let mut hasher = SipHasher::default();
    t.hash(&mut hasher);
    hasher.finish()
}
44 |
45 | /// Calculate `u64` simhash from stream of `&str` words
46 | ///
47 | /// # Returns
48 | /// Returns `0` for empty input streams
49 | pub fn simhash_stream<'w, W>(words: W) -> u64
50 | where
51 | W: Iterator- ,
52 | {
53 | let mut v = [0i32; HASH_BITS];
54 | let mut simhash: u64 = 0;
55 |
56 | for feature in words {
57 | let feature_hash: u64 = hash_feature(&feature);
58 |
59 | // Update weights for each bit position
60 | for (i, weight) in v.iter_mut().enumerate() {
61 | let bit = (feature_hash >> i) & 1;
62 | if bit == 1 {
63 | *weight = weight.saturating_add(1);
64 | } else {
65 | *weight = weight.saturating_sub(1);
66 | }
67 | }
68 | }
69 |
70 | // Build final hash from positive weights
71 | for (i, &weight) in v.iter().enumerate() {
72 | if weight > 0 {
73 | simhash |= 1 << i;
74 | }
75 | }
76 | simhash
77 | }
78 |
79 | /// Calculate `u64` simhash from `&str` split by whitespace
80 | ///
81 | /// # Examples
82 | /// ```
83 | /// # use spirit_fingers_simhash::simhash_algo::simhash;
84 | /// let hash = simhash("The cat sat on the mat");
85 | /// assert_eq!(hash, 2595200813813010837);
86 | /// ```
87 | ///
88 | /// # Returns
89 | /// Returns `0` for empty or whitespace-only strings
90 | pub fn simhash(text: &str) -> u64 {
91 | simhash_stream(text.split_whitespace())
92 | }
93 |
/// Number of bit positions in which two `u64` hashes differ
pub fn hamming_distance(x: u64, y: u64) -> u32 {
    // XOR leaves a 1 exactly where the two inputs disagree
    let differing_bits = x ^ y;
    differing_bits.count_ones()
}
98 |
99 | /// Calculate similarity as `f64` of two hashes
100 | /// 0.0 means no similarity, 1.0 means identical
101 | pub fn hash_similarity(hash1: u64, hash2: u64) -> f64 {
102 | let distance: f64 = hamming_distance(hash1, hash2) as f64;
103 | 1.0 - (distance / HASH_BITS as f64)
104 | }
105 |
106 | /// Calculate similarity of two streams of string slices by simhash
107 | pub fn similarity_streams<'w1, 'w2, W1, W2>(words1: W1, words2: W2) -> f64
108 | where
109 | W1: Iterator
- ,
110 | W2: Iterator
- ,
111 | {
112 | hash_similarity(simhash_stream(words1), simhash_stream(words2))
113 | }
114 |
115 | /// Calculate similarity of two string slices split by whitespace by simhash
116 | pub fn similarity(text1: &str, text2: &str) -> f64 {
117 | similarity_streams(text1.split_whitespace(), text2.split_whitespace())
118 | }
119 |
#[cfg(test)]
mod tests {
    use super::*;

    /// Known-answer vectors: the hash of a fixed sentence must not drift.
    #[test]
    fn simhash_test() {
        let vectors: [(&str, u64); 3] = [
            ("The cat sat on the mat", 2595200813813010837),
            ("The cat sat under the mat", 2595269945604666783),
            ("Why the lucky stiff", 1155526875459215761),
        ];
        for (text, expected) in vectors.iter() {
            assert_eq!(simhash(text), *expected);
        }
    }

    /// Distance equals the number of differing bit positions.
    #[test]
    fn hamming_distance_test() {
        let cases: [(u64, u64, u32); 3] = [
            (0b0000000, 0b0000000, 0),
            (0b1111111, 0b0000000, 7),
            (0b0100101, 0b1100110, 3),
        ];
        for (x, y, expected) in cases.iter() {
            assert_eq!(hamming_distance(*x, *y), *expected);
        }
    }

    /// Identical hashes score 1.0, complementary hashes 0.0, half-differing 0.5.
    #[test]
    fn hash_similarity_test() {
        let cases: [(u64, u64, f64); 3] = [
            (0, 0, 1.0),
            (u64::MAX, 0, 0.0),
            (u64::from(u32::MAX), 0, 0.5),
        ];
        for (hash1, hash2, expected) in cases.iter() {
            assert_eq!(hash_similarity(*hash1, *hash2), *expected);
        }
    }

    /// End-to-end text similarity: equal texts, overlapping texts, unrelated texts.
    #[test]
    fn similarity_test() {
        let identical = similarity("Stop hammertime", "Stop hammertime");
        assert_eq!(identical, 1.0);

        let overlapping = similarity("Hocus pocus", "Hocus pocus pilatus pas");
        assert!(overlapping > 0.7);

        let unrelated = similarity("Peanut butter", "Strawberry cocktail");
        assert!(unrelated < 0.6);
    }
}
152 |
--------------------------------------------------------------------------------