├── .gitignore ├── .rustfmt.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── src ├── backend │ ├── bucket │ │ ├── interned_str.rs │ │ ├── fixed_str.rs │ │ └── mod.rs │ ├── mod.rs │ ├── string.rs │ └── buffer.rs ├── symbol.rs ├── serde_impl.rs ├── lib.rs └── interner.rs ├── Cargo.toml ├── README.md ├── tests ├── allocator.rs └── tests.rs ├── benches ├── setup.rs └── bench.rs ├── .github └── workflows │ └── rust.yml ├── RELEASE_NOTES.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | imports_granularity = "Crate" 2 | imports_layout = "HorizontalVertical" 3 | edition = "2021" -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | APACHE License 2 | 3 | Copyright 2020 Robin Freyler 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 by Robin Freyler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/backend/bucket/interned_str.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "backends")] 2 | 3 | use core::ptr::NonNull; 4 | 5 | /// Reference to an interned string. 6 | /// 7 | /// It is inherently `unsafe` to use instances of this type and should not be 8 | /// done outside of the `string-interner` crate itself. 
9 | #[derive(Debug)] 10 | #[repr(transparent)] 11 | pub struct InternedStr { 12 | ptr: NonNull, 13 | } 14 | 15 | impl InternedStr { 16 | /// Creates a new interned string from the given `str`. 17 | #[inline] 18 | pub fn new(val: &str) -> Self { 19 | InternedStr { 20 | ptr: NonNull::from(val), 21 | } 22 | } 23 | 24 | /// Returns a shared reference to the underlying string. 25 | /// 26 | /// # Safety 27 | /// 28 | /// The user has to make sure that no lifetime guarantees are invalidated. 29 | #[inline] 30 | pub(super) fn as_str(&self) -> &str { 31 | // SAFETY: This is safe since we only ever operate on interned `str` 32 | // that are never moved around in memory to avoid danling 33 | // references. 34 | unsafe { self.ptr.as_ref() } 35 | } 36 | } 37 | 38 | impl Eq for InternedStr {} 39 | 40 | impl PartialEq for InternedStr { 41 | #[inline] 42 | fn eq(&self, other: &Self) -> bool { 43 | self.as_str() == other.as_str() 44 | } 45 | } 46 | 47 | #[cfg(test)] 48 | mod tests { 49 | use super::*; 50 | 51 | #[test] 52 | fn size_of() { 53 | use core::mem; 54 | assert_eq!(mem::size_of::(), mem::size_of::<&str>()); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/backend/bucket/fixed_str.rs: -------------------------------------------------------------------------------- 1 | use super::InternedStr; 2 | use alloc::string::String; 3 | 4 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 5 | pub struct FixedString { 6 | contents: String, 7 | } 8 | 9 | impl FixedString { 10 | /// Creates a new fixed string with the given fixed capacity. 11 | #[inline] 12 | pub fn with_capacity(cap: usize) -> Self { 13 | Self { 14 | contents: String::with_capacity(cap), 15 | } 16 | } 17 | 18 | /// Returns the underlying [`Box`]. 19 | /// 20 | /// Guarantees not to perform any reallocations in this process. 
21 | /// 22 | /// [`Box`]: alloc::boxed::Box 23 | #[inline] 24 | pub fn finish(self) -> String { 25 | self.contents 26 | } 27 | 28 | /// Returns the capacity in bytes of the fixed string. 29 | #[inline] 30 | pub fn capacity(&self) -> usize { 31 | self.contents.capacity() 32 | } 33 | 34 | /// Returns the length in bytes of the fixed string. 35 | #[inline] 36 | pub fn len(&self) -> usize { 37 | self.contents.len() 38 | } 39 | 40 | /// Pushes the given string into the fixed string if there is enough capacity. 41 | /// 42 | /// Returns a reference to the pushed string if there was enough capacity to 43 | /// perform the operation. Otherwise returns `None`. 44 | #[inline] 45 | pub fn push_str(&mut self, string: &str) -> Option { 46 | let len = self.len(); 47 | if self.capacity() < len + string.len() { 48 | return None; 49 | } 50 | self.contents.push_str(string); 51 | debug_assert_eq!(self.contents.len(), len + string.len()); 52 | Some(InternedStr::new( 53 | // SAFETY: We convert from bytes to utf8 from which we know through the 54 | // input string that they must represent valid utf8. 55 | unsafe { 56 | core::str::from_utf8_unchecked(&self.contents.as_bytes()[len..len + string.len()]) 57 | }, 58 | )) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "string-interner" 3 | version = "0.19.0" 4 | authors = ["Robbepop"] 5 | license = "MIT/Apache-2.0" 6 | readme = "README.md" 7 | repository = "https://github.com/robbepop/string-interner" 8 | documentation = "https://docs.rs/string-interner" 9 | keywords = ["interner", "intern", "string", "str", "symbol"] 10 | description = """Efficient string interner with minimal memory footprint 11 | and fast access to the underlying strings. 
12 | """ 13 | categories = ["data-structures"] 14 | edition = "2021" 15 | 16 | [dependencies] 17 | hashbrown = { version = "0.15.1", default-features = false, features = ["default-hasher", "raw-entry"] } 18 | serde = { version = "1.0", default-features = false, features = ["alloc"], optional = true } 19 | 20 | [dev-dependencies] 21 | serde_json = "1.0" 22 | criterion = "0.5.1" 23 | fxhash = "0.2" 24 | 25 | [[bench]] 26 | name = "bench" 27 | harness = false 28 | 29 | [features] 30 | default = ["std", "serde", "inline-more", "backends"] 31 | std = ["serde?/std"] 32 | 33 | # Enable this if you need `Serde` serialization and deserialization support. 34 | # 35 | # Enabled by default. 36 | serde = ["dep:serde"] 37 | 38 | # Use this to mark more public functions of the StringInterner (and hashbrown) 39 | # as inline. This significantly increases compile times of the crate but improves 40 | # upon runtime execution. 41 | # 42 | # Enabled by default. 43 | inline-more = ["hashbrown/inline-more"] 44 | 45 | # Enables the backends provided out of the box by this crate. 46 | # Disable this if you want to only use your own backend and thus don't have 47 | # the need for those present backends. Reduces compilation time of this crate. 48 | # 49 | # Enabled by default. 50 | backends = [] 51 | 52 | # Enables testing of memory heap allocations. 
53 | # 54 | # These tests are disabled by default since they are slow 55 | # compared to the other unit tests and also are required 56 | # to run single threaded using `--test-threads 1` as `rustc` 57 | # argument: 58 | # 59 | # cargo test --features test-allocations -- --test-threads 1 60 | test-allocations = [] 61 | 62 | [badges] 63 | travis-ci = { repository = "Robbepop/string-interner" } 64 | appveyor = { repository = "Robbepop/string-interner", branch = "master", service = "github" } 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # String Interner 2 | 3 | | Continuous Integration | Test Coverage | Documentation | Crates.io | 4 | |:----------------------:|:--------------------:|:----------------:|:--------------------:| 5 | | [![travisCI][1]][2] | [![codecov][5]][6] | [![docs][9]][10] | [![crates][11]][12] | 6 | 7 | A data structure to cache strings efficiently, with minimal memory footprint and the ability to associate 8 | the interned strings with unique symbols. 9 | These symbols allow for constant time comparisons and look-ups to the underlying interned string contents. 10 | Also, iterating through the interned strings is cache efficient.
11 | 12 | [1]: https://github.com/Robbepop/string-interner/workflows/Rust%20-%20Continuous%20Integration/badge.svg?branch=master 13 | [2]: https://github.com/Robbepop/string-interner/actions?query=workflow%3A%22Rust+-+Continuous+Integration%22+branch%3Amaster 14 | [5]: https://codecov.io/gh/robbepop/string-interner/branch/master/graph/badge.svg 15 | [6]: https://codecov.io/gh/Robbepop/string-interner/branch/master 16 | [9]: https://docs.rs/string-interner/badge.svg 17 | [10]: https://docs.rs/string-interner 18 | [11]: https://img.shields.io/crates/v/string-interner.svg 19 | [12]: https://crates.io/crates/string-interner 20 | 21 | [license-mit-badge]: https://img.shields.io/badge/license-MIT-blue.svg 22 | [license-apache-badge]: https://img.shields.io/badge/license-APACHE-orange.svg 23 | 24 | ## Contributing 25 | 26 | ### Testing 27 | 28 | Test the project using 29 | ``` 30 | cargo test --release 31 | ``` 32 | 33 | ### Memory Allocation Tests 34 | 35 | To further test memory consumption and allocations performed by the 36 | different string interner backends, test the project as follows: 37 | ``` 38 | cargo test --release --features test-allocations -- --test-threads 1 39 | ``` 40 | 41 | - The `--features test-allocations` flag enables the memory allocations tests. 42 | - The `--test-threads 1` argument is required for the memory allocations tests 43 | since otherwise they interfere with each other causing them to randomly fail. 44 | - Use `--nocapture` to receive verbose output useful for debugging. 45 | 46 | ### Profiling & Benchmarks 47 | 48 | Benchmark the string interner and its various backends using 49 | ``` 50 | cargo bench 51 | ``` 52 | 53 | ## License 54 | 55 | Licensed under either of 56 | 57 | * Apache license, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 58 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 59 | 60 | at your option.
61 | 62 | ### Dual licence: [![badge][license-mit-badge]](LICENSE-MIT) [![badge][license-apache-badge]](LICENSE-APACHE) 63 | 64 | ### Contribution 65 | 66 | Unless you explicitly state otherwise, any contribution intentionally submitted 67 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as below, without any 68 | additional terms or conditions. 69 | -------------------------------------------------------------------------------- /src/backend/mod.rs: -------------------------------------------------------------------------------- 1 | //! Backends for the [`StringInterner`](`crate::StringInterner`). 2 | //! 3 | //! The backend is the method or strategy that handles the actual interning. 4 | //! There are trade-offs for the different kinds of backends. A user should 5 | //! find the backend that suits their use case best. 6 | 7 | mod bucket; 8 | mod buffer; 9 | mod string; 10 | 11 | #[cfg(feature = "backends")] 12 | pub use self::{bucket::BucketBackend, buffer::BufferBackend, string::StringBackend}; 13 | use crate::Symbol; 14 | 15 | /// The default backend recommended for general use. 16 | #[cfg(feature = "backends")] 17 | pub type DefaultBackend = StringBackend; 18 | 19 | /// Types implementing this trait may act as backends for the string interner. 20 | /// 21 | /// The job of a backend is to actually store, manage and organize the interned 22 | /// strings. Different backends have different trade-offs. Users should pick 23 | /// their backend with hinsight of their personal use-case. 24 | pub trait Backend: Default { 25 | /// The symbol used by the string interner backend. 26 | type Symbol: Symbol; 27 | 28 | /// The iterator over the symbols and their strings. 29 | type Iter<'a>: Iterator 30 | where 31 | Self: 'a; 32 | 33 | /// Creates a new backend for the given capacity. 34 | /// 35 | /// The capacity denotes how many strings are expected to be interned. 
36 | fn with_capacity(cap: usize) -> Self; 37 | 38 | /// Interns the given string and returns its interned ref and symbol. 39 | /// 40 | /// # Note 41 | /// 42 | /// The backend must make sure that the returned symbol maps back to the 43 | /// original string in its [`resolve`](`Backend::resolve`) method. 44 | fn intern(&mut self, string: &str) -> Self::Symbol; 45 | 46 | /// Interns the given static string and returns its interned ref and symbol. 47 | /// 48 | /// # Note 49 | /// 50 | /// The backend must make sure that the returned symbol maps back to the 51 | /// original string in its [`resolve`](`Backend::resolve`) method. 52 | #[inline] 53 | fn intern_static(&mut self, string: &'static str) -> Self::Symbol { 54 | // The default implementation simply forwards to the normal [`intern`] 55 | // implementation. Backends that can optimize for this use case should 56 | // implement this method. 57 | self.intern(string) 58 | } 59 | 60 | /// Shrink backend capacity to fit interned symbols exactly. 61 | fn shrink_to_fit(&mut self); 62 | 63 | /// Resolves the given symbol to its original string contents. 64 | fn resolve(&self, symbol: Self::Symbol) -> Option<&str>; 65 | 66 | /// Resolves the given symbol to its original string contents. 67 | /// 68 | /// # Safety 69 | /// 70 | /// Does not perform validity checks on the given symbol and relies 71 | /// on the caller to be provided with a symbol that has been generated 72 | /// by the [`intern`](`Backend::intern`) or 73 | /// [`intern_static`](`Backend::intern_static`) methods of the same 74 | /// interner backend. 75 | unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str; 76 | 77 | /// Creates an iterator that yields all interned strings and their symbols. 
/// Allocator wrapper that forwards every request to the [`System`] allocator
/// while recording statistics about the requests in [`TracedStats`].
pub struct TracingAllocator {
    /// The allocator that actually serves the requests.
    pub inner: System,
    /// Counters updated on every (de)allocation while profiling is active.
    pub stats: TracedStats,
}

impl TracingAllocator {
    /// Creates a new tracing allocator with zeroed statistics and profiling
    /// switched off.
    #[allow(clippy::new_without_default)] // not applicable here
    pub const fn new() -> Self {
        Self {
            inner: System,
            stats: TracedStats::new(),
        }
    }

    /// Returns the statistics recorded so far.
    pub fn stats(&self) -> &TracedStats {
        &self.stats
    }

    /// Sets all counters back to zero. Does not change whether profiling is
    /// currently active.
    pub fn reset(&self) {
        self.stats.reset();
    }

    /// Starts recording allocations and deallocations.
    pub fn start_profiling(&self) {
        self.stats.start_profiling()
    }

    /// Stops recording allocations and deallocations.
    pub fn end_profiling(&self) {
        self.stats.end_profiling()
    }
}

unsafe impl GlobalAlloc for TracingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // Record first, then forward the request unchanged.
        self.stats.push_allocations(layout);
        self.inner.alloc(layout)
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        self.stats.push_deallocations(layout);
        self.inner.dealloc(ptr, layout);
    }
}

/// Allocation counters maintained by [`TracingAllocator`].
///
/// All fields are atomics so that the counters can be updated through a
/// shared reference from within the allocator callbacks.
#[derive(Debug)]
pub struct TracedStats {
    /// Whether allocations and deallocations are currently being recorded.
    is_active: AtomicBool,
    /// Number of recorded allocations.
    len_allocations: AtomicUsize,
    /// Number of recorded deallocations.
    len_deallocations: AtomicUsize,
    /// Recorded allocated bytes minus recorded deallocated bytes.
    current_memory_usage: AtomicUsize,
    /// Sum of the sizes of all recorded allocations.
    total_memory_usage: AtomicUsize,
}

impl TracedStats {
    /// Creates zeroed statistics with profiling switched off.
    const fn new() -> Self {
        Self {
            is_active: AtomicBool::new(false),
            len_allocations: AtomicUsize::new(0),
            len_deallocations: AtomicUsize::new(0),
            current_memory_usage: AtomicUsize::new(0),
            total_memory_usage: AtomicUsize::new(0),
        }
    }

    /// Returns the number of recorded allocations.
    pub fn len_allocations(&self) -> usize {
        self.len_allocations.load(Ordering::SeqCst)
    }

    /// Returns the number of recorded deallocations.
    pub fn len_deallocations(&self) -> usize {
        self.len_deallocations.load(Ordering::SeqCst)
    }

    /// Returns the number of recorded bytes that are currently allocated.
    pub fn current_allocated_bytes(&self) -> usize {
        self.current_memory_usage.load(Ordering::SeqCst)
    }

    /// Returns the total number of bytes of all recorded allocations.
    pub fn total_allocated_bytes(&self) -> usize {
        self.total_memory_usage.load(Ordering::SeqCst)
    }

    /// Returns `true` while profiling is switched on.
    fn is_active(&self) -> bool {
        self.is_active.load(Ordering::SeqCst)
    }

    /// Zeroes all counters; leaves the profiling flag untouched.
    fn reset(&self) {
        self.len_allocations.store(0, Ordering::SeqCst);
        self.len_deallocations.store(0, Ordering::SeqCst);
        self.current_memory_usage.store(0, Ordering::SeqCst);
        self.total_memory_usage.store(0, Ordering::SeqCst);
    }

    /// Switches recording on.
    fn start_profiling(&self) {
        self.is_active.store(true, Ordering::SeqCst);
    }

    /// Switches recording off.
    fn end_profiling(&self) {
        self.is_active.store(false, Ordering::SeqCst);
    }

    /// Records one allocation of `layout.size()` bytes.
    ///
    /// Ignored while profiling is off and for zero-sized layouts.
    fn push_allocations(&self, layout: Layout) {
        let size = layout.size();
        if !self.is_active() || size == 0 {
            return;
        }
        self.len_allocations.fetch_add(1, Ordering::SeqCst);
        self.current_memory_usage.fetch_add(size, Ordering::SeqCst);
        self.total_memory_usage.fetch_add(size, Ordering::SeqCst);
    }

    /// Records one deallocation of `layout.size()` bytes.
    ///
    /// Ignored while profiling is off and for zero-sized layouts.
    fn push_deallocations(&self, layout: Layout) {
        let size = layout.size();
        if !self.is_active() || size == 0 {
            return;
        }
        self.len_deallocations.fetch_add(1, Ordering::SeqCst);
        // Memory that was allocated before profiling started (or before the
        // last `reset`) may be freed while profiling is active. A plain
        // `fetch_sub` would then wrap around to a huge value, so the counter
        // saturates at zero instead.
        let _ = self.current_memory_usage.fetch_update(
            Ordering::SeqCst,
            Ordering::SeqCst,
            |current| Some(current.saturating_sub(size)),
        );
    }
}
const ALPHABET: [u8; 64] = [
    b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o', b'p',
    b'q', b'r', b's', b't', b'u', b'v', b'w', b'x', b'y', b'z', b'A', b'B', b'C', b'D', b'E', b'F',
    b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', b'U', b'V',
    b'W', b'X', b'Y', b'Z', b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'_', b'-',
];

/// A word builder for benchmark purposes.
///
/// Creates unique words of same sizes.
///
/// The builder works like a little-endian counter in base `ALPHABET.len()`:
/// every call advances the counter by one and yields the word spelled by its
/// digits. Because the counter is advanced *before* a word is produced, the
/// all-`'a'` word is never yielded.
struct WordBuilder {
    /// One index into `ALPHABET` per character of the word.
    indices: Vec<u8>,
}

impl WordBuilder {
    /// Creates a new word builder for words with given length.
    pub fn new(word_len: usize) -> Self {
        Self {
            indices: vec![0x00; word_len],
        }
    }

    /// Fills the internal buffer with the next unique word indices.
    ///
    /// Returns `None` once every index has reached the last alphabet entry.
    /// In that case all indices are reset to zero, so a later call starts the
    /// sequence over.
    fn next_indices(&mut self) -> Option<&[u8]> {
        // The alphabet has 64 entries, so its last index always fits into `u8`.
        const LAST: u8 = (ALPHABET.len() - 1) as u8;
        match self.indices.iter().position(|&index| index != LAST) {
            Some(pos) => {
                // Carry: all saturated digits before `pos` wrap around to zero.
                for index in &mut self.indices[..pos] {
                    *index = 0;
                }
                self.indices[pos] += 1;
                Some(&self.indices[..])
            }
            None => {
                for index in &mut self.indices {
                    *index = 0;
                }
                None
            }
        }
    }

    /// Returns the next unique word of the same size.
    fn next_word(&mut self) -> Option<String> {
        self.next_indices().map(|indices| {
            indices
                .iter()
                // All alphabet entries are ASCII, so each byte maps to exactly
                // one `char` and the resulting string has one byte per index.
                .map(|&index| char::from(ALPHABET[usize::from(index)]))
                .collect()
        })
    }
}

impl Iterator for WordBuilder {
    type Item = String;

    fn next(&mut self) -> Option<String> {
        self.next_word()
    }
}

/// Generates a vector of `len` unique words of the same given length.
///
/// # Panics
///
/// Panics if fewer than `len` distinct words of length `word_len` can be
/// produced.
pub fn generate_test_strings(len: usize, word_len: usize) -> Vec<String> {
    let words = WordBuilder::new(word_len).take(len).collect::<Vec<_>>();
    assert_eq!(words.len(), len);
    // Checking every word (instead of indexing `words[0]`) also covers the
    // `len == 0` case without panicking on an empty vector.
    assert!(words.iter().all(|word| word.len() == word_len));
    words
}

/// The number of strings that are going to be interned in the benchmarks.
pub const BENCH_LEN_STRINGS: usize = 100_000;

/// The length of a single interned string.
pub const BENCH_STRING_LEN: usize = 5;
Rust - Continuous Integration 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | name: Build 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, windows-latest, macos-latest] 15 | runs-on: ${{ matrix.os }} 16 | steps: 17 | - uses: actions/checkout@v4 18 | - uses: dtolnay/rust-toolchain@stable 19 | with: 20 | targets: wasm32-unknown-unknown, thumbv7em-none-eabi 21 | - name: Set up Rust cache 22 | uses: Swatinem/rust-cache@v2 23 | - name: Checkout Submodules 24 | run: git submodule update --init --recursive 25 | - name: Build 26 | run: cargo build 27 | - name: Build (All Features) 28 | run: cargo build --all-features 29 | - name: Build (no_std) 30 | run: cargo build --no-default-features --target thumbv7em-none-eabi 31 | - name: Build (Wasm) 32 | run: cargo build --no-default-features --target wasm32-unknown-unknown 33 | 34 | test: 35 | name: Test 36 | strategy: 37 | matrix: 38 | os: [ubuntu-latest, windows-latest, macos-latest] 39 | runs-on: ${{ matrix.os }} 40 | steps: 41 | - uses: actions/checkout@v4 42 | - uses: dtolnay/rust-toolchain@stable 43 | - name: Set up Rust cache 44 | uses: Swatinem/rust-cache@v2 45 | - name: Checkout Submodules 46 | run: git submodule update --init --recursive 47 | - name: Test (Release) 48 | env: 49 | RUSTFLAGS: "-C debug-assertions" 50 | run: cargo test --release 51 | 52 | test-memory: 53 | name: Test Memory Consumption 54 | strategy: 55 | matrix: 56 | os: [ubuntu-latest, windows-latest, macos-latest] 57 | runs-on: ${{ matrix.os }} 58 | steps: 59 | - uses: actions/checkout@v4 60 | - uses: dtolnay/rust-toolchain@stable 61 | - name: Set up Rust cache 62 | uses: Swatinem/rust-cache@v2 63 | - name: Test (Memory Consumption) 64 | run: cargo test --release --features test-allocations -- --test-threads 1 test_memory_consumption 65 | 66 | fmt: 67 | name: Formatting 68 | runs-on: ubuntu-latest 69 | steps: 70 | - uses: actions/checkout@v4 71 | - uses: 
dtolnay/rust-toolchain@nightly 72 | with: 73 | components: rustfmt 74 | - name: Check Formatting 75 | run: cargo fmt --all -- --check 76 | 77 | doc: 78 | name: Documentation 79 | runs-on: ubuntu-latest 80 | steps: 81 | - uses: actions/checkout@v4 82 | - uses: dtolnay/rust-toolchain@stable 83 | with: 84 | components: rust-docs, rust-src 85 | - name: Set up Rust cache 86 | uses: Swatinem/rust-cache@v2 87 | - name: Check Docs 88 | env: 89 | RUSTDOCFLAGS: '-D warnings' 90 | run: cargo doc --workspace --no-deps --document-private-items 91 | 92 | clippy: 93 | name: Clippy 94 | runs-on: ubuntu-latest 95 | steps: 96 | - uses: actions/checkout@v4 97 | - uses: dtolnay/rust-toolchain@nightly 98 | with: 99 | components: clippy 100 | - name: Set up Rust cache 101 | uses: Swatinem/rust-cache@v2 102 | - name: Check Clippy 103 | run: cargo clippy -- -D warnings 104 | 105 | outdated: 106 | name: Outdated Dependencies 107 | runs-on: ubuntu-latest 108 | steps: 109 | - uses: actions/checkout@v4 110 | - uses: dtolnay/rust-toolchain@stable 111 | - name: Set up Rust cache 112 | uses: Swatinem/rust-cache@v2 113 | - name: Install `cargo-outdated` 114 | run: cargo install cargo-outdated 115 | - name: Check Dependencies 116 | run: cargo outdated --root-deps-only --workspace --exit-code 1 117 | 118 | audit: 119 | name: Audit 120 | runs-on: ubuntu-latest 121 | steps: 122 | - uses: actions/checkout@v4 123 | - uses: dtolnay/rust-toolchain@stable 124 | - name: Set up Rust cache 125 | uses: Swatinem/rust-cache@v2 126 | - name: Install `cargo-audit` 127 | run: cargo install cargo-audit 128 | - name: Audit Dependencies 129 | run: cargo audit --deny warnings 130 | 131 | miri: 132 | name: Miri 133 | runs-on: ubuntu-latest 134 | steps: 135 | - uses: actions/checkout@v4 136 | - uses: dtolnay/rust-toolchain@nightly 137 | with: 138 | components: miri 139 | - name: Set up Rust cache 140 | uses: Swatinem/rust-cache@v2 141 | - name: Test with Miri 142 | run: cargo miri test 143 | 144 | coverage: 145 | name: 
Coverage 146 | runs-on: ubuntu-latest 147 | container: 148 | image: xd009642/tarpaulin:develop-nightly 149 | options: --security-opt seccomp=unconfined 150 | steps: 151 | - name: Checkout repository 152 | uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 153 | with: 154 | submodules: true 155 | fetch-depth: 0 156 | - uses: dtolnay/rust-toolchain@nightly 157 | - name: Set up Rust cache 158 | uses: Swatinem/rust-cache@v2 159 | - name: Generate code coverage 160 | run: | 161 | cargo tarpaulin --verbose --all-features --workspace --timeout 120 --out xml -- --test-threads 1 162 | - name: Upload to codecov.io 163 | uses: codecov/codecov-action@v5.3.1 164 | with: 165 | token: ${{secrets.CODECOV_TOKEN}} 166 | fail_ci_if_error: true 167 | - name: Archive code coverage results 168 | uses: actions/upload-artifact@v4 169 | with: 170 | name: code-coverage-report 171 | path: cobertura.xml 172 | -------------------------------------------------------------------------------- /src/symbol.rs: -------------------------------------------------------------------------------- 1 | //! Interfaces and types to be used as symbols for the 2 | //! [`StringInterner`](`crate::StringInterner`). 3 | //! 4 | //! The [`StringInterner::get_or_intern`](`crate::StringInterner::get_or_intern`) 5 | //! method returns `Symbol` types that allow to look-up the original string 6 | //! using [`StringInterner::resolve`](`crate::StringInterner::resolve`). 7 | 8 | use core::num::{NonZeroU16, NonZeroU32, NonZeroUsize}; 9 | 10 | /// Types implementing this trait can be used as symbols for string interners. 11 | /// 12 | /// The [`StringInterner::get_or_intern`](`crate::StringInterner::get_or_intern`) 13 | /// method returns `Symbol` types that allow to look-up the original string 14 | /// using [`StringInterner::resolve`](`crate::StringInterner::resolve`). 15 | /// 16 | /// # Note 17 | /// 18 | /// Optimal symbols allow for efficient comparisons and have a small memory footprint. 
pub trait Symbol: Copy + Eq {
    /// Creates a symbol from a `usize`.
    ///
    /// Returns `None` if `index` is out of bounds for the symbol.
    fn try_from_usize(index: usize) -> Option<Self>;

    /// Returns the `usize` representation of `self`.
    fn to_usize(self) -> usize;
}

/// Creates the symbol `S` from the given `usize`.
///
/// # Panics
///
/// Panics if the conversion is invalid.
#[cfg(feature = "backends")]
#[inline]
pub(crate) fn expect_valid_symbol<S>(index: usize) -> S
where
    S: Symbol,
{
    S::try_from_usize(index).expect("encountered invalid symbol")
}

/// The symbol type that is used by default.
pub type DefaultSymbol = SymbolU32;

impl Symbol for usize {
    #[inline]
    fn try_from_usize(index: usize) -> Option<Self> {
        Some(index)
    }

    #[inline]
    fn to_usize(self) -> usize {
        self
    }
}

/// Generates a symbol type that stores `index + 1` in a non-zero integer.
///
/// Storing the index shifted by one keeps zero free, so the generated type
/// has the same size as an `Option` of itself (checked for `DefaultSymbol` by
/// the tests of this module). The largest value of the base type is therefore
/// not a valid index.
macro_rules! gen_symbol_for {
    (
        $( #[$doc:meta] )*
        struct $name:ident($non_zero:ty; $base_ty:ty);
    ) => {
        $( #[$doc] )*
        #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
        pub struct $name {
            pub(crate) value: $non_zero,
        }

        impl $name {
            /// Creates the symbol for `index`.
            ///
            /// Returns `None` if `index` is the maximum value of the base
            /// type, since `index + 1` is then not representable.
            pub(crate) fn new(index: $base_ty) -> Option<Self> {
                <$non_zero>::new((index).wrapping_add(1))
                    .map(|value| Self { value })
            }
        }

        impl Symbol for $name {
            #[inline]
            fn try_from_usize(index: usize) -> Option<Self> {
                // A plain `as` cast would silently truncate indices that do
                // not fit into the base type and alias them with small valid
                // indices (e.g. `65536 as u16 == 0`). The checked conversion
                // rejects them instead.
                <$base_ty as core::convert::TryFrom<usize>>::try_from(index)
                    .ok()
                    .and_then(Self::new)
            }

            #[inline]
            fn to_usize(self) -> usize {
                // Undo the `+ 1` shift applied in `new`.
                self.value.get() as usize - 1
            }
        }
    };
}
gen_symbol_for!(
    /// Symbol that is 16-bit in size.
    ///
    /// Is space-optimized for use in `Option`.
    struct SymbolU16(NonZeroU16; u16);
);
gen_symbol_for!(
    /// Symbol that is 32-bit in size.
    ///
    /// Is space-optimized for use in `Option`.
    struct SymbolU32(NonZeroU32; u32);
);
gen_symbol_for!(
    /// Symbol that is the same size as a pointer (`usize`).
    ///
    /// Is space-optimized for use in `Option`.
    struct SymbolUsize(NonZeroUsize; usize);
);

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn same_size_as_u32() {
        assert_eq!(
            core::mem::size_of::<DefaultSymbol>(),
            core::mem::size_of::<u32>()
        );
    }

    #[test]
    fn same_size_as_optional() {
        assert_eq!(
            core::mem::size_of::<DefaultSymbol>(),
            core::mem::size_of::<Option<DefaultSymbol>>()
        );
    }

    #[test]
    fn try_from_usize_works() {
        assert_eq!(
            SymbolU16::try_from_usize(0),
            Some(SymbolU16 {
                value: NonZeroU16::new(1).unwrap()
            })
        );
        assert_eq!(
            SymbolU16::try_from_usize(u16::MAX as usize - 1),
            Some(SymbolU16 {
                value: NonZeroU16::new(u16::MAX).unwrap()
            })
        );
        assert_eq!(SymbolU16::try_from_usize(u16::MAX as usize), None);
        // Must not be truncated to index 0.
        assert_eq!(SymbolU16::try_from_usize(u16::MAX as usize + 1), None);
        assert_eq!(SymbolU16::try_from_usize(usize::MAX), None);
    }

    macro_rules! gen_test_for {
        ( $test_name:ident: struct $name:ident($non_zero:ty; $base_ty:ty); ) => {
            #[test]
            fn $test_name() {
                for val in 0..10 {
                    assert_eq!(
                        <$name>::try_from_usize(val),
                        Some($name {
                            value: <$non_zero>::new(val as $base_ty + 1).unwrap()
                        })
                    );
                }
                assert_eq!(
                    <$name>::try_from_usize(<$base_ty>::MAX as usize - 1),
                    Some($name {
                        value: <$non_zero>::new(<$base_ty>::MAX).unwrap()
                    })
                );
                assert_eq!(<$name>::try_from_usize(<$base_ty>::MAX as usize), None);
                assert_eq!(<$name>::try_from_usize(<usize>::MAX), None);
            }
        };
    }
    gen_test_for!(
        try_from_usize_works_for_u16:
        struct SymbolU16(NonZeroU16; u16);
    );
    gen_test_for!(
        try_from_usize_works_for_u32:
        struct SymbolU32(NonZeroU32; u32);
    );
    gen_test_for!(
        try_from_usize_works_for_usize:
        struct SymbolUsize(NonZeroUsize; usize);
    );
}
23 | } 24 | seq.end() 25 | } 26 | } 27 | 28 | impl<'de, B, H> Deserialize<'de> for StringInterner 29 | where 30 | B: Backend, 31 | ::Symbol: Symbol, 32 | H: BuildHasher + Default, 33 | { 34 | fn deserialize(deserializer: D) -> Result, D::Error> 35 | where 36 | D: Deserializer<'de>, 37 | { 38 | deserializer.deserialize_seq(StringInternerVisitor::default()) 39 | } 40 | } 41 | 42 | struct StringInternerVisitor 43 | where 44 | B: Backend, 45 | ::Symbol: Symbol, 46 | H: BuildHasher, 47 | { 48 | mark: marker::PhantomData<(::Symbol, B, H)>, 49 | } 50 | 51 | impl Default for StringInternerVisitor 52 | where 53 | B: Backend, 54 | ::Symbol: Symbol, 55 | H: BuildHasher, 56 | { 57 | fn default() -> Self { 58 | StringInternerVisitor { 59 | mark: marker::PhantomData, 60 | } 61 | } 62 | } 63 | 64 | impl<'de, B, H> Visitor<'de> for StringInternerVisitor 65 | where 66 | B: Backend, 67 | ::Symbol: Symbol, 68 | H: BuildHasher + Default, 69 | { 70 | type Value = StringInterner; 71 | 72 | fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { 73 | formatter.write_str("Expected a contiguous sequence of strings.") 74 | } 75 | 76 | fn visit_seq(self, mut seq: A) -> Result 77 | where 78 | A: SeqAccess<'de>, 79 | { 80 | let mut interner: StringInterner = 81 | StringInterner::with_capacity_and_hasher(seq.size_hint().unwrap_or(0), H::default()); 82 | while let Some(s) = seq.next_element::>()? { 83 | interner.get_or_intern(s); 84 | } 85 | Ok(interner) 86 | } 87 | } 88 | 89 | macro_rules! 
impl_serde_for_symbol { 90 | ($name:ident, $ty:ty) => { 91 | impl ::serde::Serialize for $crate::symbol::$name { 92 | fn serialize( 93 | &self, 94 | serializer: T, 95 | ) -> ::core::result::Result { 96 | self.to_usize().serialize(serializer) 97 | } 98 | } 99 | 100 | impl<'de> ::serde::Deserialize<'de> for $crate::symbol::$name { 101 | fn deserialize>( 102 | deserializer: D, 103 | ) -> ::core::result::Result { 104 | let index = <$ty as ::serde::Deserialize<'de>>::deserialize(deserializer)?; 105 | let ::core::option::Option::Some(symbol) = Self::new(index) else { 106 | return ::core::result::Result::Err(::custom( 107 | ::core::concat!( 108 | "invalid index value for `", 109 | ::core::stringify!($name), 110 | "`" 111 | ), 112 | )); 113 | }; 114 | ::core::result::Result::Ok(symbol) 115 | } 116 | } 117 | }; 118 | } 119 | impl_serde_for_symbol!(SymbolU16, u16); 120 | impl_serde_for_symbol!(SymbolU32, u32); 121 | impl_serde_for_symbol!(SymbolUsize, usize); 122 | 123 | #[cfg(test)] 124 | mod tests { 125 | use crate::{ 126 | symbol::{SymbolU16, SymbolU32, SymbolUsize}, 127 | Symbol, 128 | }; 129 | use serde_json; 130 | 131 | fn symbol_round_trip_serializes(symbol: S) -> bool 132 | where 133 | S: Symbol + std::fmt::Debug + serde::Serialize + serde::de::DeserializeOwned + PartialEq, 134 | { 135 | let serialized = serde_json::to_string(&symbol).expect("serialization should succeed"); 136 | let deserialized: S = 137 | serde_json::from_str(&serialized).expect("deserialization should succeed"); 138 | symbol == deserialized 139 | } 140 | 141 | #[test] 142 | fn symbol_u16_round_trips() { 143 | assert!(symbol_round_trip_serializes( 144 | SymbolU16::try_from_usize(0).unwrap() 145 | )); 146 | assert!(symbol_round_trip_serializes( 147 | SymbolU16::try_from_usize(42).unwrap() 148 | )); 149 | assert!(symbol_round_trip_serializes( 150 | SymbolU16::try_from_usize(u16::MAX as usize - 1).unwrap() 151 | )); 152 | } 153 | 154 | #[test] 155 | fn symbol_u32_round_trips() { 156 | 
assert!(symbol_round_trip_serializes( 157 | SymbolU32::try_from_usize(0).unwrap() 158 | )); 159 | assert!(symbol_round_trip_serializes( 160 | SymbolU32::try_from_usize(42).unwrap() 161 | )); 162 | assert!(symbol_round_trip_serializes( 163 | SymbolU32::try_from_usize(u32::MAX as usize - 1).unwrap() 164 | )); 165 | } 166 | 167 | #[test] 168 | fn symbol_usize_round_trips() { 169 | assert!(symbol_round_trip_serializes( 170 | SymbolUsize::try_from_usize(0).unwrap() 171 | )); 172 | assert!(symbol_round_trip_serializes( 173 | SymbolUsize::try_from_usize(42).unwrap() 174 | )); 175 | assert!(symbol_round_trip_serializes( 176 | SymbolUsize::try_from_usize(usize::MAX as usize - 1).unwrap() 177 | )); 178 | } 179 | 180 | #[test] 181 | fn raw_usize_round_trips() { 182 | assert!(symbol_round_trip_serializes( 183 | usize::try_from_usize(0).unwrap() 184 | )); 185 | assert!(symbol_round_trip_serializes( 186 | usize::try_from_usize(42).unwrap() 187 | )); 188 | assert!(symbol_round_trip_serializes( 189 | usize::try_from_usize(usize::MAX).unwrap() 190 | )); 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | #![doc(html_root_url = "https://docs.rs/crate/string-interner/0.18.0")] 3 | #![warn(unsafe_op_in_unsafe_fn, clippy::redundant_closure_for_method_calls)] 4 | 5 | //! Caches strings efficiently, with minimal memory footprint and associates them with unique symbols. 6 | //! These symbols allow constant time comparisons and look-ups to the underlying interned strings. 7 | //! 8 | //! ### Example: Interning & Symbols 9 | //! 10 | //! ``` 11 | //! use string_interner::StringInterner; 12 | //! 13 | //! let mut interner = StringInterner::default(); 14 | //! let sym0 = interner.get_or_intern("Elephant"); 15 | //! let sym1 = interner.get_or_intern("Tiger"); 16 | //! let sym2 = interner.get_or_intern("Horse"); 17 | //! 
let sym3 = interner.get_or_intern("Tiger"); 18 | //! assert_ne!(sym0, sym1); 19 | //! assert_ne!(sym0, sym2); 20 | //! assert_ne!(sym1, sym2); 21 | //! assert_eq!(sym1, sym3); // same! 22 | //! ``` 23 | //! 24 | //! ### Example: Creation by `FromIterator` 25 | //! 26 | //! ``` 27 | //! # use string_interner::DefaultStringInterner; 28 | //! let interner = ["Elephant", "Tiger", "Horse", "Tiger"] 29 | //! .into_iter() 30 | //! .collect::<DefaultStringInterner>(); 31 | //! ``` 32 | //! 33 | //! ### Example: Look-up 34 | //! 35 | //! ``` 36 | //! # use string_interner::StringInterner; 37 | //! let mut interner = StringInterner::default(); 38 | //! let sym = interner.get_or_intern("Banana"); 39 | //! assert_eq!(interner.resolve(sym), Some("Banana")); 40 | //! ``` 41 | //! 42 | //! ### Example: Iteration 43 | //! 44 | //! ``` 45 | //! # use string_interner::{DefaultStringInterner, Symbol}; 46 | //! let interner = <DefaultStringInterner>::from_iter(["Earth", "Water", "Fire", "Air"]); 47 | //! for (sym, str) in &interner { 48 | //! println!("{} = {}", sym.to_usize(), str); 49 | //! } 50 | //! ``` 51 | //! 52 | //! ### Example: Use Different Backend 53 | //! 54 | //! ``` 55 | //! # use string_interner::StringInterner; 56 | //! use string_interner::backend::BufferBackend; 57 | //! type Interner = StringInterner<BufferBackend>; 58 | //! let mut interner = Interner::new(); 59 | //! let sym1 = interner.get_or_intern("Tiger"); 60 | //! let sym2 = interner.get_or_intern("Horse"); 61 | //! let sym3 = interner.get_or_intern("Tiger"); 62 | //! assert_ne!(sym1, sym2); 63 | //! assert_eq!(sym1, sym3); // same! 64 | //! ``` 65 | //! 66 | //! ### Example: Use Different Backend & Symbol 67 | //! 68 | //! ``` 69 | //! # use string_interner::StringInterner; 70 | //! use string_interner::{backend::BucketBackend, symbol::SymbolU16}; 71 | //! type Interner = StringInterner<BucketBackend<SymbolU16>>; 72 | //! let mut interner = Interner::new(); 73 | //! let sym1 = interner.get_or_intern("Tiger"); 74 | //! let sym2 = interner.get_or_intern("Horse"); 75 | //!
let sym3 = interner.get_or_intern("Tiger"); 76 | //! assert_ne!(sym1, sym2); 77 | //! assert_eq!(sym1, sym3); // same! 78 | //! ``` 79 | //! 80 | //! ## Backends 81 | //! 82 | //! The `string_interner` crate provides different backends with different strengths. 83 | //! The table below compactly shows when to use which backend according to the following 84 | //! performance characteristics and properties. 85 | //! 86 | //! | **Property** | **BucketBackend** | **StringBackend** | **BufferBackend** | | Explanation | 87 | //! |:-------------|:-----------------:|:-----------------:|:-----------------:|:--|:--| 88 | //! | Fill | 🤷 | 👍 | ⭐ | | Efficiency of filling an empty string interner. | 89 | //! | Fill Duplicates | 1) | 1) | 1) | | Efficiency of filling a string interner with strings that are already interned. | 90 | //! | Resolve | ⭐ | 👍 | 👎 | | Efficiency of resolving a symbol of an interned string. | 91 | //! | Resolve Unchecked | 👍 | 👍 | ⭐ 2) | | Efficiency of unchecked resolving a symbol of an interned string. | 92 | //! | Allocations | 🤷 | 👍 | ⭐ | | The number of allocations performed by the backend. | 93 | //! | Footprint | 🤷 | 👍 | ⭐ | | The total heap memory consumed by the backend. | 94 | //! | Iteration | ⭐ | 👍 | 👎 | | Efficiency of iterating over the interned strings. | 95 | //! | | | | | | | 96 | //! | Contiguous | ✅ | ✅ | ❌ | | The returned symbols have contiguous values. | 97 | //! | Stable Refs | ✅ | ❌ | ❌ | | The interned strings have stable references. | 98 | //! | Static Strings | ✅ | ❌ | ❌ | | Allows to intern `&'static str` without heap allocations. | 99 | //! 100 | //! 1. Performance of interning pre-interned string is the same for all backends since 101 | //! this is implemented in the `StringInterner` front-end via a `HashMap` query for 102 | //! all `StringInterner` instances. 103 | //! 104 | //! 2. `BufferBackend` is slow with checked resolving because its internal representation 105 | //! 
is extremely sensitive to the correctness of the symbols, thus a lot of checks 106 | //! are performed. If you will only use symbols provided by the same instance of 107 | //! `BufferBackend`, `resolve_unchecked` is a lot faster. 108 | //! 109 | //! ### Legend 110 | //! 111 | //! | ⭐ | **best performance** | 👍 | **good performance** | 🤷 | **okay performance** | 👎 | **bad performance** | 112 | //! |-|-|-|-|-|-|-|-| 113 | //! 114 | //! ## When to use which backend? 115 | //! 116 | //! ### Bucket Backend 117 | //! 118 | //! Given the table above the `BucketBackend` might seem inferior to the other backends. 119 | //! However, it can efficiently intern `&'static str` and avoids deallocations. 120 | //! 121 | //! ### String Backend 122 | //! 123 | //! Overall the `StringBackend` performs really well and therefore is the backend 124 | //! that the `StringInterner` uses by default. 125 | //! 126 | //! ### Buffer Backend 127 | //! 128 | //! The `BufferBackend` is in some sense similar to the `StringBackend` on steroids. 129 | //! Some operations are even slightly more efficient and it consumes less memory. 130 | //! However, all this is at the cost of a less efficient resolution of symbols. 131 | //! Note that the symbols generated by the `BufferBackend` are not contiguous. 132 | //! 133 | //! ## Customizing String Hashing 134 | //! 135 | //! To ensure only one copy of each string is interned, [`StringInterner`] relies on [hashbrown]'s 136 | //! [hashmap](hashbrown::HashMap), which necessitates choosing a hashing function for hashing the 137 | //! strings. 138 | //! 139 | //! By default, [`StringInterner`] will use hashbrown's [`DefaultHashBuilder`], which should be 140 | //! appropriate for most users. However, you may customize the hash function via 141 | //! [`StringInterner`]'s second type parameter: 142 | //! 143 | //! ``` 144 | //! use std::hash::RandomState; 145 | //! use string_interner::{StringInterner, DefaultBackend}; 146 | //! 147 | //!
// create a StringInterner with the default backend but using std's RandomState hasher 148 | //! let interned_strs: StringInterner = StringInterner::new(); 149 | //! ``` 150 | //! 151 | //! NB: as of hashbrown v0.15.2, the [`DefaultHashBuilder`] is [foldhash's 152 | //! RandomState](https://docs.rs/foldhash/latest/foldhash/fast/struct.RandomState.html), which 153 | //! relies on a one-time random initialization of shared global state; if you need stable hashes 154 | //! then you may wish to use [foldhash's 155 | //! FixedState](https://docs.rs/foldhash/latest/foldhash/fast/struct.FixedState.html) (or similar) 156 | //! instead. 157 | 158 | extern crate alloc; 159 | #[cfg(feature = "std")] 160 | #[macro_use] 161 | extern crate std; 162 | 163 | #[cfg(feature = "serde")] 164 | mod serde_impl; 165 | 166 | pub mod backend; 167 | mod interner; 168 | pub mod symbol; 169 | 170 | /// A convenience [`StringInterner`] type based on the [`DefaultBackend`]. 171 | #[cfg(feature = "backends")] 172 | pub type DefaultStringInterner = 173 | self::interner::StringInterner; 174 | 175 | #[cfg(feature = "backends")] 176 | #[doc(inline)] 177 | pub use self::backend::DefaultBackend; 178 | #[doc(inline)] 179 | pub use self::{ 180 | interner::StringInterner, 181 | symbol::{DefaultSymbol, Symbol}, 182 | }; 183 | 184 | #[doc(inline)] 185 | pub use hashbrown::DefaultHashBuilder; 186 | -------------------------------------------------------------------------------- /src/backend/string.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "backends")] 2 | 3 | use super::Backend; 4 | use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol}; 5 | use alloc::{string::String, vec::Vec}; 6 | use core::{iter::Enumerate, marker::PhantomData, slice}; 7 | 8 | /// An interner backend that accumulates all interned string contents into one string. 
9 | /// 10 | /// # Note 11 | /// 12 | /// Implementation inspired by [CAD97's](https://github.com/CAD97) research 13 | /// project [`strena`](https://github.com/CAD97/strena). 14 | /// 15 | /// # Usage Hint 16 | /// 17 | /// Use this backend if runtime performance is what matters most to you. 18 | /// 19 | /// # Usage 20 | /// 21 | /// - **Fill:** Efficiency of filling an empty string interner. 22 | /// - **Resolve:** Efficiency of interned string look-up given a symbol. 23 | /// - **Allocations:** The number of allocations performed by the backend. 24 | /// - **Footprint:** The total heap memory consumed by the backend. 25 | /// - **Contiguous:** True if the returned symbols have contiguous values. 26 | /// - **Iteration:** Efficiency of iterating over the interned strings. 27 | /// 28 | /// Rating varies between **bad**, **ok**, **good** and **best**. 29 | /// 30 | /// | Scenario | Rating | 31 | /// |:------------|:--------:| 32 | /// | Fill | **good** | 33 | /// | Resolve | **ok** | 34 | /// | Allocations | **good** | 35 | /// | Footprint | **good** | 36 | /// | Supports `get_or_intern_static` | **no** | 37 | /// | `Send` + `Sync` | **yes** | 38 | /// | Contiguous | **yes** | 39 | /// | Iteration | **good** | 40 | #[derive(Debug)] 41 | pub struct StringBackend { 42 | ends: Vec, 43 | buffer: String, 44 | marker: PhantomData S>, 45 | } 46 | 47 | /// Represents a `[from, to)` index into the `StringBackend` buffer. 
48 | #[derive(Debug, Copy, Clone, PartialEq, Eq)] 49 | pub struct Span { 50 | from: usize, 51 | to: usize, 52 | } 53 | 54 | impl PartialEq for StringBackend 55 | where 56 | S: Symbol, 57 | { 58 | fn eq(&self, other: &Self) -> bool { 59 | if self.ends.len() != other.ends.len() { 60 | return false; 61 | } 62 | for ((_, lhs), (_, rhs)) in self.into_iter().zip(other) { 63 | if lhs != rhs { 64 | return false; 65 | } 66 | } 67 | true 68 | } 69 | } 70 | 71 | impl Eq for StringBackend where S: Symbol {} 72 | 73 | impl Clone for StringBackend { 74 | fn clone(&self) -> Self { 75 | Self { 76 | ends: self.ends.clone(), 77 | buffer: self.buffer.clone(), 78 | marker: Default::default(), 79 | } 80 | } 81 | } 82 | 83 | impl Default for StringBackend { 84 | #[cfg_attr(feature = "inline-more", inline)] 85 | fn default() -> Self { 86 | Self { 87 | ends: Vec::default(), 88 | buffer: String::default(), 89 | marker: Default::default(), 90 | } 91 | } 92 | } 93 | 94 | impl StringBackend 95 | where 96 | S: Symbol, 97 | { 98 | /// Returns the next available symbol. 99 | fn next_symbol(&self) -> S { 100 | expect_valid_symbol(self.ends.len()) 101 | } 102 | 103 | /// Returns the string associated with the span. 104 | fn span_to_str(&self, span: Span) -> &str { 105 | // SAFETY: - We convert a `String` into its underlying bytes and then 106 | // directly reinterpret it as `&str` again which is safe. 107 | // - Nothing mutates the string in between since this is a `&self` 108 | // method. 109 | // - The spans we use for `[start..end)` ranges are always 110 | // constructed in accordance with valid utf8 byte ranges. 111 | unsafe { core::str::from_utf8_unchecked(&self.buffer.as_bytes()[span.from..span.to]) } 112 | } 113 | 114 | /// Returns the span for the given symbol if any.
115 | fn symbol_to_span(&self, symbol: S) -> Option { 116 | let index = symbol.to_usize(); 117 | self.ends.get(index).copied().map(|to| { 118 | let from = self.ends.get(index.wrapping_sub(1)).copied().unwrap_or(0); 119 | Span { from, to } 120 | }) 121 | } 122 | 123 | /// Returns the span for the given symbol without bounds-checking its index; the caller must guarantee that `symbol.to_usize()` is a valid index into `ends`. 124 | unsafe fn symbol_to_span_unchecked(&self, symbol: S) -> Span { 125 | let index = symbol.to_usize(); 126 | // SAFETY: The function is marked unsafe so that the caller guarantees 127 | // that required invariants are checked. 128 | let to = unsafe { *self.ends.get_unchecked(index) }; 129 | let from = self.ends.get(index.wrapping_sub(1)).copied().unwrap_or(0); 130 | Span { from, to } 131 | } 132 | 133 | /// Pushes the given string into the buffer and returns its symbol. 134 | /// 135 | /// # Panics 136 | /// 137 | /// If the backend ran out of symbols. 138 | fn push_string(&mut self, string: &str) -> S { 139 | self.buffer.push_str(string); 140 | let to = self.buffer.len(); 141 | let symbol = self.next_symbol(); 142 | self.ends.push(to); 143 | symbol 144 | } 145 | } 146 | 147 | impl Backend for StringBackend 148 | where 149 | S: Symbol, 150 | { 151 | type Symbol = S; 152 | type Iter<'a> 153 | = Iter<'a, S> 154 | where 155 | Self: 'a; 156 | 157 | #[cfg_attr(feature = "inline-more", inline)] 158 | fn with_capacity(cap: usize) -> Self { 159 | // According to google the approx. word length is 5.
160 | let default_word_len = 5; 161 | Self { 162 | ends: Vec::with_capacity(cap), 163 | buffer: String::with_capacity(cap * default_word_len), 164 | marker: Default::default(), 165 | } 166 | } 167 | 168 | #[inline] 169 | fn intern(&mut self, string: &str) -> Self::Symbol { 170 | self.push_string(string) 171 | } 172 | 173 | #[inline] 174 | fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { 175 | self.symbol_to_span(symbol) 176 | .map(|span| self.span_to_str(span)) 177 | } 178 | 179 | fn shrink_to_fit(&mut self) { 180 | self.ends.shrink_to_fit(); 181 | self.buffer.shrink_to_fit(); 182 | } 183 | 184 | #[inline] 185 | unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { 186 | // SAFETY: The function is marked unsafe so that the caller guarantees 187 | // that required invariants are checked. 188 | unsafe { self.span_to_str(self.symbol_to_span_unchecked(symbol)) } 189 | } 190 | 191 | #[inline] 192 | fn iter(&self) -> Self::Iter<'_> { 193 | Iter::new(self) 194 | } 195 | } 196 | 197 | impl<'a, S> IntoIterator for &'a StringBackend 198 | where 199 | S: Symbol, 200 | { 201 | type Item = (S, &'a str); 202 | type IntoIter = Iter<'a, S>; 203 | 204 | #[cfg_attr(feature = "inline-more", inline)] 205 | fn into_iter(self) -> Self::IntoIter { 206 | self.iter() 207 | } 208 | } 209 | 210 | pub struct Iter<'a, S> { 211 | backend: &'a StringBackend, 212 | start: usize, 213 | ends: Enumerate>, 214 | } 215 | 216 | impl<'a, S> Iter<'a, S> { 217 | #[cfg_attr(feature = "inline-more", inline)] 218 | pub fn new(backend: &'a StringBackend) -> Self { 219 | Self { 220 | backend, 221 | start: 0, 222 | ends: backend.ends.iter().enumerate(), 223 | } 224 | } 225 | } 226 | 227 | impl<'a, S> Iterator for Iter<'a, S> 228 | where 229 | S: Symbol, 230 | { 231 | type Item = (S, &'a str); 232 | 233 | #[inline] 234 | fn size_hint(&self) -> (usize, Option) { 235 | self.ends.size_hint() 236 | } 237 | 238 | #[inline] 239 | fn next(&mut self) -> Option { 240 | self.ends.next().map(|(id, 
&to)| { 241 | let from = core::mem::replace(&mut self.start, to); 242 | ( 243 | expect_valid_symbol(id), 244 | self.backend.span_to_str(Span { from, to }), 245 | ) 246 | }) 247 | } 248 | } 249 | -------------------------------------------------------------------------------- /src/backend/bucket/mod.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "backends")] 2 | 3 | mod fixed_str; 4 | mod interned_str; 5 | 6 | use self::{fixed_str::FixedString, interned_str::InternedStr}; 7 | use super::Backend; 8 | use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol}; 9 | use alloc::{string::String, vec::Vec}; 10 | use core::{iter::Enumerate, marker::PhantomData, slice}; 11 | 12 | /// An interner backend that reduces memory allocations by using string buckets. 13 | /// 14 | /// # Note 15 | /// 16 | /// Implementation inspired by matklad's blog post that can be found here: 17 | /// 18 | /// 19 | /// # Usage Hint 20 | /// 21 | /// Use when deallocations or copy overhead is costly or when 22 | /// interning of static strings is especially common. 23 | /// 24 | /// # Usage 25 | /// 26 | /// - **Fill:** Efficiency of filling an empty string interner. 27 | /// - **Resolve:** Efficiency of interned string look-up given a symbol. 28 | /// - **Allocations:** The number of allocations performed by the backend. 29 | /// - **Footprint:** The total heap memory consumed by the backend. 30 | /// - **Contiguous:** True if the returned symbols have contiguous values. 31 | /// - **Iteration:** Efficiency of iterating over the interned strings. 32 | /// 33 | /// Rating varies between **bad**, **ok**, **good** and **best**. 
34 | /// 35 | /// | Scenario | Rating | 36 | /// |:------------|:--------:| 37 | /// | Fill | **good** | 38 | /// | Resolve | **best** | 39 | /// | Allocations | **good** | 40 | /// | Footprint | **ok** | 41 | /// | Supports `get_or_intern_static` | **yes** | 42 | /// | `Send` + `Sync` | **yes** | 43 | /// | Contiguous | **yes** | 44 | /// | Iteration | **best** | 45 | #[derive(Debug)] 46 | pub struct BucketBackend { 47 | spans: Vec, 48 | head: FixedString, 49 | full: Vec, 50 | marker: PhantomData S>, 51 | } 52 | 53 | /// # Safety 54 | /// 55 | /// The bucket backend requires a manual [`Send`] impl because it is self 56 | /// referential. When cloning a bucket backend a deep clone is performed and 57 | /// all references to itself are updated for the clone. 58 | unsafe impl Send for BucketBackend where S: Symbol {} 59 | 60 | /// # Safety 61 | /// 62 | /// The bucket backend requires a manual [`Sync`] impl because it is self 63 | /// referential. Those references won't escape its own scope and also 64 | /// the bucket backend has no interior mutability.
65 | unsafe impl Sync for BucketBackend where S: Symbol {} 66 | 67 | impl Default for BucketBackend { 68 | #[cfg_attr(feature = "inline-more", inline)] 69 | fn default() -> Self { 70 | Self { 71 | spans: Vec::new(), 72 | head: FixedString::default(), 73 | full: Vec::new(), 74 | marker: Default::default(), 75 | } 76 | } 77 | } 78 | 79 | impl Backend for BucketBackend 80 | where 81 | S: Symbol, 82 | { 83 | type Symbol = S; 84 | type Iter<'a> 85 | = Iter<'a, S> 86 | where 87 | Self: 'a; 88 | 89 | #[cfg_attr(feature = "inline-more", inline)] 90 | fn with_capacity(cap: usize) -> Self { 91 | Self { 92 | spans: Vec::with_capacity(cap), 93 | head: FixedString::with_capacity(cap), 94 | full: Vec::new(), 95 | marker: Default::default(), 96 | } 97 | } 98 | 99 | #[inline] 100 | fn intern(&mut self, string: &str) -> Self::Symbol { 101 | // SAFETY: This is safe because we never hand out the returned 102 | // interned string instance to the outside and only operate 103 | // on it within this backend. 104 | let interned = unsafe { self.alloc(string) }; 105 | self.push_span(interned) 106 | } 107 | 108 | #[cfg_attr(feature = "inline-more", inline)] 109 | fn intern_static(&mut self, string: &'static str) -> Self::Symbol { 110 | let interned = InternedStr::new(string); 111 | self.push_span(interned) 112 | } 113 | 114 | fn shrink_to_fit(&mut self) { 115 | self.spans.shrink_to_fit(); 116 | // Commenting out the below line fixes: https://github.com/Robbepop/string-interner/issues/46 117 | // self.head.shrink_to_fit(); 118 | self.full.shrink_to_fit(); 119 | } 120 | 121 | #[inline] 122 | fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { 123 | self.spans.get(symbol.to_usize()).map(InternedStr::as_str) 124 | } 125 | 126 | #[inline] 127 | unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { 128 | // SAFETY: The function is marked unsafe so that the caller guarantees 129 | // that required invariants are checked. 
130 | unsafe { self.spans.get_unchecked(symbol.to_usize()).as_str() } 131 | } 132 | 133 | #[inline] 134 | fn iter(&self) -> Self::Iter<'_> { 135 | Iter::new(self) 136 | } 137 | } 138 | 139 | impl BucketBackend 140 | where 141 | S: Symbol, 142 | { 143 | /// Returns the next available symbol. 144 | fn next_symbol(&self) -> S { 145 | expect_valid_symbol(self.spans.len()) 146 | } 147 | 148 | /// Pushes the given interned string into the spans and returns its symbol. 149 | fn push_span(&mut self, interned: InternedStr) -> S { 150 | let symbol = self.next_symbol(); 151 | self.spans.push(interned); 152 | symbol 153 | } 154 | 155 | /// Interns a new string into the backend and returns a reference to it. 156 | unsafe fn alloc(&mut self, string: &str) -> InternedStr { 157 | let cap = self.head.capacity(); 158 | if cap < self.head.len() + string.len() { 159 | let new_cap = (usize::max(cap, string.len()) + 1).next_power_of_two(); 160 | let new_head = FixedString::with_capacity(new_cap); 161 | let old_head = core::mem::replace(&mut self.head, new_head); 162 | self.full.push(old_head.finish()); 163 | } 164 | self.head 165 | .push_str(string) 166 | .expect("encountered invalid head capacity (2)") 167 | } 168 | } 169 | 170 | impl Clone for BucketBackend { 171 | fn clone(&self) -> Self { 172 | // For performance reasons we copy all cloned strings into a single cloned 173 | // head string leaving the cloned `full` empty. 
174 | let new_head_cap = 175 | self.head.capacity() + self.full.iter().fold(0, |lhs, rhs| lhs + rhs.len()); 176 | let mut head = FixedString::with_capacity(new_head_cap); 177 | let mut spans = Vec::with_capacity(self.spans.len()); 178 | for span in &self.spans { 179 | let string = span.as_str(); 180 | let interned = head 181 | .push_str(string) 182 | .expect("encountered invalid head capacity"); 183 | spans.push(interned); 184 | } 185 | Self { 186 | spans, 187 | head, 188 | full: Vec::new(), 189 | marker: Default::default(), 190 | } 191 | } 192 | } 193 | 194 | impl Eq for BucketBackend where S: Symbol {} 195 | 196 | impl PartialEq for BucketBackend 197 | where 198 | S: Symbol, 199 | { 200 | #[cfg_attr(feature = "inline-more", inline)] 201 | fn eq(&self, other: &Self) -> bool { 202 | self.spans == other.spans 203 | } 204 | } 205 | 206 | impl<'a, S> IntoIterator for &'a BucketBackend 207 | where 208 | S: Symbol, 209 | { 210 | type Item = (S, &'a str); 211 | type IntoIter = Iter<'a, S>; 212 | 213 | #[cfg_attr(feature = "inline-more", inline)] 214 | fn into_iter(self) -> Self::IntoIter { 215 | self.iter() 216 | } 217 | } 218 | 219 | pub struct Iter<'a, S> { 220 | iter: Enumerate>, 221 | symbol_marker: PhantomData S>, 222 | } 223 | 224 | impl<'a, S> Iter<'a, S> { 225 | #[cfg_attr(feature = "inline-more", inline)] 226 | pub fn new(backend: &'a BucketBackend) -> Self { 227 | Self { 228 | iter: backend.spans.iter().enumerate(), 229 | symbol_marker: Default::default(), 230 | } 231 | } 232 | } 233 | 234 | impl<'a, S> Iterator for Iter<'a, S> 235 | where 236 | S: Symbol, 237 | { 238 | type Item = (S, &'a str); 239 | 240 | #[inline] 241 | fn size_hint(&self) -> (usize, Option) { 242 | self.iter.size_hint() 243 | } 244 | 245 | #[inline] 246 | fn next(&mut self) -> Option { 247 | self.iter 248 | .next() 249 | .map(|(id, interned)| (expect_valid_symbol(id), interned.as_str())) 250 | } 251 | } 252 | -------------------------------------------------------------------------------- 
/benches/bench.rs: -------------------------------------------------------------------------------- 1 | mod setup; 2 | 3 | use self::setup::{ 4 | generate_test_strings, 5 | BackendBenchmark, 6 | BenchBucket, 7 | BenchBuffer, 8 | BenchString, 9 | BENCH_LEN_STRINGS, 10 | BENCH_STRING_LEN, 11 | }; 12 | use criterion::{ 13 | black_box, 14 | criterion_group, 15 | criterion_main, 16 | measurement::WallTime, 17 | BatchSize, 18 | BenchmarkGroup, 19 | Criterion, 20 | Throughput, 21 | }; 22 | use string_interner::backend::Backend; 23 | 24 | criterion_group!( 25 | bench_resolve, 26 | bench_resolve_already_filled, 27 | bench_resolve_unchecked_already_filled 28 | ); 29 | criterion_group!(bench_get, bench_get_already_filled); 30 | criterion_group!(bench_iter, bench_iter_already_filled); 31 | criterion_group!( 32 | bench_get_or_intern, 33 | bench_get_or_intern_fill, 34 | bench_get_or_intern_fill_with_capacity, 35 | bench_get_or_intern_already_filled, 36 | bench_get_or_intern_static, 37 | ); 38 | criterion_main!(bench_get_or_intern, bench_resolve, bench_get, bench_iter); 39 | 40 | fn bench_get_or_intern_static(c: &mut Criterion) { 41 | let mut g = c.benchmark_group("get_or_intern_static"); 42 | fn bench_for_backend(g: &mut BenchmarkGroup) { 43 | #[rustfmt::skip] 44 | let static_strings = &[ 45 | "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", 46 | "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", 47 | "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 48 | "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", 49 | ]; 50 | g.throughput(Throughput::Elements(static_strings.len() as u64)); 51 | g.bench_with_input( 52 | format!("{}/{}", BB::NAME, "get_or_intern"), 53 | static_strings, 54 | |bencher, words| { 55 | bencher.iter_batched_ref( 56 | || BB::setup(), 57 | |interner| { 58 | for word in words.iter().copied() { 59 | black_box(interner.get_or_intern(word)); 60 | } 61 | }, 62 | BatchSize::SmallInput, 63 | ) 64 | }, 65 | ); 66 
| g.bench_with_input( 67 | format!("{}/{}", BB::NAME, "get_or_intern_static"), 68 | static_strings, 69 | |bencher, words| { 70 | bencher.iter_batched_ref( 71 | || BB::setup(), 72 | |interner| { 73 | for word in words.iter().copied() { 74 | black_box(interner.get_or_intern_static(word)); 75 | } 76 | }, 77 | BatchSize::SmallInput, 78 | ) 79 | }, 80 | ); 81 | } 82 | bench_for_backend::(&mut g); 83 | bench_for_backend::(&mut g); 84 | bench_for_backend::(&mut g); 85 | } 86 | 87 | fn bench_get_or_intern_fill_with_capacity(c: &mut Criterion) { 88 | let mut g = c.benchmark_group("get_or_intern/fill-empty/with_capacity"); 89 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 90 | fn bench_for_backend(g: &mut BenchmarkGroup) { 91 | g.bench_with_input( 92 | BB::NAME, 93 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 94 | |bencher, &(len_words, word_len)| { 95 | let words = generate_test_strings(len_words, word_len); 96 | bencher.iter_batched_ref( 97 | || BB::setup_with_capacity(BENCH_LEN_STRINGS), 98 | |interner| { 99 | for word in &words { 100 | black_box(interner.get_or_intern(word)); 101 | } 102 | }, 103 | BatchSize::SmallInput, 104 | ) 105 | }, 106 | ); 107 | } 108 | bench_for_backend::(&mut g); 109 | bench_for_backend::(&mut g); 110 | bench_for_backend::(&mut g); 111 | } 112 | 113 | fn bench_get_or_intern_fill(c: &mut Criterion) { 114 | let mut g = c.benchmark_group("get_or_intern/fill-empty/new"); 115 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 116 | fn bench_for_backend(g: &mut BenchmarkGroup) { 117 | g.bench_with_input( 118 | BB::NAME, 119 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 120 | |bencher, &(len_words, word_len)| { 121 | let words = generate_test_strings(len_words, word_len); 122 | bencher.iter_batched_ref( 123 | || BB::setup(), 124 | |interner| { 125 | for word in &words { 126 | black_box(interner.get_or_intern(word)); 127 | } 128 | }, 129 | BatchSize::SmallInput, 130 | ) 131 | }, 132 | ); 133 | } 134 | bench_for_backend::(&mut g); 
135 | bench_for_backend::(&mut g); 136 | bench_for_backend::(&mut g); 137 | } 138 | 139 | fn bench_get_or_intern_already_filled(c: &mut Criterion) { 140 | let mut g = c.benchmark_group("get_or_intern/already-filled"); 141 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 142 | fn bench_for_backend(g: &mut BenchmarkGroup) { 143 | g.bench_with_input( 144 | BB::NAME, 145 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 146 | |bencher, &(len_words, word_len)| { 147 | let words = generate_test_strings(len_words, word_len); 148 | bencher.iter_batched_ref( 149 | || BB::setup_filled(&words), 150 | |interner| { 151 | for word in &words { 152 | black_box(interner.get_or_intern(word)); 153 | } 154 | }, 155 | BatchSize::SmallInput, 156 | ) 157 | }, 158 | ); 159 | } 160 | bench_for_backend::(&mut g); 161 | bench_for_backend::(&mut g); 162 | bench_for_backend::(&mut g); 163 | } 164 | 165 | fn bench_resolve_already_filled(c: &mut Criterion) { 166 | let mut g = c.benchmark_group("resolve/already-filled"); 167 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 168 | fn bench_for_backend(g: &mut BenchmarkGroup) { 169 | g.bench_with_input( 170 | BB::NAME, 171 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 172 | |bencher, &(len_words, word_len)| { 173 | let words = generate_test_strings(len_words, word_len); 174 | bencher.iter_batched_ref( 175 | || BB::setup_filled_with_ids(&words), 176 | |(interner, word_ids)| { 177 | for &word_id in &*word_ids { 178 | black_box(interner.resolve(word_id)); 179 | } 180 | }, 181 | BatchSize::SmallInput, 182 | ) 183 | }, 184 | ); 185 | } 186 | bench_for_backend::(&mut g); 187 | bench_for_backend::(&mut g); 188 | bench_for_backend::(&mut g); 189 | } 190 | 191 | fn bench_resolve_unchecked_already_filled(c: &mut Criterion) { 192 | let mut g = c.benchmark_group("resolve_unchecked/already-filled"); 193 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 194 | fn bench_for_backend(g: &mut BenchmarkGroup) { 195 | g.bench_with_input( 
196 | BB::NAME, 197 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 198 | |bencher, &(len_words, word_len)| { 199 | let words = generate_test_strings(len_words, word_len); 200 | bencher.iter_batched_ref( 201 | || BB::setup_filled_with_ids(&words), 202 | |(interner, word_ids)| { 203 | for &word_id in &*word_ids { 204 | black_box( 205 | // SAFETY: We provide only valid symbols to the tested interners. 206 | unsafe { interner.resolve_unchecked(word_id) }, 207 | ); 208 | } 209 | }, 210 | BatchSize::SmallInput, 211 | ) 212 | }, 213 | ); 214 | } 215 | bench_for_backend::(&mut g); 216 | bench_for_backend::(&mut g); 217 | bench_for_backend::(&mut g); 218 | } 219 | 220 | fn bench_get_already_filled(c: &mut Criterion) { 221 | let mut g = c.benchmark_group("get/already-filled"); 222 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 223 | fn bench_for_backend(g: &mut BenchmarkGroup) { 224 | g.bench_with_input( 225 | BB::NAME, 226 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 227 | |bencher, &(len_words, word_len)| { 228 | let words = generate_test_strings(len_words, word_len); 229 | bencher.iter_batched_ref( 230 | || BB::setup_filled(&words), 231 | |interner| { 232 | for word in &words { 233 | black_box(interner.get(word)); 234 | } 235 | }, 236 | BatchSize::SmallInput, 237 | ) 238 | }, 239 | ); 240 | } 241 | bench_for_backend::(&mut g); 242 | bench_for_backend::(&mut g); 243 | bench_for_backend::(&mut g); 244 | } 245 | 246 | fn bench_iter_already_filled(c: &mut Criterion) { 247 | let mut g = c.benchmark_group("iter/already-filled"); 248 | g.throughput(Throughput::Elements(BENCH_LEN_STRINGS as u64)); 249 | fn bench_for_backend(g: &mut BenchmarkGroup) 250 | where 251 | for<'a> &'a ::Backend: IntoIterator< 252 | Item = ( 253 | <::Backend as Backend>::Symbol, 254 | &'a str, 255 | ), 256 | >, 257 | { 258 | g.bench_with_input( 259 | BB::NAME, 260 | &(BENCH_LEN_STRINGS, BENCH_STRING_LEN), 261 | |bencher, &(len_words, word_len)| { 262 | let words = generate_test_strings(len_words, 
word_len); 263 | bencher.iter_batched_ref( 264 | || BB::setup_filled(&words), 265 | |interner| { 266 | for word in &*interner { 267 | black_box(word); 268 | } 269 | }, 270 | BatchSize::SmallInput, 271 | ) 272 | }, 273 | ); 274 | } 275 | bench_for_backend::(&mut g); 276 | bench_for_backend::(&mut g); 277 | bench_for_backend::(&mut g); 278 | } 279 | -------------------------------------------------------------------------------- /src/interner.rs: -------------------------------------------------------------------------------- 1 | use crate::{backend::Backend, Symbol}; 2 | use core::{ 3 | fmt, 4 | fmt::{Debug, Formatter}, 5 | hash::{BuildHasher, Hash, Hasher}, 6 | iter::FromIterator, 7 | }; 8 | use hashbrown::{DefaultHashBuilder, HashMap}; 9 | 10 | /// Creates the `u64` hash value for the given value using the given hash builder. 11 | fn make_hash(builder: &impl BuildHasher, value: &T) -> u64 12 | where 13 | T: ?Sized + Hash, 14 | { 15 | let state = &mut builder.build_hasher(); 16 | value.hash(state); 17 | state.finish() 18 | } 19 | 20 | /// Data structure to intern and resolve strings. 21 | /// 22 | /// Caches strings efficiently, with minimal memory footprint and associates them with unique symbols. 23 | /// These symbols allow constant time comparisons and look-ups to the underlying interned strings. 24 | /// 25 | /// The following API covers the main functionality: 26 | /// 27 | /// - [`StringInterner::get_or_intern`]: To intern a new string. 28 | /// - This maps from `string` type to `symbol` type. 29 | /// - [`StringInterner::resolve`]: To resolve your already interned strings. 30 | /// - This maps from `symbol` type to `string` type. 
31 | pub struct StringInterner 32 | where 33 | B: Backend, 34 | { 35 | dedup: HashMap<::Symbol, (), ()>, 36 | hasher: H, 37 | backend: B, 38 | } 39 | 40 | impl Debug for StringInterner 41 | where 42 | B: Backend + Debug, 43 | ::Symbol: Symbol + Debug, 44 | H: BuildHasher, 45 | { 46 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { 47 | f.debug_struct("StringInterner") 48 | .field("dedup", &self.dedup) 49 | .field("backend", &self.backend) 50 | .finish() 51 | } 52 | } 53 | 54 | #[cfg(feature = "backends")] 55 | impl Default for StringInterner { 56 | #[cfg_attr(feature = "inline-more", inline)] 57 | fn default() -> Self { 58 | StringInterner::new() 59 | } 60 | } 61 | 62 | impl Clone for StringInterner 63 | where 64 | B: Backend + Clone, 65 | ::Symbol: Symbol, 66 | H: BuildHasher + Clone, 67 | { 68 | fn clone(&self) -> Self { 69 | Self { 70 | dedup: self.dedup.clone(), 71 | hasher: self.hasher.clone(), 72 | backend: self.backend.clone(), 73 | } 74 | } 75 | } 76 | 77 | impl PartialEq for StringInterner 78 | where 79 | B: Backend + PartialEq, 80 | ::Symbol: Symbol, 81 | H: BuildHasher, 82 | { 83 | fn eq(&self, rhs: &Self) -> bool { 84 | self.len() == rhs.len() && self.backend == rhs.backend 85 | } 86 | } 87 | 88 | impl Eq for StringInterner 89 | where 90 | B: Backend + Eq, 91 | ::Symbol: Symbol, 92 | H: BuildHasher, 93 | { 94 | } 95 | 96 | impl StringInterner 97 | where 98 | B: Backend, 99 | ::Symbol: Symbol, 100 | H: BuildHasher + Default, 101 | { 102 | /// Creates a new empty `StringInterner`. 103 | #[cfg_attr(feature = "inline-more", inline)] 104 | pub fn new() -> Self { 105 | Self { 106 | dedup: HashMap::default(), 107 | hasher: Default::default(), 108 | backend: B::default(), 109 | } 110 | } 111 | 112 | /// Creates a new `StringInterner` with the given initial capacity. 
113 | #[cfg_attr(feature = "inline-more", inline)] 114 | pub fn with_capacity(cap: usize) -> Self { 115 | Self { 116 | dedup: HashMap::with_capacity_and_hasher(cap, ()), 117 | hasher: Default::default(), 118 | backend: B::with_capacity(cap), 119 | } 120 | } 121 | } 122 | 123 | impl StringInterner 124 | where 125 | B: Backend, 126 | ::Symbol: Symbol, 127 | H: BuildHasher, 128 | { 129 | /// Creates a new empty `StringInterner` with the given hasher. 130 | #[cfg_attr(feature = "inline-more", inline)] 131 | pub fn with_hasher(hash_builder: H) -> Self { 132 | StringInterner { 133 | dedup: HashMap::default(), 134 | hasher: hash_builder, 135 | backend: B::default(), 136 | } 137 | } 138 | 139 | /// Creates a new empty `StringInterner` with the given initial capacity and the given hasher. 140 | #[cfg_attr(feature = "inline-more", inline)] 141 | pub fn with_capacity_and_hasher(cap: usize, hash_builder: H) -> Self { 142 | StringInterner { 143 | dedup: HashMap::with_capacity_and_hasher(cap, ()), 144 | hasher: hash_builder, 145 | backend: B::with_capacity(cap), 146 | } 147 | } 148 | 149 | /// Returns the number of strings interned by the interner. 150 | #[cfg_attr(feature = "inline-more", inline)] 151 | pub fn len(&self) -> usize { 152 | self.dedup.len() 153 | } 154 | 155 | /// Returns `true` if the string interner has no interned strings. 156 | #[cfg_attr(feature = "inline-more", inline)] 157 | pub fn is_empty(&self) -> bool { 158 | self.len() == 0 159 | } 160 | 161 | /// Returns the symbol for the given string if any. 162 | /// 163 | /// Can be used to query if a string has already been interned without interning. 
164 | #[inline] 165 | pub fn get(&self, string: T) -> Option<::Symbol> 166 | where 167 | T: AsRef, 168 | { 169 | let string = string.as_ref(); 170 | let Self { 171 | dedup, 172 | hasher, 173 | backend, 174 | } = self; 175 | let hash = make_hash(hasher, string); 176 | dedup 177 | .raw_entry() 178 | .from_hash(hash, |symbol| { 179 | // SAFETY: This is safe because we only operate on symbols that 180 | // we receive from our backend making them valid. 181 | string == unsafe { backend.resolve_unchecked(*symbol) } 182 | }) 183 | .map(|(&symbol, &())| symbol) 184 | } 185 | 186 | /// Interns the given string. 187 | /// 188 | /// This is used as backend by [`get_or_intern`][1] and [`get_or_intern_static`][2]. 189 | /// 190 | /// [1]: StringInterner::get_or_intern 191 | /// [2]: StringInterner::get_or_intern_static 192 | #[cfg_attr(feature = "inline-more", inline)] 193 | fn get_or_intern_using( 194 | &mut self, 195 | string: T, 196 | intern_fn: fn(&mut B, T) -> ::Symbol, 197 | ) -> ::Symbol 198 | where 199 | T: Copy + Hash + AsRef + for<'a> PartialEq<&'a str>, 200 | { 201 | let Self { 202 | dedup, 203 | hasher, 204 | backend, 205 | } = self; 206 | let hash = make_hash(hasher, string.as_ref()); 207 | let entry = dedup.raw_entry_mut().from_hash(hash, |symbol| { 208 | // SAFETY: This is safe because we only operate on symbols that 209 | // we receive from our backend making them valid. 210 | string == unsafe { backend.resolve_unchecked(*symbol) } 211 | }); 212 | use hashbrown::hash_map::RawEntryMut; 213 | let (&mut symbol, &mut ()) = match entry { 214 | RawEntryMut::Occupied(occupied) => occupied.into_key_value(), 215 | RawEntryMut::Vacant(vacant) => { 216 | let symbol = intern_fn(backend, string); 217 | vacant.insert_with_hasher(hash, symbol, (), |symbol| { 218 | // SAFETY: This is safe because we only operate on symbols that 219 | // we receive from our backend making them valid. 
220 | let string = unsafe { backend.resolve_unchecked(*symbol) }; 221 | make_hash(hasher, string) 222 | }) 223 | } 224 | }; 225 | symbol 226 | } 227 | 228 | /// Interns the given string. 229 | /// 230 | /// Returns a symbol for resolution into the original string. 231 | /// 232 | /// # Panics 233 | /// 234 | /// If the interner already interns the maximum number of strings possible 235 | /// by the chosen symbol type. 236 | #[inline] 237 | pub fn get_or_intern(&mut self, string: T) -> ::Symbol 238 | where 239 | T: AsRef, 240 | { 241 | self.get_or_intern_using(string.as_ref(), B::intern) 242 | } 243 | 244 | /// Interns the given `'static` string. 245 | /// 246 | /// Returns a symbol for resolution into the original string. 247 | /// 248 | /// # Note 249 | /// 250 | /// This is more efficient than [`StringInterner::get_or_intern`] since it might 251 | /// avoid some memory allocations if the backend supports this. 252 | /// 253 | /// # Panics 254 | /// 255 | /// If the interner already interns the maximum number of strings possible 256 | /// by the chosen symbol type. 257 | #[inline] 258 | pub fn get_or_intern_static(&mut self, string: &'static str) -> ::Symbol { 259 | self.get_or_intern_using(string, B::intern_static) 260 | } 261 | 262 | /// Shrink backend capacity to fit the interned strings exactly. 263 | pub fn shrink_to_fit(&mut self) { 264 | self.backend.shrink_to_fit() 265 | } 266 | 267 | /// Returns the string for the given `symbol` if any. 268 | #[inline] 269 | pub fn resolve(&self, symbol: ::Symbol) -> Option<&str> { 270 | self.backend.resolve(symbol) 271 | } 272 | 273 | /// Returns the string for the given `symbol` without performing any checks. 274 | /// 275 | /// # Safety 276 | /// 277 | /// It is the caller's responsibility to provide this method with `symbol`s 278 | /// that are valid for the [`StringInterner`]. 
279 | #[inline] 280 | pub unsafe fn resolve_unchecked(&self, symbol: ::Symbol) -> &str { 281 | unsafe { self.backend.resolve_unchecked(symbol) } 282 | } 283 | 284 | /// Returns an iterator that yields all interned strings and their symbols. 285 | #[inline] 286 | pub fn iter(&self) -> ::Iter<'_> { 287 | self.backend.iter() 288 | } 289 | } 290 | 291 | impl FromIterator for StringInterner 292 | where 293 | B: Backend, 294 | ::Symbol: Symbol, 295 | H: BuildHasher + Default, 296 | T: AsRef, 297 | { 298 | fn from_iter(iter: I) -> Self 299 | where 300 | I: IntoIterator, 301 | { 302 | let iter = iter.into_iter(); 303 | let (capacity, _) = iter.size_hint(); 304 | let mut interner = Self::with_capacity(capacity); 305 | interner.extend(iter); 306 | interner 307 | } 308 | } 309 | 310 | impl Extend for StringInterner 311 | where 312 | B: Backend, 313 | ::Symbol: Symbol, 314 | H: BuildHasher, 315 | T: AsRef, 316 | { 317 | fn extend(&mut self, iter: I) 318 | where 319 | I: IntoIterator, 320 | { 321 | for s in iter { 322 | self.get_or_intern(s.as_ref()); 323 | } 324 | } 325 | } 326 | 327 | impl<'a, B, H> IntoIterator for &'a StringInterner 328 | where 329 | B: Backend, 330 | H: BuildHasher, 331 | { 332 | type Item = (::Symbol, &'a str); 333 | type IntoIter = ::Iter<'a>; 334 | 335 | #[cfg_attr(feature = "inline-more", inline)] 336 | fn into_iter(self) -> Self::IntoIter { 337 | self.backend.iter() 338 | } 339 | } 340 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | # Release Notes 2 | 3 | ## 0.19.0 - 2025/02/11 4 | 5 | ## Fixed 6 | 7 | - Fixed the `IntoIterator` impl that triggered a Rust compiler miscompilation in some versions. [#80] 8 | - See the associated [`rustc` issue](https://github.com/rust-lang/rust/issues/136856). 9 | 10 | ## Improvements 11 | 12 | - Updated and improved documentation about backend properties. 
[#83] [#84] 13 | - Update outdated dependencies. [#82] 14 | - Added `serde::{Serialize,Deserialize}` impls for all crate defined symbol types. [#86] 15 | 16 | ## Internal 17 | 18 | - Improve and modernize GitHub Actions CI. [#81] 19 | 20 | [#80]: https://github.com/Robbepop/string-interner/pull/80 21 | [#81]: https://github.com/Robbepop/string-interner/pull/81 22 | [#82]: https://github.com/Robbepop/string-interner/pull/82 23 | [#83]: https://github.com/Robbepop/string-interner/pull/83 24 | [#84]: https://github.com/Robbepop/string-interner/pull/84 25 | [#86]: https://github.com/Robbepop/string-interner/pull/86 26 | 27 | ## 0.18.0 - 2024/11/12 28 | 29 | ## Fixed 30 | 31 | - The `serde` crate feature is no longer enabled via `std` crate feature. [#73] 32 | 33 | ## Removed 34 | 35 | - Removed the unused `cfg-if` dependency. [#73] 36 | 37 | ## Changed 38 | 39 | - Updated `hashbrown` dependency to `0.15.1`. [#73] 40 | 41 | ## Internal 42 | 43 | - Fixed many `clippy` and `formatting` issues. [#73] 44 | 45 | [#73]: https://github.com/Robbepop/string-interner/pull/73 46 | 47 | ## 0.17.0 - 2024/05/01 48 | 49 | ## Added 50 | 51 | - Added `StringInterner::resolve_unchecked` method. (https://github.com/Robbepop/string-interner/pull/68) 52 | 53 | ## Fixed 54 | 55 | - Fixed soundness issue in `BufferBackend::resolve`. (https://github.com/Robbepop/string-interner/pull/68) 56 | 57 | ## 0.16.0 - 2024/05/01 58 | 59 | ## Added 60 | 61 | - Added `StringInterner::iter` method. (https://github.com/Robbepop/string-interner/pull/65) 62 | 63 | ## Changed 64 | 65 | - Optimized `BufferBackend::{resolve, iter}` methods. (https://github.com/Robbepop/string-interner/pull/64) 66 | 67 | ## Fixed 68 | 69 | - Fixed unsoundness issue in `BucketBackend`. (https://github.com/Robbepop/string-interner/pull/66) 70 | 71 | ## Removed 72 | 73 | - Removed `SimpleBackend` since it served no real purpose. 
(https://github.com/Robbepop/string-interner/commit/549db6c2efeac5acb5e8084e69fa22891ae14019) 74 | 75 | ## 0.15.0 - 2024/02/08 76 | 77 | ## Changed 78 | 79 | - Update to `hashbrown` version `0.14.0`. (https://github.com/Robbepop/string-interner/pull/58) 80 | - Improve `no_std` support. (https://github.com/Robbepop/string-interner/pull/44) 81 | - Fix bug in `BufferBackend::with_capacity` method. (https://github.com/Robbepop/string-interner/pull/54) 82 | 83 | ## 0.14.0 - 2021/10/27 84 | 85 | ## Added 86 | 87 | - Added the new `BufferBackend` string interner backend. 88 | - This backend focuses on minimum memory consumption and allocations 89 | at the costs of decreased symbol resolution performance. 90 | - Use this when memory consumption is your main concern. 91 | - Added example of how to use a different string interner backend or symbol. 92 | - Added library docs comparing all the different string interner backends. 93 | 94 | ## Changed 95 | 96 | - The `string_interner` crate now uses the Rust 2021 edition. 97 | - The `DefaultBackend` now is the `StringBackend` and no longer the `BucketBackend`. 98 | - The generic `S` symbol parameter of all string interner backends 99 | now defaults to the `DefaultSymbol`. 100 | - The `Backend` trait is no longer generic over a symbol `S` but instead 101 | has a `Symbol` associated type now. 102 | - The `StringInterner` no longer has a generic `S` symbol parameter and 103 | now instead uses the `Symbol` associated type from its used backend `B`. 104 | 105 | ## Dev. Note 106 | 107 | - The `memory_consumption` tests now shrink the string interners before querying 108 | their memory consumption. This yields more stable numbers than before. 109 | - The `memory_consumption` test now also tests the total amount of allocations 110 | and deallocations made by the string interner backends. 111 | - Add `README` section about benchmarking the crate. 
112 | 113 | ## 0.13.0 - 2021/08/25 114 | 115 | - Update `hashbrown` dependency from version `0.9` to version `0.11`. 116 | - Add `shrink_to_fit` method to `StringInterner` via backend. (#36) 117 | - Add support for more than 4G of interned strings with `StringBackend`. (#37) 118 | - Remove `S: Symbol` trait bound from interner backends. 119 | - Remove `S: Symbol` trait bound from `Clone impl` for `StringBackend`. 120 | 121 | - Reworked the memory and allocation tests 122 | - Run them via `cargo test -- --test-threads 1` 123 | - CI now tests the whole build for windows, linux (ubuntu) and macos. 124 | - Add `cargo-audit` and `cargo-outdated` checks to CI pipeline. 125 | - Remove no longer needed `jemalloc` `dev-dependency`. 126 | 127 | ## 0.12.2 - 2021/01/11 128 | 129 | - Ensure cloned `StringInterner` can still look up the same symbols. 130 | [#34](https://github.com/Robbepop/string-interner/pull/34) (Thanks @alamb) 131 | - This requires `BuildHasher: Clone` trait bound for `StringInterner`'s `Clone` impl. 132 | 133 | ## 0.12.1 - 2020/11/14 134 | 135 | - The `BucketBackend` now implements `Send` + `Sync`. 136 | - Implemented some minor internal improvements. 137 | - Update dependencies: 138 | - `hashbrown 0.8` -> `0.9` 139 | - `cfg-if 0.1` -> `1.0` 140 | 141 | ## 0.12.0 - 2020/07/15 142 | 143 | - Make `DefaultBackend` generic over its symbol type. 144 | - This simplifies type ascription of string interners that do not use the 145 | default symbol type. 146 | - E.g. `StringInterner` is now possible to write (again). 147 | - Add `backends` crate feature. 148 | - Enabled by default. 149 | - Disable this if you do not use any of the backends provided by the 150 | `string-interner` crate. 151 | 152 | ## 0.11.3 - 2020/07/15 153 | 154 | - Add `Symbol` implementation for `usize`. 155 | 156 | ## 0.11.2 - 2020/07/15 157 | 158 | - Add new `StringBackend` that is optimized for memory allocations and footprint. 159 | - Use it if your memory constraints are most important to you. 
160 | 161 | ## 0.11.1 - 2020/07/14 162 | 163 | Special thanks to [Ten0](https://github.com/Ten0) for help with this release! 164 | 165 | - Remove usage of `unsafe` in `Symbol::try_from_usize` methods. 166 | - Remove no longer correct `unsafe impls` for `Send` and `Sync` for `StringInterner`. 167 | - Add new crate feature `more-inline` that puts more `#[inline]` on public methods. 168 | - The new `more-inline` crate feature is enabled by default. If you want to 169 | turn it off use `--no-default-features`. 170 | - Enabling `more-inline` also enables `hashbrown/more-inline`. 171 | - Remove `&B: IntoIter` trait bound from `Clone` impl of `StringInterner` 172 | 173 | ## 0.11.0 - 2020/07/14 174 | 175 | Thanks a lot (again) to [CAD97](https://dev.to/cad97) who is the vanguard 176 | of the technical improvements in this release with their 177 | [blog post](https://dev.to/cad97/string-interners-in-rust-797). 178 | 179 | - Significantly improved `StringInterner`'s memory consumption independent 180 | from the used internment backend. 181 | - Significantly improved `StringInterner`'s throughput for interning strings. 182 | - Change the `Backend` trait: 183 | - `intern` is no longer `unsafe` 184 | - `intern` returns `S` (symbol type) instead of `(InternedStr, S)` 185 | - same as above for `intern_static` 186 | - add `unsafe fn resolve_unchecked` which does the same as `resolve` 187 | but explicitly without bounds checking 188 | - No longer export `backend::InternedStr` type 189 | - Make `hashbrown` a mandatory dependency. 190 | **Note:** We depend on it for the moment for its `raw_entry` API that has not yet been 191 | stabilized for Rust. Also benchmarks show that it is 20-30% faster than Rust's 192 | hashmap implementation. 193 | - Benchmarks now show performance when using `FxBuildHasher` as build hasher. 194 | 195 | ## 0.10.1 - 2020/07/14 196 | 197 | - Allow to intern `&'static str` using `get_or_intern_static` API. 
198 | - This is a common use case and more efficient since the interner can 199 | skip some allocations in this special case. 200 | - Fix bug in `SymbolU16` and `SymbolU32` where instantiating them with values 201 | greater than or equal to `u16::MAX` or `u32::MAX` respectively caused them to 202 | panic instead of returning `None`. 203 | - Special thanks to [Ten0](https://github.com/Ten0) for reporting the issue! 204 | - Add a bunch of additional unit tests to further solidify the implementation. 205 | 206 | ## 0.10.0 - 2020/07/13 207 | 208 | Special thanks to [CAD97](https://dev.to/cad97) who motivated me to craft this 209 | release through [their blog post](https://dev.to/cad97/string-interners-in-rust-797) 210 | "String interners in Rust". 211 | Also I want to thank [matklad](https://github.com/matklad) who wrote a nice 212 | [blog post](https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html) 213 | that inspired the design of the new `BucketBackend` for `StringInterner`. 214 | 215 | - Implement pluggable backends for `StringInterner`. 216 | Uses the new `BucketBackend` by default which results in significant 217 | performance boosts and lower memory consumption as well as fewer overall 218 | memory allocations. 219 | 220 | This makes it possible for dependencies to alter the behavior of internment. 221 | The `string-interner` crate comes with 2 predefined backends: 222 | 1. `SimpleBackend`: Which is how the `StringInterner` of previous versions 223 | worked by default. It performs one allocation per interned string. 224 | 2. `BucketBackend`: Tries to minimize memory allocations and packs 225 | interned strings densely. This is the new default behavior for this crate. 
226 | - Due to the above introduction of backends some APIs have been removed: 227 | - `reserve` 228 | - `capacity` 229 | - the entire `iter` module 230 | - Note: Simple iteration through the `StringInterner`'s interned strings 231 | and their symbols is still possible if the used backend supports 232 | iteration. 233 | - `resolve_unchecked`: Has no replacement yet, but might be reintroduced 234 | in future versions. 235 | - `shrink_to_fit`: The API design was never really a good fit for interners. 236 | 237 | ## 0.9.0 - 2020/07/12 238 | 239 | - Remove `Ord` trait bound from `Symbol` trait 240 | - Also change `Symbol::from_usize(usize) -> Self` to `Symbol::try_from_usize(usize) -> Option<Self>` 241 | - Minor performance improvements for `DefaultSymbol::try_from_usize` 242 | - Put all iterator types into the `iter` sub module 243 | - Put all symbol types into the `symbol` sub module 244 | - Add new symbol types: 245 | - `SymbolU16`: 16-bit wide symbol 246 | - `SymbolU32`: 32-bit wide symbol (default) 247 | - `SymbolUsize`: same size as `usize` 248 | - Various internal improvements and reorganizations 249 | 250 | ## 0.8.0 - 2020/07/12 251 | 252 | - Make it possible to use this crate in `no_std` environments 253 | - Use the new `hashbrown` crate feature together with `no_std` 254 | - Rename `Sym` to `DefaultSymbol` 255 | - Add `IntoIterator` impl for `&StringInterner` 256 | - Add some `#[inline]` annotations which improve performance for queries 257 | - Various internal improvements (uses `Pin` self-referentials now) 258 | 259 | ## 0.7.1 - 2019/09/01 260 | 261 | - **CRITICAL** fix use after free bug in `StringInterner::clone()` 262 | - implement `std::iter::Extend` for `StringInterner` 263 | - `Sym::from_usize` now avoids using `unsafe` code 264 | - optimize `FromIterator` impl of `StringInterner` 265 | - move to Rust 2018 edition 266 | 267 | Thanks [YOSHIOKA Takuma](https://github.com/lo48576) for implementing this release. 
268 | 269 | ## 0.7.0 - 2019/08/07 270 | 271 | - changed license from MIT to MIT/APACHE2.0 272 | - removed generic impl of `Symbol` for types that are `From` and `Into` 273 | - removed `StringInterner::clear` API since its usage breaks invariants 274 | - added `StringInterner::{capacity, reserve}` APIs 275 | - introduced a new default symbol type `Sym` that is a thin wrapper around `NonZeroU32` (idea by [koute][gh-user-koute]) 276 | - made `DefaultStringInterner` a type alias for the new `StringInterner` 277 | - added convenient `FromIterator` impl to `StringInterner` 278 | - dev 279 | - rewrote all unit tests (serde tests are still missing) 280 | - entirely refactored benchmark framework 281 | - added `html_root_url` to crate root 282 | 283 | Thanks [matklad][gh-user-madklad] for suggestions and impulses 284 | 285 | ## 0.6.4 - 2019/09/04 286 | 287 | - **CRITICAL:** fix use after free bug in `StringInterner::clone` implementation. 288 | 289 | ## 0.6.3 - 2017/09/20 290 | 291 | - fixed a bug that `StringInterner`'s `Send` impl didn't respect its generic `HashBuilder` parameter. Fixes GitHub [issue #4][gh-issue-4]. 292 | 293 | ## 0.6.2 - 2017/08/13 294 | 295 | - added `shrink_to_fit` public method to `StringInterner` - (by artemshein) 296 | 297 | ## 0.6.1 - 2017/07/31 298 | 299 | - fixed a bug that inserting non-owning string types (e.g. `str`) was broken due to dangling pointers (Thanks to artemshein for fixing it!) 
300 | 301 | ## 0.6.0 - 2017/07/09 302 | 303 | - added optional serde serialization and deserialization support 304 | - more efficient and generic `PartialEq` implementation for `StringInterner` 305 | - made `StringInterner` generic over `BuildHasher` to allow for custom hashers 306 | 307 | ## 0.5.0 - 2017/07/08 308 | 309 | - added `IntoIterator` trait implementation for `StringInterner` 310 | - greatly simplified iterator code 311 | 312 | ## 0.4.0 - 2017/05/20 313 | 314 | - removed restrictive constraint for `Unsigned` for `Symbol` 315 | 316 | ## 0.3.3 - 2017/02/27 317 | 318 | - added `Send` and `Sync` to `InternalStrRef` to make `StringInterner` itself `Send` and `Sync` 319 | 320 | ## 0.2.1 - 2017/02/10 321 | 322 | ## 0.2.0 - 2017/02/10 323 | 324 | ## 0.1.0 - 2017/02/06 325 | 326 | [gh-issue-4]: https://github.com/Robbepop/string-interner/issues/4 327 | 328 | [gh-user-koute]: https://github.com/koute 329 | [gh-user-madklad]: https://github.com/matklad 330 | -------------------------------------------------------------------------------- /src/backend/buffer.rs: -------------------------------------------------------------------------------- 1 | #![cfg(feature = "backends")] 2 | 3 | use super::Backend; 4 | use crate::{symbol::expect_valid_symbol, DefaultSymbol, Symbol}; 5 | use alloc::vec::Vec; 6 | use core::{marker::PhantomData, mem, str}; 7 | 8 | /// An interner backend that appends all interned string information in a single buffer. 9 | /// 10 | /// # Usage Hint 11 | /// 12 | /// Use this backend if memory consumption is what matters most to you. 13 | /// Note though that unlike all other backends symbol values are not contiguous! 14 | /// 15 | /// # Usage 16 | /// 17 | /// - **Fill:** Efficiency of filling an empty string interner. 18 | /// - **Resolve:** Efficiency of interned string look-up given a symbol. 19 | /// - **Allocations:** The number of allocations performed by the backend. 20 | /// - **Footprint:** The total heap memory consumed by the backend. 
21 | /// - **Contiguous:** True if the returned symbols have contiguous values. 22 | /// - **Iteration:** Efficiency of iterating over the interned strings. 23 | /// 24 | /// Rating varies between **bad**, **ok**, **good** and **best**. 25 | /// 26 | /// | Scenario | Rating | 27 | /// |:------------|:--------:| 28 | /// | Fill | **best** | 29 | /// | Resolve | **bad** | 30 | /// | Allocations | **best** | 31 | /// | Footprint | **best** | 32 | /// | Supports `get_or_intern_static` | **no** | 33 | /// | `Send` + `Sync` | **yes** | 34 | /// | Contiguous | **no** | 35 | /// | Iteration | **bad** | 36 | #[derive(Debug)] 37 | pub struct BufferBackend { 38 | len_strings: usize, 39 | buffer: Vec, 40 | marker: PhantomData S>, 41 | } 42 | 43 | impl PartialEq for BufferBackend 44 | where 45 | S: Symbol, 46 | { 47 | fn eq(&self, other: &Self) -> bool { 48 | self.len_strings.eq(&other.len_strings) && self.buffer.eq(&other.buffer) 49 | } 50 | } 51 | 52 | impl Eq for BufferBackend where S: Symbol {} 53 | 54 | impl Clone for BufferBackend { 55 | fn clone(&self) -> Self { 56 | Self { 57 | len_strings: self.len_strings, 58 | buffer: self.buffer.clone(), 59 | marker: Default::default(), 60 | } 61 | } 62 | } 63 | 64 | impl Default for BufferBackend { 65 | #[cfg_attr(feature = "inline-more", inline)] 66 | fn default() -> Self { 67 | Self { 68 | len_strings: 0, 69 | buffer: Default::default(), 70 | marker: Default::default(), 71 | } 72 | } 73 | } 74 | 75 | impl BufferBackend 76 | where 77 | S: Symbol, 78 | { 79 | /// Returns the next available symbol. 80 | #[inline] 81 | fn next_symbol(&self) -> S { 82 | expect_valid_symbol(self.buffer.len()) 83 | } 84 | 85 | /// Resolves the string for the given symbol if any. 86 | /// 87 | /// # Note 88 | /// 89 | /// Returns the string from the given index if any as well 90 | /// as the index of the next string in the buffer. 
91 | fn resolve_index_to_str(&self, index: usize) -> Option<(&[u8], usize)> { 92 | let bytes = self.buffer.get(index..)?; 93 | let (str_len, str_len_bytes) = decode_var_usize(bytes)?; 94 | let index_str = index + str_len_bytes; 95 | let str_bytes = self.buffer.get(index_str..index_str + str_len)?; 96 | Some((str_bytes, index_str + str_len)) 97 | } 98 | 99 | /// Resolves the string for the given symbol. 100 | /// 101 | /// # Note 102 | /// 103 | /// It is undefined behavior if the index does not resemble a string. 104 | /// 105 | /// # Safety 106 | /// 107 | /// The caller of the function has to ensure that calling this method 108 | /// is safe to do. 109 | unsafe fn resolve_index_to_str_unchecked(&self, index: usize) -> &str { 110 | // SAFETY: The function is marked unsafe so that the caller guarantees 111 | // that required invariants are checked. 112 | let bytes = unsafe { self.buffer.get_unchecked(index..) }; 113 | // SAFETY: The function is marked unsafe so that the caller guarantees 114 | // that required invariants are checked. 115 | let (str_len, str_len_bytes) = unsafe { decode_var_usize_unchecked(bytes) }; 116 | let index_str = index + str_len_bytes; 117 | let str_bytes = 118 | // SAFETY: The function is marked unsafe so that the caller guarantees 119 | // that required invariants are checked. 120 | unsafe { self.buffer.get_unchecked(index_str..index_str + str_len) }; 121 | // SAFETY: It is guaranteed by the backend that only valid strings 122 | // are stored in this portion of the buffer. 123 | unsafe { str::from_utf8_unchecked(str_bytes) } 124 | } 125 | 126 | /// Pushes the given value onto the buffer with `var7` encoding. 127 | /// 128 | /// Returns the amount of `var7` encoded bytes. 129 | #[inline] 130 | fn encode_var_usize(&mut self, value: usize) -> usize { 131 | encode_var_usize(&mut self.buffer, value) 132 | } 133 | 134 | /// Pushes the given string into the buffer and returns its span. 
135 | /// 136 | /// # Panics 137 | /// 138 | /// If the backend ran out of symbols. 139 | fn push_string(&mut self, string: &str) -> S { 140 | let symbol = self.next_symbol(); 141 | let str_len = string.len(); 142 | let str_bytes = string.as_bytes(); 143 | self.encode_var_usize(str_len); 144 | self.buffer.extend(str_bytes); 145 | self.len_strings += 1; 146 | symbol 147 | } 148 | } 149 | 150 | impl Backend for BufferBackend 151 | where 152 | S: Symbol, 153 | { 154 | type Symbol = S; 155 | type Iter<'a> 156 | = Iter<'a, S> 157 | where 158 | Self: 'a; 159 | 160 | #[cfg_attr(feature = "inline-more", inline)] 161 | fn with_capacity(capacity: usize) -> Self { 162 | /// We encode the `usize` string length into the buffer as well. 163 | const LEN_USIZE: usize = mem::size_of::(); 164 | /// According to google the approx. word length is 5. 165 | const DEFAULT_STR_LEN: usize = 5; 166 | let bytes_per_string = DEFAULT_STR_LEN + LEN_USIZE; 167 | Self { 168 | len_strings: 0, 169 | buffer: Vec::with_capacity(capacity * bytes_per_string), 170 | marker: Default::default(), 171 | } 172 | } 173 | 174 | #[inline] 175 | fn intern(&mut self, string: &str) -> Self::Symbol { 176 | self.push_string(string) 177 | } 178 | 179 | #[inline] 180 | fn resolve(&self, symbol: Self::Symbol) -> Option<&str> { 181 | match self.resolve_index_to_str(symbol.to_usize()) { 182 | None => None, 183 | Some((bytes, _)) => str::from_utf8(bytes).ok(), 184 | } 185 | } 186 | 187 | fn shrink_to_fit(&mut self) { 188 | self.buffer.shrink_to_fit(); 189 | } 190 | 191 | #[inline] 192 | unsafe fn resolve_unchecked(&self, symbol: Self::Symbol) -> &str { 193 | // SAFETY: The function is marked unsafe so that the caller guarantees 194 | // that required invariants are checked. 
195 | unsafe { self.resolve_index_to_str_unchecked(symbol.to_usize()) } 196 | } 197 | 198 | #[inline] 199 | fn iter(&self) -> Self::Iter<'_> { 200 | Iter::new(self) 201 | } 202 | } 203 | 204 | /// Encodes the value using variable length encoding into the buffer. 205 | /// 206 | /// Returns the amount of bytes used for the encoding. 207 | #[inline] 208 | fn encode_var_usize(buffer: &mut Vec, mut value: usize) -> usize { 209 | if value <= 0x7F { 210 | // Shortcut the common case for low value. 211 | buffer.push(value as u8); 212 | return 1; 213 | } 214 | let mut len_chunks = 0; 215 | loop { 216 | let mut chunk = (value as u8) & 0x7F_u8; 217 | value >>= 7; 218 | chunk |= ((value != 0) as u8) << 7; 219 | buffer.push(chunk); 220 | len_chunks += 1; 221 | if value == 0 { 222 | break; 223 | } 224 | } 225 | len_chunks 226 | } 227 | 228 | /// Decodes from a variable length encoded `usize` from the buffer. 229 | /// 230 | /// Returns the decoded value as first return value. 231 | /// Returns the number of decoded bytes as second return value. 232 | /// 233 | /// # Safety 234 | /// 235 | /// The caller has to make sure that the buffer contains the necessary 236 | /// bytes needed to properly decode a valid `usize` value. 237 | #[inline] 238 | unsafe fn decode_var_usize_unchecked(buffer: &[u8]) -> (usize, usize) { 239 | let first = unsafe { *buffer.get_unchecked(0) }; 240 | match first { 241 | byte if byte <= 0x7F_u8 => (byte as usize, 1), 242 | _ => unsafe { decode_var_usize_unchecked_cold(buffer) }, 243 | } 244 | } 245 | 246 | /// Decodes from a variable length encoded `usize` from the buffer. 247 | /// 248 | /// Returns the decoded value as first return value. 249 | /// Returns the number of decoded bytes as second return value. 250 | /// 251 | /// # Safety 252 | /// 253 | /// The caller has to make sure that the buffer contains the necessary 254 | /// bytes needed to properly decode a valid `usize` value. 
255 | /// 256 | /// Uncommon case for string lengths of 128 or greater. 257 | #[inline] 258 | #[cold] 259 | unsafe fn decode_var_usize_unchecked_cold(buffer: &[u8]) -> (usize, usize) { 260 | let mut result: usize = 0; 261 | let mut i = 0; 262 | loop { 263 | let byte = unsafe { *buffer.get_unchecked(i) }; 264 | let shifted = ((byte & 0x7F_u8) as usize) << ((i * 7) as u32); 265 | result += shifted; 266 | if (byte & 0x80) == 0 { 267 | break; 268 | } 269 | i += 1; 270 | } 271 | (result, i + 1) 272 | } 273 | 274 | /// Decodes a variable length encoded `usize` from the start of the buffer. 275 | /// 276 | /// Returns the decoded value as first return value. 277 | /// Returns the number of decoded bytes as second return value. 278 | #[inline] 279 | fn decode_var_usize(buffer: &[u8]) -> Option<(usize, usize)> { 280 | match buffer.first() { 281 | None => None, 282 | Some(&byte) if byte <= 0x7F_u8 => Some((byte as usize, 1)), 283 | _ => decode_var_usize_cold(buffer), 284 | } 285 | } 286 | 287 | /// Decodes a variable length encoded `usize` from the start of the buffer. 288 | /// 289 | /// Returns the decoded value as first return value. 290 | /// Returns the number of decoded bytes as second return value. 291 | /// 292 | /// Uncommon case for string lengths of 128 or greater.
293 | #[inline] 294 | #[cold] 295 | fn decode_var_usize_cold(buffer: &[u8]) -> Option<(usize, usize)> { 296 | let mut result: usize = 0; 297 | let mut i = 0; 298 | loop { 299 | let byte = *buffer.get(i)?; 300 | let shifted = ((byte & 0x7F_u8) as usize).checked_shl((i * 7) as u32).filter(|&s| (s >> (i * 7)) == ((byte & 0x7F_u8) as usize))?; // `checked_shl` only validates the shift amount, so also reject chunks whose significant bits were shifted out of `usize`. 301 | result = result.checked_add(shifted)?; 302 | if (byte & 0x80) == 0 { 303 | break; 304 | } 305 | i += 1; 306 | } 307 | Some((result, i + 1)) 308 | } 309 | 310 | impl<'a, S> IntoIterator for &'a BufferBackend 311 | where 312 | S: Symbol, 313 | { 314 | type Item = (S, &'a str); 315 | type IntoIter = Iter<'a, S>; 316 | 317 | #[cfg_attr(feature = "inline-more", inline)] 318 | fn into_iter(self) -> Self::IntoIter { 319 | self.iter() 320 | } 321 | } 322 | 323 | pub struct Iter<'a, S> { 324 | backend: &'a BufferBackend, 325 | remaining: usize, 326 | next: usize, 327 | } 328 | 329 | impl<'a, S> Iter<'a, S> { 330 | #[cfg_attr(feature = "inline-more", inline)] 331 | pub fn new(backend: &'a BufferBackend) -> Self { 332 | Self { 333 | backend, 334 | remaining: backend.len_strings, 335 | next: 0, 336 | } 337 | } 338 | } 339 | 340 | impl<'a, S> Iterator for Iter<'a, S> 341 | where 342 | S: Symbol, 343 | { 344 | type Item = (S, &'a str); 345 | 346 | #[inline] 347 | fn size_hint(&self) -> (usize, Option) { 348 | let remaining = self.len(); 349 | (remaining, Some(remaining)) 350 | } 351 | 352 | #[inline] 353 | fn next(&mut self) -> Option { 354 | self.backend 355 | .resolve_index_to_str(self.next) 356 | .and_then(|(bytes, next)| { 357 | // SAFETY: Within the iterator all indices given to `resolve_index_to_str` 358 | // are properly pointing to the start of each interned string.
359 | let string = unsafe { str::from_utf8_unchecked(bytes) }; 360 | let symbol = S::try_from_usize(self.next)?; 361 | self.next = next; 362 | self.remaining -= 1; 363 | Some((symbol, string)) 364 | }) 365 | } 366 | } 367 | 368 | impl ExactSizeIterator for Iter<'_, S> 369 | where 370 | S: Symbol, 371 | { 372 | #[inline] 373 | fn len(&self) -> usize { 374 | self.remaining 375 | } 376 | } 377 | 378 | #[cfg(test)] 379 | mod tests { 380 | use super::{decode_var_usize, encode_var_usize}; 381 | use alloc::vec::Vec; 382 | 383 | #[test] 384 | fn encode_var_usize_1_byte_works() { 385 | let mut buffer = Vec::new(); 386 | for i in 0..2usize.pow(7) { 387 | buffer.clear(); 388 | assert_eq!(encode_var_usize(&mut buffer, i), 1); 389 | assert_eq!(buffer, [i as u8]); 390 | assert_eq!(decode_var_usize(&buffer), Some((i, 1))); 391 | } 392 | } 393 | 394 | #[test] 395 | fn encode_var_usize_2_bytes_works() { 396 | let mut buffer = Vec::new(); 397 | for i in 2usize.pow(7)..2usize.pow(14) { 398 | buffer.clear(); 399 | assert_eq!(encode_var_usize(&mut buffer, i), 2); 400 | assert_eq!(buffer, [0x80 | ((i & 0x7F) as u8), (0x7F & (i >> 7) as u8)]); 401 | assert_eq!(decode_var_usize(&buffer), Some((i, 2))); 402 | } 403 | } 404 | 405 | #[test] 406 | #[cfg_attr(any(miri), ignore)] 407 | fn encode_var_usize_3_bytes_works() { 408 | let mut buffer = Vec::new(); 409 | for i in 2usize.pow(14)..2usize.pow(21) { 410 | buffer.clear(); 411 | assert_eq!(encode_var_usize(&mut buffer, i), 3); 412 | assert_eq!( 413 | buffer, 414 | [ 415 | 0x80 | ((i & 0x7F) as u8), 416 | 0x80 | (0x7F & (i >> 7) as u8), 417 | (0x7F & (i >> 14) as u8), 418 | ] 419 | ); 420 | assert_eq!(decode_var_usize(&buffer), Some((i, 3))); 421 | } 422 | } 423 | 424 | /// Allows to split up the test into multiple fragments that can run in parallel. 
425 | #[cfg_attr(any(miri), ignore)] 426 | fn assert_encode_var_usize_4_bytes(range: core::ops::Range) { 427 | let mut buffer = Vec::new(); 428 | for i in range { 429 | buffer.clear(); 430 | assert_eq!(encode_var_usize(&mut buffer, i), 4); 431 | assert_eq!( 432 | buffer, 433 | [ 434 | 0x80 | ((i & 0x7F) as u8), 435 | 0x80 | (0x7F & (i >> 7) as u8), 436 | 0x80 | (0x7F & (i >> 14) as u8), 437 | (0x7F & (i >> 21) as u8), 438 | ] 439 | ); 440 | assert_eq!(decode_var_usize(&buffer), Some((i, 4))); 441 | } 442 | } 443 | 444 | #[test] 445 | #[cfg_attr(any(miri), ignore)] 446 | fn encode_var_usize_4_bytes_01_works() { 447 | assert_encode_var_usize_4_bytes(2usize.pow(21)..2usize.pow(24)); 448 | } 449 | 450 | #[test] 451 | #[cfg_attr(any(miri), ignore)] 452 | fn encode_var_usize_4_bytes_02_works() { 453 | assert_encode_var_usize_4_bytes(2usize.pow(24)..2usize.pow(26)); 454 | } 455 | 456 | #[test] 457 | #[cfg_attr(any(miri), ignore)] 458 | fn encode_var_usize_4_bytes_03_works() { 459 | assert_encode_var_usize_4_bytes(2usize.pow(26)..2usize.pow(27)); 460 | } 461 | 462 | #[test] 463 | #[cfg_attr(any(miri), ignore)] 464 | fn encode_var_usize_4_bytes_04_works() { 465 | assert_encode_var_usize_4_bytes(2usize.pow(27)..2usize.pow(28)); 466 | } 467 | 468 | #[test] 469 | fn encode_var_u32_max_works() { 470 | let mut buffer = Vec::new(); 471 | let i = u32::MAX as usize; 472 | assert_eq!(encode_var_usize(&mut buffer, i), 5); 473 | assert_eq!(buffer, [0xFF, 0xFF, 0xFF, 0xFF, 0x0F]); 474 | assert_eq!(decode_var_usize(&buffer), Some((i, 5))); 475 | } 476 | 477 | #[test] 478 | fn encode_var_u64_max_works() { 479 | let mut buffer = Vec::new(); 480 | let i = u64::MAX as usize; 481 | assert_eq!(encode_var_usize(&mut buffer, i), 10); 482 | assert_eq!( 483 | buffer, 484 | [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x01] 485 | ); 486 | assert_eq!(decode_var_usize(&buffer), Some((i, 10))); 487 | } 488 | 489 | #[test] 490 | fn decode_var_fail() { 491 | // Empty buffer. 
492 | assert_eq!(decode_var_usize(&[]), None); 493 | // Missing buffer bytes. 494 | assert_eq!(decode_var_usize(&[0x80]), None); 495 | // Out of range encoded value. 496 | // assert_eq!( 497 | // decode_var_usize(&[ 498 | // 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x03 499 | // ]), 500 | // None, 501 | // ); 502 | } 503 | } 504 | -------------------------------------------------------------------------------- /tests/tests.rs: -------------------------------------------------------------------------------- 1 | mod allocator; 2 | 3 | use allocator::TracingAllocator; 4 | use string_interner::{backend, DefaultHashBuilder, DefaultSymbol, Symbol}; 5 | 6 | #[global_allocator] 7 | static ALLOCATOR: TracingAllocator = TracingAllocator::new(); 8 | 9 | /// Creates the symbol `S` from the given `usize`. 10 | /// 11 | /// # Panics 12 | /// 13 | /// Panics if the conversion is invalid. 14 | #[inline] 15 | pub(crate) fn expect_valid_symbol(index: usize) -> S 16 | where 17 | S: Symbol, 18 | { 19 | S::try_from_usize(index).expect("encountered invalid symbol") 20 | } 21 | 22 | /// Stats for the backend. 23 | pub trait BackendStats { 24 | /// The expected minimum memory overhead for this string interner backend. 25 | const MIN_OVERHEAD: f64; 26 | /// The expected maximum memory overhead for this string interner backend. 27 | const MAX_OVERHEAD: f64; 28 | /// The amount of allocations per 1M words. 29 | const MAX_ALLOCATIONS: usize; 30 | /// The amount of deallocations per 1M words. 31 | const MAX_DEALLOCATIONS: usize; 32 | /// The name of the backend for debug display purpose. 
33 | const NAME: &'static str; 34 | } 35 | 36 | impl BackendStats for backend::BucketBackend { 37 | const MIN_OVERHEAD: f64 = 2.2; 38 | const MAX_OVERHEAD: f64 = 3.1; 39 | const MAX_ALLOCATIONS: usize = 65; 40 | const MAX_DEALLOCATIONS: usize = 42; 41 | const NAME: &'static str = "BucketBackend"; 42 | } 43 | 44 | impl BackendStats for backend::StringBackend { 45 | const MIN_OVERHEAD: f64 = 1.7; 46 | const MAX_OVERHEAD: f64 = 1.93; 47 | const MAX_ALLOCATIONS: usize = 62; 48 | const MAX_DEALLOCATIONS: usize = 59; 49 | const NAME: &'static str = "StringBackend"; 50 | } 51 | 52 | impl BackendStats for backend::BufferBackend { 53 | const MIN_OVERHEAD: f64 = 1.35; 54 | const MAX_OVERHEAD: f64 = 1.58; 55 | const MAX_ALLOCATIONS: usize = 43; 56 | const MAX_DEALLOCATIONS: usize = 41; 57 | const NAME: &'static str = "BufferBackend"; 58 | } 59 | 60 | /// Memory profiling stats. 61 | pub struct ProfilingStats { 62 | /// The minimum memory usage overhead as factor. 63 | pub overhead: f64, 64 | /// The total amount of allocations of the profiling test. 65 | pub allocations: usize, 66 | /// The total amount of deallocations of the profiling test. 67 | pub deallocations: usize, 68 | } 69 | 70 | macro_rules! 
gen_tests_for_backend { 71 | ( $backend:ty ) => { 72 | type StringInterner = 73 | string_interner::StringInterner<$backend, DefaultHashBuilder>; 74 | 75 | fn profile_memory_usage(words: &[String]) -> ProfilingStats { 76 | ALLOCATOR.reset(); 77 | ALLOCATOR.start_profiling(); 78 | let mut interner = StringInterner::new(); 79 | ALLOCATOR.end_profiling(); 80 | 81 | for word in words { 82 | ALLOCATOR.start_profiling(); 83 | interner.get_or_intern(word); 84 | } 85 | interner.shrink_to_fit(); 86 | ALLOCATOR.end_profiling(); 87 | 88 | let stats = ALLOCATOR.stats(); 89 | let len_allocations = stats.len_allocations(); 90 | let len_deallocations = stats.len_deallocations(); 91 | let current_allocated_bytes = stats.current_allocated_bytes(); 92 | let total_allocated_bytes = stats.total_allocated_bytes(); 93 | 94 | assert_eq!(interner.len(), words.len()); 95 | 96 | println!( 97 | "\ 98 | \n\t- # words = {}\ 99 | \n\t- # allocations = {}\ 100 | \n\t- # deallocations = {}\ 101 | \n\t- allocated bytes = {}\ 102 | \n\t- requested bytes = {}\ 103 | ", 104 | words.len(), 105 | len_allocations, len_deallocations, current_allocated_bytes, total_allocated_bytes, 106 | ); 107 | 108 | let ideal_memory_usage = words.len() * words[0].len(); 109 | let memory_usage_overhead = 110 | (current_allocated_bytes as f64) / (ideal_memory_usage as f64); 111 | println!("\t- ideal allocated bytes = {}", ideal_memory_usage); 112 | println!("\t- actual allocated bytes = {}", current_allocated_bytes); 113 | println!("\t- % actual overhead = {:.02}%", memory_usage_overhead * 100.0); 114 | 115 | ProfilingStats { 116 | overhead: memory_usage_overhead, 117 | allocations: len_allocations, 118 | deallocations: len_deallocations, 119 | } 120 | } 121 | 122 | #[test] 123 | #[cfg_attr(any(miri, not(feature = "test-allocations")), ignore)] 124 | fn test_memory_consumption() { 125 | let len_words = 1_000_000; 126 | let words = (0..).take(len_words).map(|i| { 127 | format!("{:20}", i) 128 | }).collect::>(); 129 | 130 | 
println!(); 131 | println!("Benchmark Memory Usage for {}", <$backend as BackendStats>::NAME); 132 | let mut min_overhead = None; 133 | let mut max_overhead = None; 134 | let mut max_allocations = None; 135 | let mut max_deallocations = None; 136 | for i in 0..10 { 137 | let len_words = 100_000 * (i+1); 138 | let words = &words[0..len_words]; 139 | let stats = profile_memory_usage(words); 140 | if min_overhead.map(|min| stats.overhead < min).unwrap_or(true) { 141 | min_overhead = Some(stats.overhead); 142 | } 143 | if max_overhead.map(|max| stats.overhead > max).unwrap_or(true) { 144 | max_overhead = Some(stats.overhead); 145 | } 146 | if max_allocations.map(|max| stats.allocations > max).unwrap_or(true) { 147 | max_allocations = Some(stats.allocations); 148 | } 149 | if max_deallocations.map(|max| stats.deallocations > max).unwrap_or(true) { 150 | max_deallocations = Some(stats.deallocations); 151 | } 152 | } 153 | let actual_min_overhead = min_overhead.unwrap(); 154 | let actual_max_overhead = max_overhead.unwrap(); 155 | let expect_min_overhead = <$backend as BackendStats>::MIN_OVERHEAD; 156 | let expect_max_overhead = <$backend as BackendStats>::MAX_OVERHEAD; 157 | let actual_max_allocations = max_allocations.unwrap(); 158 | let actual_max_deallocations = max_deallocations.unwrap(); 159 | let expect_max_allocations = <$backend as BackendStats>::MAX_ALLOCATIONS; 160 | let expect_max_deallocations = <$backend as BackendStats>::MAX_DEALLOCATIONS; 161 | 162 | println!(); 163 | println!("- % min overhead = {:.02}%", actual_min_overhead * 100.0); 164 | println!("- % max overhead = {:.02}%", actual_max_overhead * 100.0); 165 | println!("- % max allocations = {}", actual_max_allocations); 166 | println!("- % max deallocations = {}", actual_max_deallocations); 167 | 168 | assert!( 169 | actual_min_overhead < expect_min_overhead, 170 | "{} string interner backend minimum memory overhead is greater than expected. 
expected = {:?}, actual = {:?}", 171 | <$backend as BackendStats>::NAME, 172 | expect_min_overhead, 173 | actual_min_overhead, 174 | ); 175 | assert!( 176 | actual_max_overhead < expect_max_overhead, 177 | "{} string interner backend maximum memory overhead is greater than expected. expected = {:?}, actual = {:?}", 178 | <$backend as BackendStats>::NAME, 179 | expect_max_overhead, 180 | actual_max_overhead, 181 | ); 182 | assert_eq!( 183 | actual_max_allocations, expect_max_allocations, 184 | "{} string interner backend maximum amount of allocations is greater than expected. expected = {:?}, actual = {:?}", 185 | <$backend as BackendStats>::NAME, 186 | expect_max_allocations, 187 | actual_max_allocations, 188 | ); 189 | assert_eq!( 190 | actual_max_deallocations, expect_max_deallocations, 191 | "{} string interner backend maximum amount of deallocations is greater than expected. expected = {:?}, actual = {:?}", 192 | <$backend as BackendStats>::NAME, 193 | expect_max_deallocations, 194 | actual_max_deallocations, 195 | ); 196 | } 197 | 198 | #[test] 199 | fn new_works() { 200 | let interner = StringInterner::new(); 201 | assert_eq!(interner.len(), 0); 202 | assert!(interner.is_empty()); 203 | let other = StringInterner::new(); 204 | assert_eq!(interner, other); 205 | } 206 | 207 | #[test] 208 | fn is_empty_works() { 209 | let mut interner = StringInterner::new(); 210 | assert!(interner.is_empty()); 211 | interner.get_or_intern("aa"); 212 | assert!(!interner.is_empty()); 213 | } 214 | 215 | #[test] 216 | fn clone_works() { 217 | let mut interner = StringInterner::new(); 218 | assert_eq!(interner.get_or_intern("aa").to_usize(), 0); 219 | 220 | let mut cloned = interner.clone(); 221 | assert_eq!(interner, cloned); 222 | // And the clone should have the same interned values 223 | assert_eq!(cloned.get_or_intern("aa").to_usize(), 0); 224 | } 225 | 226 | #[test] 227 | fn get_or_intern_works() { 228 | let mut interner = StringInterner::new(); 229 | // Insert 3 unique 
strings: 230 | let aa = interner.get_or_intern("aa").to_usize(); 231 | let bb = interner.get_or_intern("bb").to_usize(); 232 | let cc = interner.get_or_intern("cc").to_usize(); 233 | // All symbols must be different from each other. 234 | assert_ne!(aa, bb); 235 | assert_ne!(bb, cc); 236 | assert_ne!(cc, aa); 237 | // The length of the string interner must be 3 at this point. 238 | assert_eq!(interner.len(), 3); 239 | // Insert the same 3 unique strings, yield the same symbols: 240 | assert_eq!(interner.resolve( 241 | ::try_from_usize(aa).unwrap()), Some("aa")); 242 | assert_eq!( 243 | interner.get_or_intern("aa").to_usize(), 244 | aa, 245 | "'aa' did not produce the same symbol", 246 | ); 247 | assert_eq!( 248 | interner.get_or_intern("bb").to_usize(), 249 | bb, 250 | "'bb' did not produce the same symbol", 251 | ); 252 | assert_eq!( 253 | interner.get_or_intern("cc").to_usize(), 254 | cc, 255 | "'cc' did not produce the same symbol", 256 | ); 257 | assert_eq!(interner.len(), 3); 258 | } 259 | 260 | #[test] 261 | fn get_or_intern_static_works() { 262 | let mut interner = StringInterner::new(); 263 | // Insert 3 unique strings: 264 | let a = interner.get_or_intern_static("aa").to_usize(); 265 | let b = interner.get_or_intern_static("bb").to_usize(); 266 | let c = interner.get_or_intern_static("cc").to_usize(); 267 | // All symbols must be different from each other. 268 | assert_ne!(a, b); 269 | assert_ne!(b, c); 270 | assert_ne!(c, a); 271 | // The length of the string interner must be 3 at this point. 
272 | assert_eq!(interner.len(), 3); 273 | // Insert the same 3 unique strings, yield the same symbols: 274 | assert_eq!(interner.get_or_intern_static("aa").to_usize(), a); 275 | assert_eq!(interner.get_or_intern_static("bb").to_usize(), b); 276 | assert_eq!(interner.get_or_intern_static("cc").to_usize(), c); 277 | assert_eq!(interner.len(), 3); 278 | } 279 | 280 | #[test] 281 | fn resolve_works() { 282 | let mut interner = StringInterner::new(); 283 | // Insert 3 unique strings: 284 | let aa = interner.get_or_intern("aa"); 285 | let bb = interner.get_or_intern("bb"); 286 | let cc = interner.get_or_intern("cc"); 287 | assert_eq!(interner.len(), 3); 288 | // Resolve valid symbols: 289 | assert_eq!(interner.resolve(aa), Some("aa")); 290 | assert_eq!(interner.resolve(bb), Some("bb")); 291 | assert_eq!(interner.resolve(cc), Some("cc")); 292 | assert_eq!(interner.len(), 3); 293 | // Resolve invalid symbols: 294 | let dd = expect_valid_symbol(1000); 295 | assert_ne!(aa, dd); 296 | assert_ne!(bb, dd); 297 | assert_ne!(cc, dd); 298 | assert_eq!(interner.resolve(dd), None); 299 | } 300 | 301 | #[test] 302 | fn resolve_unchecked_works() { 303 | let mut interner = StringInterner::new(); 304 | // Insert 3 unique strings: 305 | let aa = interner.get_or_intern("aa"); 306 | let bb = interner.get_or_intern("bb"); 307 | let cc = interner.get_or_intern("cc"); 308 | assert_eq!(interner.len(), 3); 309 | // Resolve valid symbols: 310 | assert_eq!(unsafe { interner.resolve_unchecked(aa) }, "aa"); 311 | assert_eq!(unsafe { interner.resolve_unchecked(bb) }, "bb"); 312 | assert_eq!(unsafe { interner.resolve_unchecked(cc) }, "cc"); 313 | assert_eq!(interner.len(), 3); 314 | // Resolve invalid symbols: 315 | let dd = expect_valid_symbol(1000); 316 | assert_ne!(aa, dd); 317 | assert_ne!(bb, dd); 318 | assert_ne!(cc, dd); 319 | } 320 | 321 | #[test] 322 | fn get_works() { 323 | let mut interner = StringInterner::new(); 324 | // Insert 3 unique strings: 325 | let aa = 
interner.get_or_intern("aa"); 326 | let bb = interner.get_or_intern("bb"); 327 | let cc = interner.get_or_intern("cc"); 328 | assert_eq!(interner.len(), 3); 329 | // Get the symbols of the same 3 strings: 330 | assert_eq!(interner.get("aa"), Some(aa)); 331 | assert_eq!(interner.get("bb"), Some(bb)); 332 | assert_eq!(interner.get("cc"), Some(cc)); 333 | assert_eq!(interner.len(), 3); 334 | // Get the symbols of some unknown strings: 335 | assert_eq!(interner.get("dd"), None); 336 | assert_eq!(interner.get("ee"), None); 337 | assert_eq!(interner.get("ff"), None); 338 | assert_eq!(interner.len(), 3); 339 | } 340 | 341 | #[test] 342 | fn from_iter_works() { 343 | let strings = ["aa", "bb", "cc", "dd", "ee", "ff"]; 344 | let expected = { 345 | let mut interner = StringInterner::new(); 346 | for &string in &strings { 347 | interner.get_or_intern(string); 348 | } 349 | interner 350 | }; 351 | let actual = strings.into_iter().collect::(); 352 | assert_eq!(actual.len(), strings.len()); 353 | assert_eq!(actual, expected); 354 | } 355 | 356 | #[test] 357 | fn extend_works() { 358 | let strings = ["aa", "bb", "cc", "dd", "ee", "ff"]; 359 | let expected = { 360 | let mut interner = StringInterner::new(); 361 | for &string in &strings { 362 | interner.get_or_intern(string); 363 | } 364 | interner 365 | }; 366 | let actual = { 367 | let mut interner = StringInterner::new(); 368 | interner.extend(strings.iter().copied()); 369 | interner 370 | }; 371 | assert_eq!(actual.len(), strings.len()); 372 | assert_eq!(actual, expected); 373 | } 374 | 375 | #[test] 376 | fn iter_works() { 377 | let mut interner = StringInterner::new(); 378 | let strings = ["aa", "bb", "cc", "dd", "ee", "ff"]; 379 | let symbols = strings.iter().map(|s| interner.get_or_intern(s)).collect::>(); 380 | let expected_iter = symbols.into_iter().zip(strings); 381 | assert!(Iterator::eq(expected_iter, &interner)); 382 | } 383 | 384 | #[test] 385 | fn shrink_to_fit_works() { 386 | let mut interner = 
StringInterner::new(); 387 | // Insert 3 unique strings: 388 | let aa = interner.get_or_intern("aa").to_usize(); 389 | let bb = interner.get_or_intern("bb").to_usize(); 390 | let cc = interner.get_or_intern("cc").to_usize(); 391 | 392 | interner.shrink_to_fit(); 393 | 394 | assert_eq!( 395 | interner.get_or_intern("aa").to_usize(), 396 | aa, 397 | "'aa' did not produce the same symbol", 398 | ); 399 | assert_eq!( 400 | interner.get_or_intern("bb").to_usize(), 401 | bb, 402 | "'bb' did not produce the same symbol", 403 | ); 404 | assert_eq!( 405 | interner.get_or_intern("cc").to_usize(), 406 | cc, 407 | "'cc' did not produce the same symbol", 408 | ); 409 | assert_eq!(interner.len(), 3); 410 | } 411 | }; 412 | } 413 | 414 | mod bucket_backend { 415 | use super::*; 416 | 417 | gen_tests_for_backend!(backend::BucketBackend); 418 | } 419 | 420 | mod string_backend { 421 | use super::*; 422 | 423 | gen_tests_for_backend!(backend::StringBackend); 424 | } 425 | 426 | mod buffer_backend { 427 | use super::*; 428 | 429 | gen_tests_for_backend!(backend::BufferBackend); 430 | } 431 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anes" 16 | version = "0.1.6" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 19 | 20 | [[package]] 21 | name = "anstyle" 22 | version = "1.0.10" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" 25 | 26 | [[package]] 27 | name = "autocfg" 28 | version = "1.4.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 31 | 32 | [[package]] 33 | name = "bumpalo" 34 | version = "3.17.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" 37 | 38 | [[package]] 39 | name = "byteorder" 40 | version = "1.5.0" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 43 | 44 | [[package]] 45 | name = "cast" 46 | version = "0.3.0" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 49 | 50 | [[package]] 51 | name = "cfg-if" 52 | version = "1.0.0" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 55 | 56 | [[package]] 57 | name = "ciborium" 58 | version = "0.2.2" 59 | source = "registry+https://github.com/rust-lang/crates.io-index" 60 | checksum = 
"42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 61 | dependencies = [ 62 | "ciborium-io", 63 | "ciborium-ll", 64 | "serde", 65 | ] 66 | 67 | [[package]] 68 | name = "ciborium-io" 69 | version = "0.2.2" 70 | source = "registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" 72 | 73 | [[package]] 74 | name = "ciborium-ll" 75 | version = "0.2.2" 76 | source = "registry+https://github.com/rust-lang/crates.io-index" 77 | checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" 78 | dependencies = [ 79 | "ciborium-io", 80 | "half", 81 | ] 82 | 83 | [[package]] 84 | name = "clap" 85 | version = "4.5.28" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "3e77c3243bd94243c03672cb5154667347c457ca271254724f9f393aee1c05ff" 88 | dependencies = [ 89 | "clap_builder", 90 | ] 91 | 92 | [[package]] 93 | name = "clap_builder" 94 | version = "4.5.27" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" 97 | dependencies = [ 98 | "anstyle", 99 | "clap_lex", 100 | ] 101 | 102 | [[package]] 103 | name = "clap_lex" 104 | version = "0.7.4" 105 | source = "registry+https://github.com/rust-lang/crates.io-index" 106 | checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" 107 | 108 | [[package]] 109 | name = "criterion" 110 | version = "0.5.1" 111 | source = "registry+https://github.com/rust-lang/crates.io-index" 112 | checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" 113 | dependencies = [ 114 | "anes", 115 | "cast", 116 | "ciborium", 117 | "clap", 118 | "criterion-plot", 119 | "is-terminal", 120 | "itertools", 121 | "num-traits", 122 | "once_cell", 123 | "oorandom", 124 | "plotters", 125 | "rayon", 126 | "regex", 127 | "serde", 128 | "serde_derive", 129 | "serde_json", 130 | 
"tinytemplate", 131 | "walkdir", 132 | ] 133 | 134 | [[package]] 135 | name = "criterion-plot" 136 | version = "0.5.0" 137 | source = "registry+https://github.com/rust-lang/crates.io-index" 138 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" 139 | dependencies = [ 140 | "cast", 141 | "itertools", 142 | ] 143 | 144 | [[package]] 145 | name = "crossbeam-deque" 146 | version = "0.8.6" 147 | source = "registry+https://github.com/rust-lang/crates.io-index" 148 | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 149 | dependencies = [ 150 | "crossbeam-epoch", 151 | "crossbeam-utils", 152 | ] 153 | 154 | [[package]] 155 | name = "crossbeam-epoch" 156 | version = "0.9.18" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 159 | dependencies = [ 160 | "crossbeam-utils", 161 | ] 162 | 163 | [[package]] 164 | name = "crossbeam-utils" 165 | version = "0.8.21" 166 | source = "registry+https://github.com/rust-lang/crates.io-index" 167 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 168 | 169 | [[package]] 170 | name = "crunchy" 171 | version = "0.2.3" 172 | source = "registry+https://github.com/rust-lang/crates.io-index" 173 | checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" 174 | 175 | [[package]] 176 | name = "either" 177 | version = "1.13.0" 178 | source = "registry+https://github.com/rust-lang/crates.io-index" 179 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" 180 | 181 | [[package]] 182 | name = "foldhash" 183 | version = "0.1.4" 184 | source = "registry+https://github.com/rust-lang/crates.io-index" 185 | checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" 186 | 187 | [[package]] 188 | name = "fxhash" 189 | version = "0.2.1" 190 | source = "registry+https://github.com/rust-lang/crates.io-index" 
191 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 192 | dependencies = [ 193 | "byteorder", 194 | ] 195 | 196 | [[package]] 197 | name = "half" 198 | version = "2.4.1" 199 | source = "registry+https://github.com/rust-lang/crates.io-index" 200 | checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" 201 | dependencies = [ 202 | "cfg-if", 203 | "crunchy", 204 | ] 205 | 206 | [[package]] 207 | name = "hashbrown" 208 | version = "0.15.2" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" 211 | dependencies = [ 212 | "foldhash", 213 | ] 214 | 215 | [[package]] 216 | name = "hermit-abi" 217 | version = "0.4.0" 218 | source = "registry+https://github.com/rust-lang/crates.io-index" 219 | checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" 220 | 221 | [[package]] 222 | name = "is-terminal" 223 | version = "0.4.15" 224 | source = "registry+https://github.com/rust-lang/crates.io-index" 225 | checksum = "e19b23d53f35ce9f56aebc7d1bb4e6ac1e9c0db7ac85c8d1760c04379edced37" 226 | dependencies = [ 227 | "hermit-abi", 228 | "libc", 229 | "windows-sys", 230 | ] 231 | 232 | [[package]] 233 | name = "itertools" 234 | version = "0.10.5" 235 | source = "registry+https://github.com/rust-lang/crates.io-index" 236 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 237 | dependencies = [ 238 | "either", 239 | ] 240 | 241 | [[package]] 242 | name = "itoa" 243 | version = "1.0.14" 244 | source = "registry+https://github.com/rust-lang/crates.io-index" 245 | checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" 246 | 247 | [[package]] 248 | name = "js-sys" 249 | version = "0.3.77" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" 252 | dependencies = [ 253 | 
"once_cell", 254 | "wasm-bindgen", 255 | ] 256 | 257 | [[package]] 258 | name = "libc" 259 | version = "0.2.169" 260 | source = "registry+https://github.com/rust-lang/crates.io-index" 261 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 262 | 263 | [[package]] 264 | name = "log" 265 | version = "0.4.25" 266 | source = "registry+https://github.com/rust-lang/crates.io-index" 267 | checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" 268 | 269 | [[package]] 270 | name = "memchr" 271 | version = "2.7.4" 272 | source = "registry+https://github.com/rust-lang/crates.io-index" 273 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 274 | 275 | [[package]] 276 | name = "num-traits" 277 | version = "0.2.19" 278 | source = "registry+https://github.com/rust-lang/crates.io-index" 279 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 280 | dependencies = [ 281 | "autocfg", 282 | ] 283 | 284 | [[package]] 285 | name = "once_cell" 286 | version = "1.20.3" 287 | source = "registry+https://github.com/rust-lang/crates.io-index" 288 | checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" 289 | 290 | [[package]] 291 | name = "oorandom" 292 | version = "11.1.4" 293 | source = "registry+https://github.com/rust-lang/crates.io-index" 294 | checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" 295 | 296 | [[package]] 297 | name = "plotters" 298 | version = "0.3.7" 299 | source = "registry+https://github.com/rust-lang/crates.io-index" 300 | checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" 301 | dependencies = [ 302 | "num-traits", 303 | "plotters-backend", 304 | "plotters-svg", 305 | "wasm-bindgen", 306 | "web-sys", 307 | ] 308 | 309 | [[package]] 310 | name = "plotters-backend" 311 | version = "0.3.7" 312 | source = "registry+https://github.com/rust-lang/crates.io-index" 313 | checksum = 
"df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" 314 | 315 | [[package]] 316 | name = "plotters-svg" 317 | version = "0.3.7" 318 | source = "registry+https://github.com/rust-lang/crates.io-index" 319 | checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" 320 | dependencies = [ 321 | "plotters-backend", 322 | ] 323 | 324 | [[package]] 325 | name = "proc-macro2" 326 | version = "1.0.93" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" 329 | dependencies = [ 330 | "unicode-ident", 331 | ] 332 | 333 | [[package]] 334 | name = "quote" 335 | version = "1.0.38" 336 | source = "registry+https://github.com/rust-lang/crates.io-index" 337 | checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" 338 | dependencies = [ 339 | "proc-macro2", 340 | ] 341 | 342 | [[package]] 343 | name = "rayon" 344 | version = "1.10.0" 345 | source = "registry+https://github.com/rust-lang/crates.io-index" 346 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" 347 | dependencies = [ 348 | "either", 349 | "rayon-core", 350 | ] 351 | 352 | [[package]] 353 | name = "rayon-core" 354 | version = "1.12.1" 355 | source = "registry+https://github.com/rust-lang/crates.io-index" 356 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" 357 | dependencies = [ 358 | "crossbeam-deque", 359 | "crossbeam-utils", 360 | ] 361 | 362 | [[package]] 363 | name = "regex" 364 | version = "1.11.1" 365 | source = "registry+https://github.com/rust-lang/crates.io-index" 366 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 367 | dependencies = [ 368 | "aho-corasick", 369 | "memchr", 370 | "regex-automata", 371 | "regex-syntax", 372 | ] 373 | 374 | [[package]] 375 | name = "regex-automata" 376 | version = "0.4.9" 377 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 379 | dependencies = [ 380 | "aho-corasick", 381 | "memchr", 382 | "regex-syntax", 383 | ] 384 | 385 | [[package]] 386 | name = "regex-syntax" 387 | version = "0.8.5" 388 | source = "registry+https://github.com/rust-lang/crates.io-index" 389 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 390 | 391 | [[package]] 392 | name = "rustversion" 393 | version = "1.0.19" 394 | source = "registry+https://github.com/rust-lang/crates.io-index" 395 | checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" 396 | 397 | [[package]] 398 | name = "ryu" 399 | version = "1.0.19" 400 | source = "registry+https://github.com/rust-lang/crates.io-index" 401 | checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" 402 | 403 | [[package]] 404 | name = "same-file" 405 | version = "1.0.6" 406 | source = "registry+https://github.com/rust-lang/crates.io-index" 407 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 408 | dependencies = [ 409 | "winapi-util", 410 | ] 411 | 412 | [[package]] 413 | name = "serde" 414 | version = "1.0.217" 415 | source = "registry+https://github.com/rust-lang/crates.io-index" 416 | checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" 417 | dependencies = [ 418 | "serde_derive", 419 | ] 420 | 421 | [[package]] 422 | name = "serde_derive" 423 | version = "1.0.217" 424 | source = "registry+https://github.com/rust-lang/crates.io-index" 425 | checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" 426 | dependencies = [ 427 | "proc-macro2", 428 | "quote", 429 | "syn", 430 | ] 431 | 432 | [[package]] 433 | name = "serde_json" 434 | version = "1.0.138" 435 | source = "registry+https://github.com/rust-lang/crates.io-index" 436 | checksum = 
"d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" 437 | dependencies = [ 438 | "itoa", 439 | "memchr", 440 | "ryu", 441 | "serde", 442 | ] 443 | 444 | [[package]] 445 | name = "string-interner" 446 | version = "0.19.0" 447 | dependencies = [ 448 | "criterion", 449 | "fxhash", 450 | "hashbrown", 451 | "serde", 452 | "serde_json", 453 | ] 454 | 455 | [[package]] 456 | name = "syn" 457 | version = "2.0.98" 458 | source = "registry+https://github.com/rust-lang/crates.io-index" 459 | checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" 460 | dependencies = [ 461 | "proc-macro2", 462 | "quote", 463 | "unicode-ident", 464 | ] 465 | 466 | [[package]] 467 | name = "tinytemplate" 468 | version = "1.2.1" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 471 | dependencies = [ 472 | "serde", 473 | "serde_json", 474 | ] 475 | 476 | [[package]] 477 | name = "unicode-ident" 478 | version = "1.0.16" 479 | source = "registry+https://github.com/rust-lang/crates.io-index" 480 | checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" 481 | 482 | [[package]] 483 | name = "walkdir" 484 | version = "2.5.0" 485 | source = "registry+https://github.com/rust-lang/crates.io-index" 486 | checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 487 | dependencies = [ 488 | "same-file", 489 | "winapi-util", 490 | ] 491 | 492 | [[package]] 493 | name = "wasm-bindgen" 494 | version = "0.2.100" 495 | source = "registry+https://github.com/rust-lang/crates.io-index" 496 | checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" 497 | dependencies = [ 498 | "cfg-if", 499 | "once_cell", 500 | "rustversion", 501 | "wasm-bindgen-macro", 502 | ] 503 | 504 | [[package]] 505 | name = "wasm-bindgen-backend" 506 | version = "0.2.100" 507 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 508 | checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" 509 | dependencies = [ 510 | "bumpalo", 511 | "log", 512 | "proc-macro2", 513 | "quote", 514 | "syn", 515 | "wasm-bindgen-shared", 516 | ] 517 | 518 | [[package]] 519 | name = "wasm-bindgen-macro" 520 | version = "0.2.100" 521 | source = "registry+https://github.com/rust-lang/crates.io-index" 522 | checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" 523 | dependencies = [ 524 | "quote", 525 | "wasm-bindgen-macro-support", 526 | ] 527 | 528 | [[package]] 529 | name = "wasm-bindgen-macro-support" 530 | version = "0.2.100" 531 | source = "registry+https://github.com/rust-lang/crates.io-index" 532 | checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" 533 | dependencies = [ 534 | "proc-macro2", 535 | "quote", 536 | "syn", 537 | "wasm-bindgen-backend", 538 | "wasm-bindgen-shared", 539 | ] 540 | 541 | [[package]] 542 | name = "wasm-bindgen-shared" 543 | version = "0.2.100" 544 | source = "registry+https://github.com/rust-lang/crates.io-index" 545 | checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" 546 | dependencies = [ 547 | "unicode-ident", 548 | ] 549 | 550 | [[package]] 551 | name = "web-sys" 552 | version = "0.3.77" 553 | source = "registry+https://github.com/rust-lang/crates.io-index" 554 | checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" 555 | dependencies = [ 556 | "js-sys", 557 | "wasm-bindgen", 558 | ] 559 | 560 | [[package]] 561 | name = "winapi-util" 562 | version = "0.1.9" 563 | source = "registry+https://github.com/rust-lang/crates.io-index" 564 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" 565 | dependencies = [ 566 | "windows-sys", 567 | ] 568 | 569 | [[package]] 570 | name = "windows-sys" 571 | version = "0.59.0" 572 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 573 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 574 | dependencies = [ 575 | "windows-targets", 576 | ] 577 | 578 | [[package]] 579 | name = "windows-targets" 580 | version = "0.52.6" 581 | source = "registry+https://github.com/rust-lang/crates.io-index" 582 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 583 | dependencies = [ 584 | "windows_aarch64_gnullvm", 585 | "windows_aarch64_msvc", 586 | "windows_i686_gnu", 587 | "windows_i686_gnullvm", 588 | "windows_i686_msvc", 589 | "windows_x86_64_gnu", 590 | "windows_x86_64_gnullvm", 591 | "windows_x86_64_msvc", 592 | ] 593 | 594 | [[package]] 595 | name = "windows_aarch64_gnullvm" 596 | version = "0.52.6" 597 | source = "registry+https://github.com/rust-lang/crates.io-index" 598 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 599 | 600 | [[package]] 601 | name = "windows_aarch64_msvc" 602 | version = "0.52.6" 603 | source = "registry+https://github.com/rust-lang/crates.io-index" 604 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 605 | 606 | [[package]] 607 | name = "windows_i686_gnu" 608 | version = "0.52.6" 609 | source = "registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 611 | 612 | [[package]] 613 | name = "windows_i686_gnullvm" 614 | version = "0.52.6" 615 | source = "registry+https://github.com/rust-lang/crates.io-index" 616 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 617 | 618 | [[package]] 619 | name = "windows_i686_msvc" 620 | version = "0.52.6" 621 | source = "registry+https://github.com/rust-lang/crates.io-index" 622 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 623 | 624 | [[package]] 625 | name = "windows_x86_64_gnu" 626 | version = "0.52.6" 627 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 628 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 629 | 630 | [[package]] 631 | name = "windows_x86_64_gnullvm" 632 | version = "0.52.6" 633 | source = "registry+https://github.com/rust-lang/crates.io-index" 634 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 635 | 636 | [[package]] 637 | name = "windows_x86_64_msvc" 638 | version = "0.52.6" 639 | source = "registry+https://github.com/rust-lang/crates.io-index" 640 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 641 | --------------------------------------------------------------------------------