├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE.md ├── README.md ├── benches └── creation.rs ├── data ├── blns.txt └── raft-large-directories.txt ├── include ├── ustr.h └── ustr.hpp ├── miri.sh ├── mutex_comparison.png ├── rustfmt.toml ├── src ├── bumpalloc.rs ├── hash.rs ├── lib.rs ├── serialization.rs ├── stringcache.rs └── ustr_extern.rs └── ustring_bench_raft.png /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # dist: xenial 2 | 3 | language: rust 4 | rust: nightly 5 | 6 | matrix: 7 | fast_finish: true 8 | include: 9 | # Miri 10 | - name: "miri" 11 | env: TARGET=x86_64-unknown-linux-gnu 12 | script: sh miri.sh 13 | # Tier 1 targets: 14 | - name: "x86_64-unknown-linux-gnu" 15 | env: TARGET=x86_64-unknown-linux-gnu 16 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 17 | - name: "x86_64-unknown-linux-gnu (beta)" 18 | rust: beta 19 | env: TARGET=x86_64-unknown-linux-gnu 20 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 21 | - name: "x86_64-unknown-linux-gnu (stable)" 22 | rust: stable 23 | env: TARGET=x86_64-unknown-linux-gnu 24 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 25 | - name: "i686-unknown-linux-gnu" 26 | env: TARGET=i686-unknown-linux-gnu CROSS=1 27 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 28 | - name: "x86_64-apple-darwin-10.3" 29 | env: TARGET=x86_64-apple-darwin 30 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 31 | os: osx 32 | osx_image: xcode10.3 33 | - name: "x86_64-apple-darwin-11.2" 34 | env: TARGET=x86_64-apple-darwin 35 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 36 | os: osx 37 | osx_image: xcode11.2 38 | - name: "x86_64-pc-windows-msvc" 39 | env: TARGET=x86_64-pc-windows-msvc 40 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 41 | os: windows 42 | - name: "x86_64-pc-windows-gnu" 43 | env: TARGET=x86_64-pc-windows-gnu CROSS=1 44 | script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde 45 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ustr" 3 | version = "1.1.0" 4 | authors = ["Anders Langlands "] 5 | edition = "2021" 6 | license = "BSD-2-Clause-Patent" 7 | description = "Fast, FFI-friendly string interning." 8 | documentation = "https://docs.rs/ustr" 9 | repository = "https://github.com/anderslanglands/ustr" 10 | readme = "README.md" 11 | keywords = ["string", "interning", "FFI"] 12 | categories = ["caching", "data-structures"] 13 | 14 | [badges] 15 | travis-ci = { repository = "anderslanglands/ustr", branch = "master" } 16 | 17 | [dependencies] 18 | byteorder = "1.5" 19 | lazy_static = "1.5" 20 | parking_lot = "0.12" 21 | serde = { version = "1", optional = true } 22 | ahash = { version = "0.8.3", default-features = false } 23 | 24 | 25 | [dev-dependencies] 26 | criterion = "0.4" 27 | crossbeam-channel = "0.5" 28 | crossbeam-utils = "0.8" 29 | libc = "0.2" 30 | serde_json = "1" 31 | string-interner = "0.13" 32 | string_cache = "0.8" 33 | 34 | [[bench]] 35 | name = "creation" 36 | harness = false 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD+ License 2 | 3 | Copyright (c) 2019 Anders Langlands 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | Subject to the terms and conditions of this license, each copyright holder and contributor hereby grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this software, where such license applies only to those patent claims, already acquired or hereafter acquired, licensable by such copyright holder or contributor that are necessarily infringed by: 12 | 13 | (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable additions of contributors, in source or binary form) alone; or 14 | 15 | (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) was added by such copyright holder or contributor, if, at the time the Contribution is added, such addition causes such combination to be necessarily infringed. The patent license shall not apply to any other combinations which include the Contribution. 16 | 17 | Except as expressly stated above, no rights or licenses from any copyright holder or contributor is granted under this license, whether expressly, by implication, estoppel or otherwise. 18 | 19 | DISCLAIMER 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `ustr` 2 | 3 | Fast, FFI-friendly string interning. 4 | 5 | [![Build Status]][travis] [![Latest Version]][crates.io] [![Docs Badge]][docs.rs] 6 | 7 | [Build Status]: https://img.shields.io/travis/anderslanglands/ustr/master?style=for-the-badge 8 | [travis]: https://travis-ci.com/anderslanglands/ustr 9 | [Latest Version]: https://img.shields.io/crates/v/ustr?style=for-the-badge 10 | [crates.io]: https://crates.io/crates/ustr 11 | [Docs Badge]: https://img.shields.io/docsrs/ustr?style=for-the-badge 12 | [docs.rs]:https://docs.rs/ustr 13 | 14 | A `Ustr` (**U**nique **str**) is a lightweight handle representing a static, 15 | immutable entry in a global string cache, allowing for: 16 | 17 | * Extremely fast string assignment and comparisons. 18 | 19 | * Efficient storage. Only one copy of the string is held in memory, and 20 | getting access to it is just a pointer indirection. 21 | 22 | * Fast hashing ‒ the precomputed hash is stored with the string. 23 | 24 | * Fast FFI ‒ the string is stored with a terminating null byte so can be 25 | passed to C directly without doing the `CString` dance. 26 | 27 | The downside is no strings are ever freed, so if you're creating lots and lots 28 | of strings, you might run out of memory. On the other hand, *War and Peace* is 29 | only 3MB, so it's probably fine. 30 | 31 | This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/) 32 | (OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h) 33 | but it is *not* binary-compatible (yet). The underlying hash map implementation 34 | is directy ported from OIIO. 35 | 36 | ## Usage 37 | 38 | ```rust 39 | use ustr::{Ustr, ustr}; 40 | 41 | // Creation is quick and easy using either `Ustr::from` or the `ustr` short 42 | // function and only one copy of any string is stored. 43 | let h1 = Ustr::from("hello"); 44 | let h2 = ustr("hello"); 45 | 46 | // Comparisons and copies are extremely cheap. 47 | let h3 = h1; 48 | assert_eq!(h2, h3); 49 | 50 | // You can pass straight to FFI. 51 | let len = unsafe { 52 | libc::strlen(h1.as_char_ptr()) 53 | }; 54 | assert_eq!(len, 5); 55 | 56 | // For best performance when using Ustr as key for a HashMap or HashSet, 57 | // you'll want to use the precomputed hash. To make this easier, just use 58 | // the UstrMap and UstrSet exports: 59 | use ustr::UstrMap; 60 | 61 | // Key type is always Ustr. 62 | let mut map: UstrMap = UstrMap::default(); 63 | map.insert(u1, 17); 64 | assert_eq!(*map.get(&u1).unwrap(), 17); 65 | ``` 66 | 67 | By enabling the `"serde"` feature you can serialize individual `Ustr`s or 68 | the whole cache with serde. 69 | 70 | ```rust 71 | use ustr::{Ustr, ustr}; 72 | 73 | let u_ser = ustr("serialization is fun!"); 74 | let json = serde_json::to_string(&u_ser).unwrap(); 75 | let u_de : Ustr = serde_json::from_str(&json).unwrap(); 76 | 77 | assert_eq!(u_ser, u_de); 78 | ``` 79 | 80 | Since the cache is global, use the `ustr::DeserializedCache` dummy object to 81 | drive the deserialization. 82 | 83 | ```rust 84 | ustr("Send me to JSON and back"); 85 | let json = serde_json::to_string(ustr::cache()).unwrap(); 86 | 87 | // ... some time later ... 88 | let _: ustr::DeserializedCache = serde_json::from_str(&json).unwrap(); 89 | assert_eq!(ustr::num_entries(), 1); 90 | assert_eq!(ustr::string_cache_iter().collect::>(), vec!["Send me to JSON and back"]); 91 | 92 | ``` 93 | 94 | ## Calling from C/C++ 95 | 96 | If you are writing a library that uses ustr and want users to be able to create 97 | `Ustr`s to pass to your API from C, add `ustr_extern.rs` to your crate and use 98 | `include/ustr.h` or `include/ustr.hpp` for function declarations. 99 | 100 | ## Changelog 101 | 102 | ### Changes since 1.0.0 103 | 104 | * [Add a bunch of trait implementations that make it easier to work with stringy types](https://github.com/anderslanglands/ustr/pull/40). Thanks to @kornelski. 105 | * [Disable unused ahash features by default](https://github.com/anderslanglands/ustr/pull/46) to fix compilation on `wasm-unknown-unknown`. Thanks to @stephanemagnenat. 106 | 107 | * [Fix panic if string cache is empty](https://github.com/anderslanglands/ustr/pull/44) and assorted miri warnings and errors. Thanks to @orzogc. 108 | 109 | * [Bump versions of byteorder, lazy_static, ahash](https://github.com/anderslanglands/ustr/pull/49) and add additional documentation. Thanks to @virtualritz. 110 | 111 | 112 | ### Changes since 0.10 113 | 114 | * Actually renamed `serialization` feature to `serde` 115 | 116 | ### Changes since 0.9 117 | 118 | * Fixed and [issue](https://github.com/anderslanglands/ustr/issues/33) that 119 | would stop `Ustr` from working on `wasm32-unknown-unknown` (contributed by bouk) 120 | 121 | and thanks to virtualritz: 122 | 123 | * `Ustr::get_cache()` was [renamed](https://rust-lang.github.io/api-guidelines/naming.html#getter-names-follow-rust-convention-c-getter) 124 | to `cache()` 125 | 126 | * All dependencies were bumped to latest versions 127 | 128 | * All features were removed (there are good defaults) except for 129 | `serialization` 130 | 131 | * The `serialization` feature was [renamed](https://github.com/rust-lang/api-guidelines/discussions/180) 132 | to `serde` 133 | 134 | * `ustr` now uses Rust 2021 135 | 136 | ### Changes since 0.8 137 | 138 | * Add `existing_ustr` function (contributed by macprog-guy) 139 | 140 | The idea behind this is to allow the creation of a `Ustr` only when that 141 | `Ustr` already exists. This is particularly useful when `Ustr`s are being 142 | created using untrusted user input (say from a web server or API). In that 143 | case, by providing different values at each call we consume more and more 144 | memory eventually running out (DoS). 145 | 146 | * Add implementation for `Ord` (contributed by zigazeljko) 147 | 148 | * Inlined a bunch of simple functions (contributed by g-plane) 149 | 150 | * Fixed tests to lock rather than relying on `RUST_TEST_THREADS=1` (contributed 151 | by kornelski) 152 | 153 | * Fixed tests to handle serialization feature properly when enabled 154 | (contributed by kornelski) 155 | 156 | * Added a check for a potential allocation failure in the allocator 157 | (contributed by kornelski) 158 | 159 | * Added `FromStr` impl (contributed by martinmr) 160 | 161 | * Add `rustfmt.toml` to repo 162 | 163 | ### Changes since 0.7 164 | 165 | * Update dependencies 166 | 167 | The versions of `parking_lot` and `ahash` have been updated. 168 | 169 | * Space optimization with `NonNull` 170 | 171 | The internal pointer is now a `NonNull` to take advanatge of layout 172 | optimizations in `Option` etc. 173 | 174 | * Add `as_cstr()` method 175 | 176 | Added `as_cstr(&self) -> std::ffi::CStr` to make it easier to interface with 177 | APIs that rely on `CStr`. 178 | 179 | ### Changes since 0.6 180 | 181 | * Derive Ord for Ustr 182 | 183 | So now you can sort a `Vec` of `Ustr`s lexicographically. 184 | 185 | ### Changes since 0.5 186 | 187 | * Added `From` for `&str` 188 | 189 | This `impl` makes it easier to pass a `Ustr` to methods expecting an 190 | `Into<&str>`. 191 | 192 | ### Changes since 0.4 193 | 194 | * 32-bit support added 195 | 196 | Removed the restriction to 64-bit systems and fixed a bug relating to pointer 197 | maths. Thanks to agaussman for [bringing it up](https://github.com/anderslanglands/ustr/issues/8). 198 | 199 | * Miri leak checks re-enabled 200 | 201 | Thanks to RalfJung for pointing out that Miri now ignores ["leaks" from statics](https://github.com/anderslanglands/ustr/pull/9). 202 | 203 | * `PartialOrd` is now lexicographic 204 | * 205 | Thanks to macprog-guy for the PR implementing PartialOrd by deferring to 206 | `&str`. This will be slower than the previous derived implementation which 207 | just did a pointer comparison, but is much [less surprising](https://github.com/anderslanglands/ustr/pull/10). 208 | 209 | ### Changes since 0.3 210 | 211 | * Added Miri to CI tests 212 | 213 | Miri sanity-checks the unsafe parts of the code to guard against some types 214 | of UB. 215 | 216 | * Switched to [ahash](https://github.com/tkaitchuck/aHash) as the default 217 | hasher 218 | 219 | Ahash is a fast, non-cryptographic pure Rust hasher. Pure Rust is important 220 | to be able to run Miri and ahash benchmarks the fastest I could find. The old 221 | `fasthash`/`cityhash` is available by enabling `--features=hashcity` 222 | 223 | ### Changes since 0.2 224 | 225 | * Serde support 226 | 227 | `Ustr` can now be serialized with Serde when enabling 228 | `--features=serialization`. The global string cache can also be serialized if 229 | you really want to. 230 | 231 | * Switched to `parking_lot::Mutex` as default synchronization 232 | 233 | Spinlocks have been getting a bad rap recently so the string cache now uses 234 | `parking_lot::Mutex` as the default synchronization primitive. `spin::Mutex` 235 | is still available behind the `--features=spinlock` feature gate if you 236 | really want that extra 5% speed. 237 | 238 | * Cleaned up `unsafe` 239 | 240 | Did a better job of documenting the invariants for the unsafe blocks and 241 | replaced some blind additions with checked_add() and friends to avoid 242 | potential (but very unlikely) overflow. 243 | 244 | * Compared to `string-cache` 245 | 246 | [string-cache](https://github.com/servo/string-cache) provides a global cache 247 | that can be created at compile time as well as at run time. Dynamic strings 248 | in the cache appear to be reference-counted so will be freed when they are no 249 | longer used, while `Ustr`s are never deleted. 250 | 251 | Creating a `string_cache::DefaultAtom` is much slower than creating a `Ustr`, 252 | especially in a multi-threaded context. On the other hand if you can just 253 | bake all your `Atom`s into your binary at compile-time this wouldn't be an 254 | issue. 255 | 256 | * Compared to `string-interner` 257 | 258 | [string-interner](https://github.com/robbepop/string-interner) gives you 259 | individual `Interner` objects to work with rather than a global cache, which 260 | could be more flexible. It's faster to create than string-cache but still 261 | significantly slower than `Ustr`. 262 | 263 | ## Speed 264 | 265 | `Ustr`s are significantly faster to create than `string-interner` or 266 | `string-cache`. Creating 100,000 cycled copies of ~20,000 path strings of the 267 | form: 268 | 269 | ```text 270 | /cgi-bin/images/admin 271 | /modules/templates/cache 272 | /libraries/themes/wp-includes 273 | ... etc. 274 | ``` 275 | 276 | ![raft bench](ustring_bench_raft.png) 277 | 278 | ## Why? 279 | 280 | It is common in certain types of applications to use strings as identifiers, 281 | but not really do any processing with them. To paraphrase from OIIO's `ustring` 282 | documentation: 283 | 284 | Compared to standard strings, `Ustr`s have several advantages: 285 | 286 | * Each individual `Ustr` is very small -- in fact, we guarantee that a `Ustr` 287 | is the same size and memory layout as an ordinary *u8. 288 | 289 | * Storage is frugal, since there is only one allocated copy of each unique 290 | character sequence, throughout the lifetime of the program. 291 | 292 | * Assignment from one `Ustr` to another is just copy of the pointer; no 293 | allocation, no character copying, no reference counting. 294 | 295 | * Equality testing (do the strings contain the same characters) is a single 296 | operation, the comparison of the pointer. 297 | 298 | * Memory allocation only occurs when a new `Ustr` is constructed from raw 299 | characters the *first* time ‒ subsequent constructions of the same string 300 | just finds it in the canonial string set, but doesn't need to allocate new 301 | storage. Destruction of a `Ustr` is trivial, there is no de-allocation 302 | because the canonical version stays in the set. Also, therefore, no user 303 | code mistake can lead to memory leaks. 304 | 305 | But there are some problems, too. Canonical strings are never freed from the 306 | table. So in some sense all the strings "leak", but they only leak one copy 307 | for each unique string that the program ever comes across. Creating a `Ustr` 308 | is slower than `String::from()` on a single thread, and performance will be 309 | worse if trying to create many `Ustr`s in tight loops from multiple threads 310 | due to lock contention for the global cache. 311 | 312 | On the whole, `Ustr`s are a really great string representation 313 | 314 | * if you tend to have (relatively) few unique strings, but many copies of those 315 | strings; 316 | 317 | * if you tend to make the same strings over and over again, and if it's 318 | relatively rare that a single unique character sequence is used only once in 319 | the entire lifetime of the program; ‒ if your most common string operations 320 | are assignment and equality testing and you want them to be as fast as 321 | possible; 322 | 323 | * if you are doing relatively little character-by-character assembly of 324 | strings, string concatenation, or other "string manipulation" (other than 325 | equality testing). 326 | 327 | `Ustr`s are not so hot: 328 | 329 | * if your program tends to have very few copies of each character sequence over 330 | the entire lifetime of the program; 331 | 332 | * if your program tends to generate a huge variety of unique strings over its 333 | lifetime, each of which is used only a short time and then discarded, never 334 | to be needed again; 335 | 336 | * if you don't need to do a lot of string assignment or equality testing, but 337 | lots of more complex string manipulation. 338 | 339 | ## Safety and Compatibility 340 | 341 | This crate contains a significant amount of unsafe but usage has been checked 342 | and is well-documented. It is also run through Miri as part of the CI process. 343 | 344 | I use it regularly on 64-bit systems, and it has passed Miri on a 32-bit system 345 | as well, bit 32-bit is not checked regularly. If you want to use it on 32-bit, 346 | please make sure to run Miri and open and issue if you find any problems. 347 | 348 | ## License 349 | 350 | BSD+ License 351 | 352 | Copyright © 2019—2024 Anders Langlands 353 | 354 | Redistribution and use in source and binary forms, with or without 355 | modification, are permitted provided that the following conditions are met: 356 | 357 | 1. Redistributions of source code must retain the above copyright notice, this 358 | list of conditions and the following disclaimer. 359 | 360 | 2. Redistributions in binary form must reproduce the above copyright notice, 361 | this list of conditions and the following disclaimer in the documentation 362 | and/or other materials provided with the distribution. 363 | 364 | Subject to the terms and conditions of this license, each copyright holder and 365 | contributor hereby grants to those receiving rights under this license a 366 | perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable 367 | (except for failure to satisfy the conditions of this license) patent license 368 | to make, have made, use, offer to sell, sell, import, and otherwise transfer 369 | this software, where such license applies only to those patent claims, already 370 | acquired or hereafter acquired, licensable by such copyright holder or 371 | contributor that are necessarily infringed by: 372 | 373 | (a) their Contribution(s) (the licensed copyrights of copyright holders and 374 | non-copyrightable additions of contributors, in source or binary form) alone; 375 | or 376 | 377 | (b) combination of their Contribution(s) with the work of authorship to which 378 | such Contribution(s) was added by such copyright holder or contributor, if, at 379 | the time the Contribution is added, such addition causes such combination to be 380 | necessarily infringed. The patent license shall not apply to any other 381 | combinations which include the Contribution. 382 | 383 | Except as expressly stated above, no rights or licenses from any copyright 384 | holder or contributor is granted under this license, whether expressly, by 385 | implication, estoppel or otherwise. 386 | 387 | DISCLAIMER 388 | 389 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 390 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 391 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 392 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE 393 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 394 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 395 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 396 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 397 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 398 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 399 | 400 | Contains code ported from [OpenImageIO](https://github.com/OpenImageIO/oiio), 401 | BSD 3-clause license. 402 | 403 | Contains a copy of Max Woolf's [Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings), 404 | MIT license. 405 | 406 | Contains some strings from 407 | [SecLists](https://github.com/danielmiessler/SecLists), MIT license. 408 | -------------------------------------------------------------------------------- /benches/creation.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | use criterion::black_box; 4 | use criterion::Criterion; 5 | use crossbeam_channel::bounded; 6 | use crossbeam_utils::thread::scope; 7 | use std::sync::Arc; 8 | use string_cache::DefaultAtom; 9 | use string_interner::StringInterner; 10 | 11 | use ustr::*; 12 | 13 | use parking_lot::Mutex; 14 | 15 | fn criterion_benchmark(c: &mut Criterion) { 16 | let path = 17 | std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap()) 18 | .join("data") 19 | .join("raft-large-directories.txt"); 20 | let raft = std::fs::read_to_string(path).unwrap(); 21 | let raft = Arc::new( 22 | raft.split_whitespace() 23 | .collect::>() 24 | .chunks(3) 25 | .map(|s| { 26 | if s.len() == 3 { 27 | format!("{}/{}/{}", s[0], s[1], s[2]) 28 | } else { 29 | s[0].to_owned() 30 | } 31 | }) 32 | .collect::>(), 33 | ); 34 | 35 | let s = raft.clone(); 36 | c.bench_function("single raft ustr", move |b| { 37 | b.iter(|| { 38 | unsafe { ustr::_clear_cache() }; 39 | for s in s.iter().cycle().take(100_000) { 40 | black_box(ustr(s)); 41 | } 42 | }); 43 | }); 44 | 45 | let s = raft.clone(); 46 | c.bench_function("single raft string-interner", move |b| { 47 | b.iter(|| { 48 | let mut interner = StringInterner::default(); 49 | for s in s.iter().cycle().take(100_000) { 50 | black_box(interner.get_or_intern(s)); 51 | } 52 | }); 53 | }); 54 | 55 | let s = raft.clone(); 56 | c.bench_function("single raft string-cache", move |b| { 57 | b.iter(|| { 58 | let mut v = Vec::with_capacity(100_000); 59 | for s in s.iter().cycle().take(100_000) { 60 | v.push(DefaultAtom::from(s.as_str())); 61 | } 62 | black_box(v); 63 | }); 64 | }); 65 | 66 | let s = raft.clone(); 67 | c.bench_function("single raft String", move |b| { 68 | b.iter(|| { 69 | for s in s.iter().cycle().take(100_000) { 70 | black_box(String::from(s)); 71 | } 72 | }); 73 | }); 74 | 75 | let num = 100_000; 76 | 77 | for num_threads in [1, 2, 4, 6, 8, 12].iter() { 78 | let num_threads = *num_threads; 79 | 80 | let s = Arc::clone(&raft); 81 | c.bench_function( 82 | &format!("raft ustr x {} threads", num_threads), 83 | move |b| { 84 | let (tx1, rx1) = bounded(0); 85 | let (tx2, rx2) = bounded(0); 86 | let s = Arc::clone(&s); 87 | scope(|scope| { 88 | for tt in 0..num_threads { 89 | let t = tt; 90 | let rx1 = rx1.clone(); 91 | let tx2 = tx2.clone(); 92 | let s = Arc::clone(&s); 93 | scope.spawn(move |_| { 94 | while rx1.recv().is_ok() { 95 | for s in s.iter().cycle().skip(t * 17).take(num) 96 | { 97 | black_box(ustr(s)); 98 | } 99 | tx2.send(()).unwrap(); 100 | } 101 | }); 102 | } 103 | 104 | b.iter(|| { 105 | unsafe { ustr::_clear_cache() }; 106 | for _ in 0..num_threads { 107 | tx1.send(()).unwrap(); 108 | } 109 | 110 | for _ in 0..num_threads { 111 | rx2.recv().unwrap(); 112 | } 113 | }); 114 | drop(tx1); 115 | }) 116 | .unwrap(); 117 | }, 118 | ); 119 | 120 | let s = Arc::clone(&raft); 121 | c.bench_function( 122 | &format!("raft string-interner x {} threads", num_threads), 123 | move |b| { 124 | let (tx1, rx1) = bounded::< 125 | Arc>>, 126 | >(0); 127 | let (tx2, rx2) = bounded(0); 128 | scope(|scope| { 129 | for tt in 0..num_threads { 130 | let t = tt; 131 | let rx1 = rx1.clone(); 132 | let tx2 = tx2.clone(); 133 | let s = Arc::clone(&s); 134 | scope.spawn(move |_| { 135 | while let Ok(interner) = rx1.recv() { 136 | for s in s.iter().cycle().skip(t * 17).take(num) 137 | { 138 | let mut int = interner.lock(); 139 | black_box(int.get_or_intern(s)); 140 | } 141 | tx2.send(()).unwrap(); 142 | } 143 | }); 144 | } 145 | 146 | b.iter(|| { 147 | let interner = 148 | Arc::new(Mutex::new(StringInterner::default())); 149 | for _ in 0..num_threads { 150 | tx1.send(interner.clone()).unwrap(); 151 | } 152 | 153 | for _ in 0..num_threads { 154 | rx2.recv().unwrap(); 155 | } 156 | }); 157 | drop(tx1); 158 | }) 159 | .unwrap(); 160 | }, 161 | ); 162 | 163 | let s = Arc::clone(&raft); 164 | c.bench_function( 165 | &format!("raft string-cache x {} threads", num_threads), 166 | move |b| { 167 | let (tx1, rx1) = bounded(0); 168 | let (tx2, rx2) = bounded(0); 169 | scope(|scope| { 170 | for tt in 0..num_threads { 171 | let t = tt; 172 | let rx1 = rx1.clone(); 173 | let tx2 = tx2.clone(); 174 | let s = Arc::clone(&s); 175 | scope.spawn(move |_| { 176 | while rx1.recv().is_ok() { 177 | let mut v = Vec::with_capacity(num); 178 | for s in s.iter().cycle().skip(t * 17).take(num) 179 | { 180 | v.push(DefaultAtom::from(s.as_str())); 181 | } 182 | tx2.send(()).unwrap(); 183 | } 184 | }); 185 | } 186 | 187 | b.iter(|| { 188 | for _ in 0..num_threads { 189 | tx1.send(()).unwrap(); 190 | } 191 | 192 | for _ in 0..num_threads { 193 | rx2.recv().unwrap(); 194 | } 195 | }); 196 | drop(tx1); 197 | }) 198 | .unwrap(); 199 | }, 200 | ); 201 | 202 | let s = Arc::clone(&raft); 203 | c.bench_function( 204 | &format!("raft String::from x {} threads", num_threads), 205 | move |b| { 206 | let (tx1, rx1) = bounded(0); 207 | let (tx2, rx2) = bounded(0); 208 | scope(|scope| { 209 | for tt in 0..num_threads { 210 | let t = tt; 211 | let rx1 = rx1.clone(); 212 | let tx2 = tx2.clone(); 213 | let s = Arc::clone(&s); 214 | scope.spawn(move |_| { 215 | while rx1.recv().is_ok() { 216 | for s in s.iter().cycle().skip(t * 17).take(num) 217 | { 218 | black_box(String::from(s)); 219 | } 220 | tx2.send(()).unwrap(); 221 | } 222 | }); 223 | } 224 | 225 | b.iter(|| { 226 | for _ in 0..num_threads { 227 | tx1.send(()).unwrap(); 228 | } 229 | 230 | for _ in 0..num_threads { 231 | rx2.recv().unwrap(); 232 | } 233 | }); 234 | drop(tx1); 235 | }) 236 | .unwrap(); 237 | }, 238 | ); 239 | } 240 | 241 | let path = 242 | std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap()) 243 | .join("data") 244 | .join("raft-large-directories.txt"); 245 | let raft_large = std::fs::read_to_string(path).unwrap(); 246 | let raft_large = Arc::new( 247 | raft_large 248 | .split_whitespace() 249 | .collect::>() 250 | .chunks(11) 251 | .map(|s| { 252 | // if s.len() == 3 { 253 | // format!("{}/{}/{}", s[0], s[1], s[2]) 254 | // } else { 255 | // s[0].to_owned() 256 | // } 257 | s.join("/") 258 | }) 259 | .collect::>(), 260 | ); 261 | 262 | let s = raft_large.clone(); 263 | c.bench_function("raft large x1", move |b| { 264 | b.iter(|| { 265 | unsafe { ustr::_clear_cache() }; 266 | for s in s.iter().cycle().take(100_000) { 267 | black_box(ustr(s)); 268 | } 269 | }); 270 | }); 271 | 272 | let num_threads = 6; 273 | let s = raft_large.clone(); 274 | c.bench_function("raft large x6", move |b| { 275 | let (tx1, rx1) = bounded(0); 276 | let (tx2, rx2) = bounded(0); 277 | let s = Arc::clone(&s); 278 | scope(|scope| { 279 | for tt in 0..num_threads { 280 | let t = tt; 281 | let rx1 = rx1.clone(); 282 | let tx2 = tx2.clone(); 283 | let s = Arc::clone(&s); 284 | scope.spawn(move |_| { 285 | while rx1.recv().is_ok() { 286 | for s in s.iter().cycle().skip(t * 17).take(num) { 287 | black_box(ustr(s)); 288 | } 289 | tx2.send(()).unwrap(); 290 | } 291 | }); 292 | } 293 | 294 | b.iter(|| { 295 | unsafe { ustr::_clear_cache() }; 296 | for _ in 0..num_threads { 297 | tx1.send(()).unwrap(); 298 | } 299 | 300 | for _ in 0..num_threads { 301 | rx2.recv().unwrap(); 302 | } 303 | }); 304 | drop(tx1); 305 | }) 306 | .unwrap(); 307 | }); 308 | } 309 | 310 | criterion_group!( 311 | name = benches; 312 | config = Criterion::default().sample_size(30); 313 | targets = criterion_benchmark 314 | ); 315 | criterion_main!(benches); 316 | -------------------------------------------------------------------------------- /data/blns.txt: -------------------------------------------------------------------------------- 1 | # Reserved Strings 2 | # 3 | # Strings which may be used elsewhere in code 4 | 5 | undefined 6 | undef 7 | null 8 | NULL 9 | (null) 10 | nil 11 | NIL 12 | true 13 | false 14 | True 15 | False 16 | TRUE 17 | FALSE 18 | None 19 | hasOwnProperty 20 | then 21 | \ 22 | \\ 23 | 24 | # Numeric Strings 25 | # 26 | # Strings which can be interpreted as numeric 27 | 28 | 0 29 | 1 30 | 1.00 31 | $1.00 32 | 1/2 33 | 1E2 34 | 1E02 35 | 1E+02 36 | -1 37 | -1.00 38 | -$1.00 39 | -1/2 40 | -1E2 41 | -1E02 42 | -1E+02 43 | 1/0 44 | 0/0 45 | -2147483648/-1 46 | -9223372036854775808/-1 47 | -0 48 | -0.0 49 | +0 50 | +0.0 51 | 0.00 52 | 0..0 53 | . 54 | 0.0.0 55 | 0,00 56 | 0,,0 57 | , 58 | 0,0,0 59 | 0.0/0 60 | 1.0/0.0 61 | 0.0/0.0 62 | 1,0/0,0 63 | 0,0/0,0 64 | --1 65 | - 66 | -. 67 | -, 68 | 999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999 69 | NaN 70 | Infinity 71 | -Infinity 72 | INF 73 | 1#INF 74 | -1#IND 75 | 1#QNAN 76 | 1#SNAN 77 | 1#IND 78 | 0x0 79 | 0xffffffff 80 | 0xffffffffffffffff 81 | 0xabad1dea 82 | 123456789012345678901234567890123456789 83 | 1,000.00 84 | 1 000.00 85 | 1'000.00 86 | 1,000,000.00 87 | 1 000 000.00 88 | 1'000'000.00 89 | 1.000,00 90 | 1 000,00 91 | 1'000,00 92 | 1.000.000,00 93 | 1 000 000,00 94 | 1'000'000,00 95 | 01000 96 | 08 97 | 09 98 | 2.2250738585072011e-308 99 | 100 | # Special Characters 101 | # 102 | # ASCII punctuation. All of these characters may need to be escaped in some 103 | # contexts. Divided into three groups based on (US-layout) keyboard position. 104 | 105 | ,./;'[]\-= 106 | <>?:"{}|_+ 107 | !@#$%^&*()`~ 108 | 109 | # Non-whitespace C0 controls: U+0001 through U+0008, U+000E through U+001F, 110 | # and U+007F (DEL) 111 | # Often forbidden to appear in various text-based file formats (e.g. XML), 112 | # or reused for internal delimiters on the theory that they should never 113 | # appear in input. 114 | # The next line may appear to be blank or mojibake in some viewers. 115 |  116 | 117 | # Non-whitespace C1 controls: U+0080 through U+0084 and U+0086 through U+009F. 118 | # Commonly misinterpreted as additional graphic characters. 119 | # The next line may appear to be blank, mojibake, or dingbats in some viewers. 120 | €‚ƒ„†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ 121 | 122 | # Whitespace: all of the characters with category Zs, Zl, or Zp (in Unicode 123 | # version 8.0.0), plus U+0009 (HT), U+000B (VT), U+000C (FF), U+0085 (NEL), 124 | # and U+200B (ZERO WIDTH SPACE), which are in the C categories but are often 125 | # treated as whitespace in some contexts. 126 | # This file unfortunately cannot express strings containing 127 | # U+0000, U+000A, or U+000D (NUL, LF, CR). 128 | # The next line may appear to be blank or mojibake in some viewers. 129 | # The next line may be flagged for "trailing whitespace" in some viewers. 130 | …             ​

    131 | 132 | # Unicode additional control characters: all of the characters with 133 | # general category Cf (in Unicode 8.0.0). 134 | # The next line may appear to be blank or mojibake in some viewers. 135 | ­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪‫‬‭‮⁠⁡⁢⁣⁤⁦⁧⁨⁩𑂽𛲠𛲡𛲢𛲣𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺󠀁󠀠󠀡󠀢󠀣󠀤󠀥󠀦󠀧󠀨󠀩󠀪󠀫󠀬󠀭󠀮󠀯󠀰󠀱󠀲󠀳󠀴󠀵󠀶󠀷󠀸󠀹󠀺󠀻󠀼󠀽󠀾󠀿󠁀󠁁󠁂󠁃󠁄󠁅󠁆󠁇󠁈󠁉󠁊󠁋󠁌󠁍󠁎󠁏󠁐󠁑󠁒󠁓󠁔󠁕󠁖󠁗󠁘󠁙󠁚󠁛󠁜󠁝󠁞󠁟󠁠󠁡󠁢󠁣󠁤󠁥󠁦󠁧󠁨󠁩󠁪󠁫󠁬󠁭󠁮󠁯󠁰󠁱󠁲󠁳󠁴󠁵󠁶󠁷󠁸󠁹󠁺󠁻󠁼󠁽󠁾󠁿 136 | 137 | # "Byte order marks", U+FEFF and U+FFFE, each on its own line. 138 | # The next two lines may appear to be blank or mojibake in some viewers. 139 |  140 | ￾ 141 | 142 | # Unicode Symbols 143 | # 144 | # Strings which contain common unicode symbols (e.g. smart quotes) 145 | 146 | Ω≈ç√∫˜µ≤≥÷ 147 | åß∂ƒ©˙∆˚¬…æ 148 | œ∑´®†¥¨ˆøπ“‘ 149 | ¡™£¢∞§¶•ªº–≠ 150 | ¸˛Ç◊ı˜Â¯˘¿ 151 | ÅÍÎÏ˝ÓÔÒÚÆ☃ 152 | Œ„´‰ˇÁ¨ˆØ∏”’ 153 | `⁄€‹›fifl‡°·‚—± 154 | ⅛⅜⅝⅞ 155 | ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя 156 | ٠١٢٣٤٥٦٧٨٩ 157 | 158 | # Unicode Subscript/Superscript/Accents 159 | # 160 | # Strings which contain unicode subscripts/superscripts; can cause rendering issues 161 | 162 | ⁰⁴⁵ 163 | ₀₁₂ 164 | ⁰⁴⁵₀₁₂ 165 | ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ 166 | 167 | # Quotation Marks 168 | # 169 | # Strings which contain misplaced quotation marks; can cause encoding errors 170 | 171 | ' 172 | " 173 | '' 174 | "" 175 | '"' 176 | "''''"'" 177 | "'"'"''''" 178 | 179 | 180 | 181 | 182 | 183 | # Two-Byte Characters 184 | # 185 | # Strings which contain two-byte characters: can cause rendering issues or character-length issues 186 | 187 | 田中さんにあげて下さい 188 | パーティーへ行かないか 189 | 和製漢語 190 | 部落格 191 | 사회과학원 어학연구소 192 | 찦차를 타고 온 펲시맨과 쑛다리 똠방각하 193 | 社會科學院語學研究所 194 | 울란바토르 195 | 𠜎𠜱𠝹𠱓𠱸𠲖𠳏 196 | 197 | # Special Unicode Characters Union 198 | # 199 | # A super string recommended by VMware Inc. Globalization Team: can effectively cause rendering issues or character-length issues to validate product globalization readiness. 200 | # 201 | # 表 CJK_UNIFIED_IDEOGRAPHS (U+8868) 202 | # ポ KATAKANA LETTER PO (U+30DD) 203 | # あ HIRAGANA LETTER A (U+3042) 204 | # A LATIN CAPITAL LETTER A (U+0041) 205 | # 鷗 CJK_UNIFIED_IDEOGRAPHS (U+9DD7) 206 | # Œ LATIN SMALL LIGATURE OE (U+0153) 207 | # é LATIN SMALL LETTER E WITH ACUTE (U+00E9) 208 | # B FULLWIDTH LATIN CAPITAL LETTER B (U+FF22) 209 | # 逍 CJK_UNIFIED_IDEOGRAPHS (U+900D) 210 | # Ü LATIN SMALL LETTER U WITH DIAERESIS (U+00FC) 211 | # ß LATIN SMALL LETTER SHARP S (U+00DF) 212 | # ª FEMININE ORDINAL INDICATOR (U+00AA) 213 | # ą LATIN SMALL LETTER A WITH OGONEK (U+0105) 214 | # ñ LATIN SMALL LETTER N WITH TILDE (U+00F1) 215 | # 丂 CJK_UNIFIED_IDEOGRAPHS (U+4E02) 216 | # 㐀 CJK Ideograph Extension A, First (U+3400) 217 | # 𠀀 CJK Ideograph Extension B, First (U+20000) 218 | 219 | 表ポあA鷗ŒéB逍Üߪąñ丂㐀𠀀 220 | 221 | # Changing length when lowercased 222 | # 223 | # Characters which increase in length (2 to 3 bytes) when lowercased 224 | # Credit: https://twitter.com/jifa/status/625776454479970304 225 | 226 | Ⱥ 227 | Ⱦ 228 | 229 | # Japanese Emoticons 230 | # 231 | # Strings which consists of Japanese-style emoticons which are popular on the web 232 | 233 | ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ 234 | (。◕ ∀ ◕。) 235 | `ィ(´∀`∩ 236 | __ロ(,_,*) 237 | ・( ̄∀ ̄)・:*: 238 | ゚・✿ヾ╲(。◕‿◕。)╱✿・゚ 239 | ,。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’ 240 | (╯°□°)╯︵ ┻━┻) 241 | (ノಥ益ಥ)ノ ┻━┻ 242 | ┬─┬ノ( º _ ºノ) 243 | ( ͡° ͜ʖ ͡°) 244 | ¯\_(ツ)_/¯ 245 | 246 | # Emoji 247 | # 248 | # Strings which contain Emoji; should be the same behavior as two-byte characters, but not always 249 | 250 | 😍 251 | 👩🏽 252 | 👾 🙇 💁 🙅 🙆 🙋 🙎 🙍 253 | 🐵 🙈 🙉 🙊 254 | ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙 255 | ✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿 256 | 🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧 257 | 0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟 258 | 259 | # Regional Indicator Symbols 260 | # 261 | # Regional Indicator Symbols can be displayed differently across 262 | # fonts, and have a number of special behaviors 263 | 264 | 🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸 265 | 🇺🇸🇷🇺🇸🇦🇫🇦🇲 266 | 🇺🇸🇷🇺🇸🇦 267 | 268 | # Unicode Numbers 269 | # 270 | # Strings which contain unicode numbers; if the code is localized, it should see the input as numeric 271 | 272 | 123 273 | ١٢٣ 274 | 275 | # Right-To-Left Strings 276 | # 277 | # Strings which contain text that should be rendered RTL if possible (e.g. Arabic, Hebrew) 278 | 279 | ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو. 280 | בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ 281 | הָיְתָהtestالصفحات التّحول 282 | ﷽ 283 | ﷺ 284 | مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ، 285 | 286 | # Trick Unicode 287 | # 288 | # Strings which contain unicode with unusual properties (e.g. Right-to-left override) (c.f. http://www.unicode.org/charts/PDF/U2000.pdf) 289 | 290 | ‪‪test‪ 291 | ‫test‫ 292 | 
test
 293 | test⁠test‫ 294 | ⁦test⁧ 295 | 296 | # Zalgo Text 297 | # 298 | # Strings which contain "corrupted" text. The corruption will not appear in non-HTML text, however. (via http://www.eeemo.net) 299 | 300 | Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣ 301 | ̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰ 302 | ̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟ 303 | ̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕ 304 | Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮ 305 | 306 | # Unicode Upsidedown 307 | # 308 | # Strings which contain unicode with an "upsidedown" effect (via http://www.upsidedowntext.com) 309 | 310 | ˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥ 311 | 00˙Ɩ$- 312 | 313 | # Unicode font 314 | # 315 | # Strings which contain bold/italic/etc. versions of normal characters 316 | 317 | The quick brown fox jumps over the lazy dog 318 | 𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠 319 | 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 320 | 𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈 321 | 𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰 322 | 𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘 323 | 𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐 324 | ⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢ 325 | 326 | # Script Injection 327 | # 328 | # Strings which attempt to invoke a benign script injection; shows vulnerability to XSS 329 | 330 | 331 | <script>alert('123');</script> 332 | 333 | 334 | "> 335 | '> 336 | > 337 | 338 | < / script >< script >alert(123)< / script > 339 | onfocus=JaVaSCript:alert(123) autofocus 340 | " onfocus=JaVaSCript:alert(123) autofocus 341 | ' onfocus=JaVaSCript:alert(123) autofocus 342 | <script>alert(123)</script> 343 | ript>alert(123)ript> 344 | --> 345 | ";alert(123);t=" 346 | ';alert(123);t=' 347 | JavaSCript:alert(123) 348 | ;alert(123); 349 | src=JaVaSCript:prompt(132) 350 | ">javascript:alert(1); 356 | javascript:alert(1); 357 | javascript:alert(1); 358 | javascript:alert(1); 359 | javascript:alert(1); 360 | javascript:alert(1); 361 | javascript:alert(1); 362 | '`"><\x3Cscript>javascript:alert(1) 363 | '`"><\x00script>javascript:alert(1) 364 | ABC
DEF 365 | ABC
DEF 366 | ABC
DEF 367 | ABC
DEF 368 | ABC
DEF 369 | ABC
DEF 370 | ABC
DEF 371 | ABC
DEF 372 | ABC
DEF 373 | ABC
DEF 374 | ABC
DEF 375 | ABC
DEF 376 | ABC
DEF 377 | ABC
DEF 378 | ABC
DEF 379 | ABC
DEF 380 | ABC
DEF 381 | ABC
DEF 382 | ABC
DEF 383 | ABC
DEF 384 | ABC
DEF 385 | ABC
DEF 386 | ABC
DEF 387 | ABC
DEF 388 | ABC
DEF 389 | ABC
DEF 390 | ABC
DEF 391 | test 392 | test 393 | test 394 | test 395 | test 396 | test 397 | test 398 | test 399 | test 400 | test 401 | test 402 | test 403 | test 404 | test 405 | test 406 | test 407 | test 408 | test 409 | test 410 | test 411 | test 412 | test 413 | test 414 | test 415 | test 416 | test 417 | test 418 | test 419 | test 420 | test 421 | test 422 | test 423 | test 424 | test 425 | test 426 | test 427 | test 428 | test 429 | test 430 | test 431 | test 432 | test 433 | test 434 | test 435 | test 436 | test 437 | test 438 | test 439 | test 440 | test 441 | test 442 | test 443 | test 444 | test 445 | test 446 | test 447 | test 448 | `"'> 449 | `"'> 450 | `"'> 451 | `"'> 452 | `"'> 453 | `"'> 454 | `"'> 455 | `"'> 456 | `"'> 457 | `"'> 458 | "`'> 459 | "`'> 460 | "`'> 461 | "`'> 462 | "`'> 463 | "`'> 464 | "`'> 465 | "`'> 466 | "`'> 467 | "`'> 468 | "`'> 469 | "`'> 470 | "`'> 471 | "`'> 472 | "`'> 473 | "`'> 474 | "`'> 475 | "`'> 476 | "`'> 477 | "`'> 478 | "`'> 479 | "`'> 480 | "`'> 481 | "`'> 482 | "`'> 483 | "`'> 484 | "`'> 485 | "`'> 486 | "`'> 487 | "`'> 488 | "`'> 489 | "`'> 490 | "`'> 491 | "`'> 492 | "`'> 493 | "`'> 494 | "`'> 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | XXX 530 | 531 | 532 | 533 | <a href=http://foo.bar/#x=`y></a><img alt="`><img src=x:x onerror=javascript:alert(1)></a>"> 534 | <!--[if]><script>javascript:alert(1)</script --> 535 | <!--[if<img src=x onerror=javascript:alert(1)//]> --> 536 | <script src="/\%(jscript)s"></script> 537 | <script src="\\%(jscript)s"></script> 538 | <IMG """><SCRIPT>alert("XSS")</SCRIPT>"> 539 | <IMG SRC=javascript:alert(String.fromCharCode(88,83,83))> 540 | <IMG SRC=# onmouseover="alert('xxs')"> 541 | <IMG SRC= onmouseover="alert('xxs')"> 542 | <IMG onmouseover="alert('xxs')"> 543 | <IMG SRC=javascript:alert('XSS')> 544 | <IMG SRC=javascript:alert('XSS')> 545 | <IMG SRC=javascript:alert('XSS')> 546 | <IMG SRC="jav ascript:alert('XSS');"> 547 | <IMG SRC="jav ascript:alert('XSS');"> 548 | <IMG SRC="jav ascript:alert('XSS');"> 549 | <IMG SRC="jav ascript:alert('XSS');"> 550 | perl -e 'print "<IMG SRC=java\0script:alert(\"XSS\")>";' > out 551 | <IMG SRC="  javascript:alert('XSS');"> 552 | <SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT> 553 | <BODY onload!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")> 554 | <SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT> 555 | <<SCRIPT>alert("XSS");//<</SCRIPT> 556 | <SCRIPT SRC=http://ha.ckers.org/xss.js?< B > 557 | <SCRIPT SRC=//ha.ckers.org/.j> 558 | <IMG SRC="javascript:alert('XSS')" 559 | <iframe src=http://ha.ckers.org/scriptlet.html < 560 | \";alert('XSS');// 561 | <u oncopy=alert()> Copy me</u> 562 | <i onwheel=alert(1)> Scroll over me </i> 563 | <plaintext> 564 | http://a/%%30%30 565 | </textarea><script>alert(123)</script> 566 | 567 | # SQL Injection 568 | # 569 | # Strings which can cause a SQL injection if inputs are not sanitized 570 | 571 | 1;DROP TABLE users 572 | 1'; DROP TABLE users-- 1 573 | ' OR 1=1 -- 1 574 | ' OR '1'='1 575 | 576 | % 577 | _ 578 | 579 | # Server Code Injection 580 | # 581 | # Strings which can cause user to run code on server as a privileged user (c.f. https://news.ycombinator.com/item?id=7665153) 582 | 583 | - 584 | -- 585 | --version 586 | --help 587 | $USER 588 | /dev/null; touch /tmp/blns.fail ; echo 589 | `touch /tmp/blns.fail` 590 | $(touch /tmp/blns.fail) 591 | @{[system "touch /tmp/blns.fail"]} 592 | 593 | # Command Injection (Ruby) 594 | # 595 | # Strings which can call system commands within Ruby/Rails applications 596 | 597 | eval("puts 'hello world'") 598 | System("ls -al /") 599 | `ls -al /` 600 | Kernel.exec("ls -al /") 601 | Kernel.exit(1) 602 | %x('ls -al /') 603 | 604 | # XXE Injection (XML) 605 | # 606 | # String which can reveal system files when parsed by a badly configured XML parser 607 | 608 | <?xml version="1.0" encoding="ISO-8859-1"?><!DOCTYPE foo [ <!ELEMENT foo ANY ><!ENTITY xxe SYSTEM "file:///etc/passwd" >]><foo>&xxe;</foo> 609 | 610 | # Unwanted Interpolation 611 | # 612 | # Strings which can be accidentally expanded into different strings if evaluated in the wrong context, e.g. used as a printf format string or via Perl or shell eval. Might expose sensitive data from the program doing the interpolation, or might just represent the wrong string. 613 | 614 | $HOME 615 | $ENV{'HOME'} 616 | %d 617 | %s%s%s%s%s 618 | {0} 619 | %*.*s 620 | %@ 621 | %n 622 | File:/// 623 | 624 | # File Inclusion 625 | # 626 | # Strings which can cause user to pull in files that should not be a part of a web server 627 | 628 | ../../../../../../../../../../../etc/passwd%00 629 | ../../../../../../../../../../../etc/hosts 630 | 631 | # Known CVEs and Vulnerabilities 632 | # 633 | # Strings that test for known vulnerabilities 634 | 635 | () { 0; }; touch /tmp/blns.shellshock1.fail; 636 | () { _; } >_[$($())] { touch /tmp/blns.shellshock2.fail; } 637 | <<< %s(un='%s') = %u 638 | +++ATH0 639 | 640 | # MSDOS/Windows Special Filenames 641 | # 642 | # Strings which are reserved characters in MSDOS/Windows 643 | 644 | CON 645 | PRN 646 | AUX 647 | CLOCK$ 648 | NUL 649 | A: 650 | ZZ: 651 | COM1 652 | LPT1 653 | LPT2 654 | LPT3 655 | COM2 656 | COM3 657 | COM4 658 | 659 | # IRC specific strings 660 | # 661 | # Strings that may occur on IRC clients that make security products freak out 662 | 663 | DCC SEND STARTKEYLOGGER 0 0 0 664 | 665 | # Scunthorpe Problem 666 | # 667 | # Innocuous strings which may be blocked by profanity filters (https://en.wikipedia.org/wiki/Scunthorpe_problem) 668 | 669 | Scunthorpe General Hospital 670 | Penistone Community Church 671 | Lightwater Country Park 672 | Jimmy Clitheroe 673 | Horniman Museum 674 | shitake mushrooms 675 | RomansInSussex.co.uk 676 | http://www.cum.qc.ca/ 677 | Craig Cockburn, Software Specialist 678 | Linda Callahan 679 | Dr. Herman I. Libshitz 680 | magna cum laude 681 | Super Bowl XXX 682 | medieval erection of parapets 683 | evaluate 684 | mocha 685 | expression 686 | Arsenal canal 687 | classic 688 | Tyson Gay 689 | Dick Van Dyke 690 | basement 691 | 692 | # Human injection 693 | # 694 | # Strings which may cause human to reinterpret worldview 695 | 696 | If you're reading this, you've been in a coma for almost 20 years now. We're trying a new technique. We don't know where this message will end up in your dream, but we hope it works. Please wake up, we miss you. 697 | 698 | # Terminal escape codes 699 | # 700 | # Strings which punish the fools who use cat/type on this file 701 | 702 | Roses are red, violets are blue. Hope you enjoy terminal hue 703 | But now...for my greatest trick... 704 | The quick brown fox... [Beeeep] 705 | 706 | # iOS Vulnerabilities 707 | # 708 | # Strings which crashed iMessage in various versions of iOS 709 | 710 | Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗 711 | 🏳0🌈️ 712 | జ్ఞ‌ా 713 | -------------------------------------------------------------------------------- /include/ustr.h: -------------------------------------------------------------------------------- 1 | #ifndef __USTR_H__ 2 | #define __USTR_H__ 3 | 4 | #include <stddef.h> 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef struct { 11 | const char* ptr; 12 | } ustr_t; 13 | 14 | /* 15 | Create a new ustr_t from the given char*. 16 | It is assumed that `str` is a valid, non-null pointer. Passing anything else 17 | will result in undefined behaviour. 18 | Any invlid UTF-8 in `str` will be replaced by U+FFFD REPLACEMENT CHARACTER 19 | */ 20 | ustr_t ustr(const char* str); 21 | 22 | /* 23 | Returns the length of the given ustr_t in bytes. 24 | */ 25 | size_t ustr_len(ustr_t u); 26 | 27 | /* 28 | Returns the precomputed hash for the given ustr_t. 29 | */ 30 | uint64_t ustr_hash(ustr_t u); 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | 36 | #endif -------------------------------------------------------------------------------- /include/ustr.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __USTR_HPP__ 2 | #define __USTR_HPP__ 3 | 4 | #include "ustr.h" 5 | #include <string> 6 | 7 | /// A class representing an interned string. 8 | class Ustr { 9 | ustr_t _u; 10 | 11 | public: 12 | /// Creates the empty string 13 | Ustr() { _u = ustr(""); } 14 | 15 | /// Create a new Ustr from a const char* 16 | /// It is assumed that `str` is a valid, non-null pointer. Passing anything 17 | /// else will result in undefined behaviour. 18 | /// Any invlid UTF-8 in `str` will be replaced by U+FFFD REPLACEMENT 19 | /// CHARACTER 20 | Ustr(const char* ptr) { _u = ustr(ptr); } 21 | 22 | /// Create a new Ustr from a std::string 23 | Ustr(const std::string& s) { _u = ustr(s.c_str()); } 24 | 25 | /// Returns true if the string is empty 26 | bool is_empty() const { return len() == 0; } 27 | 28 | /// Returns the length of the string, in bytes. 29 | size_t len() const { return ustr_len(_u); } 30 | 31 | /// Returns the precomputed hash of the string 32 | size_t hash() const { return ustr_hash(_u); } 33 | 34 | /// Easy conversion to the underlying C struct 35 | operator ustr_t() const { return _u; } 36 | 37 | /// Get the interned chars 38 | const char* c_str() const { return _u.ptr; } 39 | }; 40 | 41 | #endif -------------------------------------------------------------------------------- /miri.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -ex 4 | 5 | export CARGO_NET_RETRY=5 6 | export CARGO_NET_TIMEOUT=10 7 | 8 | MIRI_NIGHTLY=nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/miri) 9 | echo "Installing latest nightly with Miri: $MIRI_NIGHTLY" 10 | rustup default "$MIRI_NIGHTLY" 11 | 12 | rustup component add miri 13 | cargo miri setup 14 | 15 | export RUST_TEST_THREADS=1 16 | cargo miri test --features=serde 17 | -------------------------------------------------------------------------------- /mutex_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anderslanglands/ustr/4c1fde840807f8af9d4bfd38805c3d89ac51baeb/mutex_comparison.png -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | edition = "2018" 2 | max_width = 80 3 | wrap_comments = true 4 | -------------------------------------------------------------------------------- /src/bumpalloc.rs: -------------------------------------------------------------------------------- 1 | use std::alloc::{GlobalAlloc, Layout, System}; 2 | 3 | // The world's dumbest allocator. Just keep bumping a pointer until we run out 4 | // of memory, in which case we abort. StringCache is responsible for creating 5 | // a new allocator when that's about to happen. 6 | // This is now bumping downward rather than up, which simplifies the allocate() 7 | // method and gives a small (5-7%) performance improvement in multithreaded 8 | // benchmarks 9 | // See https://fitzgeraldnick.com/2019/11/01/always-bump-downwards.html 10 | pub(crate) struct LeakyBumpAlloc { 11 | layout: Layout, 12 | start: *mut u8, 13 | end: *mut u8, 14 | ptr: *mut u8, 15 | } 16 | 17 | impl LeakyBumpAlloc { 18 | pub fn new(capacity: usize, alignment: usize) -> LeakyBumpAlloc { 19 | let layout = Layout::from_size_align(capacity, alignment).unwrap(); 20 | let start = unsafe { System.alloc(layout) }; 21 | if start.is_null() { 22 | panic!("oom"); 23 | } 24 | let end = unsafe { start.add(layout.size()) }; 25 | let ptr = end; 26 | LeakyBumpAlloc { 27 | layout, 28 | start, 29 | end, 30 | ptr, 31 | } 32 | } 33 | 34 | #[doc(hidden)] 35 | // used for resetting the cache between benchmark runs. DO NOT CALL THIS. 36 | pub unsafe fn clear(&mut self) { 37 | System.dealloc(self.start, self.layout); 38 | } 39 | 40 | // Allocates a new chunk. Aborts if out of memory. 41 | pub unsafe fn allocate(&mut self, num_bytes: usize) -> *mut u8 { 42 | // Our new ptr will be offset down the heap by num_bytes bytes. 43 | let ptr = self.ptr as usize; 44 | let new_ptr = ptr.checked_sub(num_bytes).expect("ptr sub overflowed"); 45 | // Round down to alignment. 46 | let new_ptr = new_ptr & !(self.layout.align() - 1); 47 | // Check we have enough capacity. 48 | let start = self.start as usize; 49 | if new_ptr < start { 50 | eprintln!( 51 | "Allocator asked to bump to {} bytes with a capacity of {}", 52 | self.end as usize - new_ptr, 53 | self.capacity() 54 | ); 55 | // We have to abort here rather than panic or the mutex may 56 | // deadlock. 57 | std::process::abort(); 58 | } 59 | 60 | self.ptr = self.ptr.sub(ptr - new_ptr); 61 | self.ptr 62 | } 63 | 64 | pub fn allocated(&self) -> usize { 65 | self.end as usize - self.ptr as usize 66 | } 67 | 68 | pub fn capacity(&self) -> usize { 69 | self.layout.size() 70 | } 71 | 72 | pub(crate) fn end(&self) -> *const u8 { 73 | self.end 74 | } 75 | 76 | pub(crate) fn ptr(&self) -> *const u8 { 77 | self.ptr 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/hash.rs: -------------------------------------------------------------------------------- 1 | use super::Ustr; 2 | use byteorder::{ByteOrder, NativeEndian}; 3 | use std::{ 4 | collections::{HashMap, HashSet}, 5 | hash::{BuildHasherDefault, Hasher}, 6 | }; 7 | 8 | /// A standard `HashMap` using `Ustr` as the key type with a custom `Hasher` 9 | /// that just uses the precomputed hash for speed instead of calculating it. 10 | pub type UstrMap<V> = HashMap<Ustr, V, BuildHasherDefault<IdentityHasher>>; 11 | 12 | /// A standard `HashSet` using `Ustr` as the key type with a custom `Hasher` 13 | /// that just uses the precomputed hash for speed instead of calculating it. 14 | pub type UstrSet = HashSet<Ustr, BuildHasherDefault<IdentityHasher>>; 15 | 16 | /// The worst hasher in the world -- the identity hasher. 17 | #[doc(hidden)] 18 | #[derive(Default)] 19 | pub struct IdentityHasher { 20 | hash: u64, 21 | } 22 | 23 | impl Hasher for IdentityHasher { 24 | #[inline] 25 | fn write(&mut self, bytes: &[u8]) { 26 | if bytes.len() == 8 { 27 | self.hash = NativeEndian::read_u64(bytes); 28 | } 29 | } 30 | 31 | #[inline] 32 | fn finish(&self) -> u64 { 33 | self.hash 34 | } 35 | } 36 | 37 | #[test] 38 | fn test_hashing() { 39 | let _t = super::TEST_LOCK.lock(); 40 | use crate::ustr as u; 41 | 42 | use std::hash::Hash; 43 | let u1 = u("the quick brown fox"); 44 | let u2 = u("jumped over the lazy dog"); 45 | 46 | let mut hasher = IdentityHasher::default(); 47 | u1.hash(&mut hasher); 48 | assert_eq!(hasher.finish(), u1.precomputed_hash()); 49 | 50 | let mut hasher = IdentityHasher::default(); 51 | u2.hash(&mut hasher); 52 | assert_eq!(hasher.finish(), u2.precomputed_hash()); 53 | 54 | let mut hm = UstrMap::<u32>::default(); 55 | hm.insert(u1, 17); 56 | hm.insert(u2, 42); 57 | 58 | assert_eq!(hm.get(&u1), Some(&17)); 59 | assert_eq!(hm.get(&u2), Some(&42)); 60 | } 61 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Fast, FFI-friendly string interning. A [`Ustr`] (**U**nique **Str**) is a 2 | //! lightweight handle representing a static, immutable entry in a global string 3 | //! cache, allowing for: 4 | //! 5 | //! * Extremely fast string assignment and comparisons -- it's just a pointer 6 | //! comparison. 7 | //! 8 | //! * Efficient storage -- only one copy of the string is held in memory, and 9 | //! getting access to it is just a pointer indirection. 10 | //! 11 | //! * Fast hashing -- the precomputed hash is stored with the string. 12 | //! 13 | //! * Fast FFI -- the string is stored with a terminating null byte so can be 14 | //! passed to C directly without doing the `CString` dance. 15 | //! 16 | //! The downside is no strings are ever freed, so if you're creating lots and 17 | //! lots of strings, you might run out of memory. On the other hand, War and 18 | //! Peace is only 3MB, so it's probably fine. 19 | //! 20 | //! This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/) 21 | //! (OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h) 22 | //! but it is *not* binary-compatible (yet). The underlying hash map 23 | //! implementation is directy ported from OIIO. 24 | //! 25 | //! # Usage 26 | //! 27 | //! ``` 28 | //! use ustr::{Ustr, ustr, ustr as u}; 29 | //! 30 | //! # unsafe { ustr::_clear_cache() }; 31 | //! // Creation is quick and easy using either `Ustr::from` or the ustr function 32 | //! // and only one copy of any string is stored. 33 | //! let u1 = Ustr::from("the quick brown fox"); 34 | //! let u2 = ustr("the quick brown fox"); 35 | //! 36 | //! // Comparisons and copies are extremely cheap. 37 | //! let u3 = u1; 38 | //! assert_eq!(u2, u3); 39 | //! 40 | //! // You can pass straight to FFI. 41 | //! let len = unsafe { 42 | //! libc::strlen(u1.as_char_ptr()) 43 | //! }; 44 | //! assert_eq!(len, 19); 45 | //! 46 | //! // Use as_str() to get a `str`. 47 | //! let words: Vec<&str> = u1.as_str().split_whitespace().collect(); 48 | //! assert_eq!(words, ["the", "quick", "brown", "fox"]); 49 | //! 50 | //! // For best performance when using Ustr as key for a HashMap or HashSet, 51 | //! // you'll want to use the precomputed hash. To make this easier, just use 52 | //! // the UstrMap and UstrSet exports: 53 | //! use ustr::UstrMap; 54 | //! 55 | //! // Key type is always `Ustr`. 56 | //! let mut map: UstrMap<usize> = UstrMap::default(); 57 | //! map.insert(u1, 17); 58 | //! assert_eq!(*map.get(&u1).unwrap(), 17); 59 | //! ``` 60 | //! 61 | //! By enabling the `"serde"` feature you can serialize individual `Ustr`s 62 | //! or the whole cache with serde. 63 | //! 64 | //! ``` 65 | //! # #[cfg(feature = "serde")] { 66 | //! use ustr::{Ustr, ustr}; 67 | //! let u_ser = ustr("serde"); 68 | //! let json = serde_json::to_string(&u_ser).unwrap(); 69 | //! let u_de : Ustr = serde_json::from_str(&json).unwrap(); 70 | //! assert_eq!(u_ser, u_de); 71 | //! # } 72 | //! ``` 73 | //! 74 | //! Since the cache is global, use the `ustr::DeserializedCache` dummy object to 75 | //! drive the deserialization. 76 | //! 77 | //! ``` 78 | //! # #[cfg(feature = "serde")] { 79 | //! use ustr::{Ustr, ustr}; 80 | //! ustr("Send me to JSON and back"); 81 | //! let json = serde_json::to_string(ustr::cache()).unwrap(); 82 | //! 83 | //! // ... some time later ... 84 | //! let _: ustr::DeserializedCache = serde_json::from_str(&json).unwrap(); 85 | //! assert_eq!(ustr::num_entries(), 1); 86 | //! assert_eq!(ustr::string_cache_iter().collect::<Vec<_>>(), vec!["Send me to JSON and back"]); 87 | //! # } 88 | //! ``` 89 | //! 90 | //! ## Why? 91 | //! 92 | //! It is common in certain types of applications to use strings as identifiers, 93 | //! but not really do any processing with them. 94 | //! To paraphrase from OIIO's `Ustring` documentation -- compared to standard 95 | //! strings, `Ustr`s have several advantages: 96 | //! 97 | //! - Each individual `Ustr` is very small -- in fact, we guarantee that a 98 | //! `Ustr` is the same size and memory layout as an ordinary `*u8`. 99 | //! 100 | //! - Storage is frugal, since there is only one allocated copy of each unique 101 | //! character sequence, throughout the lifetime of the program. 102 | //! 103 | //! - Assignment from one `Ustr` to another is just copy of the pointer; no 104 | //! allocation, no character copying, no reference counting. 105 | //! 106 | //! - Equality testing (do the strings contain the same characters) is a 107 | //! single operation, the comparison of the pointer. 108 | //! 109 | //! - Memory allocation only occurs when a new `Ustr` is constructed from raw 110 | //! characters the FIRST time -- subsequent constructions of the same string 111 | //! just finds it in the canonial string set, but doesn't need to allocate 112 | //! new storage. Destruction of a `Ustr` is trivial, there is no 113 | //! de-allocation because the canonical version stays in the set. Also, 114 | //! therefore, no user code mistake can lead to memory leaks. 115 | //! 116 | //! But there are some problems, too. Canonical strings are never freed 117 | //! from the table. So in some sense all the strings "leak", but they 118 | //! only leak one copy for each unique string that the program ever comes 119 | //! across. 120 | //! 121 | //! On the whole, `Ustr`s are a really great string representation 122 | //! 123 | //! - if you tend to have (relatively) few unique strings, but many copies of 124 | //! those strings; 125 | //! 126 | //! - if the creation of strings from raw characters is relatively rare 127 | //! compared to copying or comparing to existing strings; 128 | //! 129 | //! - if you tend to make the same strings over and over again, and if it's 130 | //! relatively rare that a single unique character sequence is used only 131 | //! once in the entire lifetime of the program; 132 | //! 133 | //! - if your most common string operations are assignment and equality 134 | //! testing and you want them to be as fast as possible; 135 | //! 136 | //! - if you are doing relatively little character-by-character assembly of 137 | //! strings, string concatenation, or other "string manipulation" (other 138 | //! than equality testing). 139 | //! 140 | //! `Ustr`s are not so hot 141 | //! 142 | //! - if your program tends to have very few copies of each character sequence 143 | //! over the entire lifetime of the program; 144 | //! 145 | //! - if your program tends to generate a huge variety of unique strings over 146 | //! its lifetime, each of which is used only a short time and then 147 | //! discarded, never to be needed again; 148 | //! 149 | //! - if you don't need to do a lot of string assignment or equality testing, 150 | //! but lots of more complex string manipulation. 151 | //! 152 | //! ## Safety and Compatibility 153 | //! 154 | //! This crate contains a significant amount of unsafe but usage has been 155 | //! checked and is well-documented. It is also run through Miri as part of the 156 | //! CI process. I use it regularly on 64-bit systems, and it has passed Miri on 157 | //! a 32-bit system as well, bit 32-bit is not checked regularly. If you want to 158 | //! use it on 32-bit, please make sure to run Miri and open and issue if you 159 | //! find any problems. 160 | use parking_lot::Mutex; 161 | use std::{ 162 | borrow::Cow, 163 | cmp::Ordering, 164 | ffi::{CStr, OsStr}, 165 | fmt, 166 | hash::{Hash, Hasher}, 167 | ops::Deref, 168 | os::raw::c_char, 169 | path::Path, 170 | ptr::NonNull, 171 | rc::Rc, 172 | slice, str, 173 | str::FromStr, 174 | sync::Arc, 175 | }; 176 | 177 | mod hash; 178 | pub use hash::*; 179 | mod bumpalloc; 180 | 181 | mod stringcache; 182 | pub use stringcache::*; 183 | #[cfg(feature = "serde")] 184 | pub mod serialization; 185 | #[cfg(feature = "serde")] 186 | pub use serialization::DeserializedCache; 187 | 188 | /// A handle representing a string in the global string cache. 189 | /// 190 | /// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can 191 | /// freely copy, destroy or send `Ustr`s to other threads: the underlying string 192 | /// is always valid in memory (and is never destroyed). 193 | #[derive(Copy, Clone, PartialEq)] 194 | #[repr(transparent)] 195 | pub struct Ustr { 196 | char_ptr: NonNull<u8>, 197 | } 198 | 199 | /// Defer to `str` for equality. 200 | /// 201 | /// Lexicographic ordering will be slower than pointer comparison, but much less 202 | /// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. 203 | impl Ord for Ustr { 204 | fn cmp(&self, other: &Self) -> Ordering { 205 | self.as_str().cmp(other.as_str()) 206 | } 207 | } 208 | 209 | /// Defer to `str` for equality. 210 | /// 211 | /// Lexicographic ordering will be slower thanpointer comparison, but much less 212 | /// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`. 213 | #[allow(clippy::non_canonical_partial_ord_impl)] 214 | impl PartialOrd for Ustr { 215 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { 216 | Some(self.cmp(other)) 217 | } 218 | } 219 | 220 | impl Ustr { 221 | /// Create a new `Ustr` from the given `str`. 222 | /// 223 | /// You can also use the [`ustr`] function. 224 | /// 225 | /// # Examples 226 | /// 227 | /// ``` 228 | /// use ustr::{Ustr, ustr as u}; 229 | /// # unsafe { ustr::_clear_cache() }; 230 | /// 231 | /// let u1 = Ustr::from("the quick brown fox"); 232 | /// let u2 = u("the quick brown fox"); 233 | /// assert_eq!(u1, u2); 234 | /// assert_eq!(ustr::num_entries(), 1); 235 | /// ``` 236 | pub fn from(string: &str) -> Ustr { 237 | let hash = { 238 | let mut hasher = ahash::AHasher::default(); 239 | hasher.write(string.as_bytes()); 240 | hasher.finish() 241 | }; 242 | let mut sc = STRING_CACHE.0[whichbin(hash)].lock(); 243 | Ustr { 244 | // SAFETY: sc.insert does not give back a null pointer 245 | char_ptr: unsafe { 246 | NonNull::new_unchecked(sc.insert(string, hash) as *mut _) 247 | }, 248 | } 249 | } 250 | 251 | pub fn from_existing(string: &str) -> Option<Ustr> { 252 | let hash = { 253 | let mut hasher = ahash::AHasher::default(); 254 | hasher.write(string.as_bytes()); 255 | hasher.finish() 256 | }; 257 | let sc = STRING_CACHE.0[whichbin(hash)].lock(); 258 | sc.get_existing(string, hash).map(|ptr| Ustr { 259 | char_ptr: unsafe { NonNull::new_unchecked(ptr as *mut _) }, 260 | }) 261 | } 262 | 263 | /// Get the cached `Ustr` as a `str`. 264 | /// 265 | /// # Examples 266 | /// 267 | /// ``` 268 | /// use ustr::ustr as u; 269 | /// # unsafe { ustr::_clear_cache() }; 270 | /// 271 | /// let u_fox = u("the quick brown fox"); 272 | /// let words: Vec<&str> = u_fox.as_str().split_whitespace().collect(); 273 | /// assert_eq!(words, ["the", "quick", "brown", "fox"]); 274 | /// ``` 275 | pub fn as_str(&self) -> &'static str { 276 | // This is safe if: 277 | // 1) self.char_ptr points to a valid address 278 | // 2) len is a usize stored usize aligned usize bytes before char_ptr 279 | // 3) char_ptr points to a valid UTF-8 string of len bytes. 280 | // All these are guaranteed by StringCache::insert() and by the fact 281 | // we can only construct a Ustr from a valid &str. 282 | unsafe { 283 | str::from_utf8_unchecked(slice::from_raw_parts( 284 | self.char_ptr.as_ptr(), 285 | self.len(), 286 | )) 287 | } 288 | } 289 | 290 | /// Get the cached string as a C `char*`. 291 | /// 292 | /// This includes the null terminator so is safe to pass straight to FFI. 293 | /// 294 | /// # Examples 295 | /// 296 | /// ``` 297 | /// use ustr::ustr as u; 298 | /// # unsafe { ustr::_clear_cache() }; 299 | /// 300 | /// let u_fox = u("the quick brown fox"); 301 | /// let len = unsafe { 302 | /// libc::strlen(u_fox.as_char_ptr()) 303 | /// }; 304 | /// assert_eq!(len, 19); 305 | /// ``` 306 | /// 307 | /// # Safety 308 | /// 309 | /// This is just passing a raw byte array with a null terminator to C. If 310 | /// your source string contains non-ascii bytes then this will pass them 311 | /// straight along with no checking. 312 | /// 313 | /// The string is **immutable**. That means that if you modify it across the 314 | /// FFI boundary then all sorts of terrible things will happen. 315 | pub fn as_char_ptr(&self) -> *const c_char { 316 | self.char_ptr.as_ptr() as *const c_char 317 | } 318 | 319 | /// Get this `Ustr` as a [`CStr`] 320 | /// 321 | /// This is useful for passing to APIs (like ash) that use `CStr`. 322 | /// 323 | /// # Safety 324 | /// 325 | /// This function by itself is safe as the pointer and length are guaranteed 326 | /// to be valid. All the same caveats for the use of the `CStr` as given in 327 | /// the `CStr` docs apply. 328 | pub fn as_cstr(&self) -> &CStr { 329 | unsafe { 330 | CStr::from_bytes_with_nul_unchecked(slice::from_raw_parts( 331 | self.as_ptr(), 332 | self.len() + 1, 333 | )) 334 | } 335 | } 336 | 337 | /// Get a raw pointer to the `StringCacheEntry`. 338 | #[inline] 339 | fn as_string_cache_entry(&self) -> &StringCacheEntry { 340 | // The allocator guarantees that the alignment is correct and that 341 | // this pointer is non-null 342 | unsafe { &*(self.char_ptr.as_ptr().cast::<StringCacheEntry>().sub(1)) } 343 | } 344 | 345 | /// Get the length (in bytes) of this string. 346 | #[inline] 347 | pub fn len(&self) -> usize { 348 | self.as_string_cache_entry().len 349 | } 350 | 351 | /// Returns true if the length is zero. 352 | pub fn is_empty(&self) -> bool { 353 | self.len() == 0 354 | } 355 | 356 | /// Get the precomputed hash for this string. 357 | #[inline] 358 | pub fn precomputed_hash(&self) -> u64 { 359 | self.as_string_cache_entry().hash 360 | } 361 | 362 | /// Get an owned String copy of this string. 363 | pub fn to_owned(&self) -> String { 364 | self.as_str().to_owned() 365 | } 366 | } 367 | 368 | // We're safe to impl these because the strings they reference are immutable 369 | // and for all intents and purposes 'static since they're never deleted after 370 | // being created 371 | unsafe impl Send for Ustr {} 372 | unsafe impl Sync for Ustr {} 373 | 374 | impl PartialEq<str> for Ustr { 375 | fn eq(&self, other: &str) -> bool { 376 | self.as_str() == other 377 | } 378 | } 379 | 380 | impl PartialEq<Ustr> for str { 381 | fn eq(&self, u: &Ustr) -> bool { 382 | self == u.as_str() 383 | } 384 | } 385 | 386 | impl PartialEq<&str> for Ustr { 387 | fn eq(&self, other: &&str) -> bool { 388 | self.as_str() == *other 389 | } 390 | } 391 | 392 | impl PartialEq<Ustr> for &str { 393 | fn eq(&self, u: &Ustr) -> bool { 394 | *self == u.as_str() 395 | } 396 | } 397 | 398 | impl PartialEq<&&str> for Ustr { 399 | fn eq(&self, other: &&&str) -> bool { 400 | self.as_str() == **other 401 | } 402 | } 403 | 404 | impl PartialEq<Ustr> for &&str { 405 | fn eq(&self, u: &Ustr) -> bool { 406 | **self == u.as_str() 407 | } 408 | } 409 | 410 | impl PartialEq<String> for Ustr { 411 | fn eq(&self, other: &String) -> bool { 412 | self.as_str() == other 413 | } 414 | } 415 | 416 | impl PartialEq<Ustr> for String { 417 | fn eq(&self, u: &Ustr) -> bool { 418 | self == u.as_str() 419 | } 420 | } 421 | 422 | impl PartialEq<&String> for Ustr { 423 | fn eq(&self, other: &&String) -> bool { 424 | self.as_str() == *other 425 | } 426 | } 427 | 428 | impl PartialEq<Ustr> for &String { 429 | fn eq(&self, u: &Ustr) -> bool { 430 | *self == u.as_str() 431 | } 432 | } 433 | 434 | impl PartialEq<Box<str>> for Ustr { 435 | fn eq(&self, other: &Box<str>) -> bool { 436 | self.as_str() == &**other 437 | } 438 | } 439 | 440 | impl PartialEq<Ustr> for Box<str> { 441 | fn eq(&self, u: &Ustr) -> bool { 442 | &**self == u.as_str() 443 | } 444 | } 445 | 446 | impl PartialEq<Ustr> for &Box<str> { 447 | fn eq(&self, u: &Ustr) -> bool { 448 | &***self == u.as_str() 449 | } 450 | } 451 | 452 | impl PartialEq<Cow<'_, str>> for Ustr { 453 | fn eq(&self, other: &Cow<'_, str>) -> bool { 454 | self.as_str() == &*other 455 | } 456 | } 457 | 458 | impl PartialEq<Ustr> for Cow<'_, str> { 459 | fn eq(&self, u: &Ustr) -> bool { 460 | &*self == u.as_str() 461 | } 462 | } 463 | 464 | impl PartialEq<&Cow<'_, str>> for Ustr { 465 | fn eq(&self, other: &&Cow<'_, str>) -> bool { 466 | self.as_str() == &**other 467 | } 468 | } 469 | 470 | impl PartialEq<Ustr> for &Cow<'_, str> { 471 | fn eq(&self, u: &Ustr) -> bool { 472 | &**self == u.as_str() 473 | } 474 | } 475 | 476 | impl PartialEq<Ustr> for Path { 477 | fn eq(&self, u: &Ustr) -> bool { 478 | self == Path::new(u) 479 | } 480 | } 481 | 482 | impl PartialEq<Ustr> for &Path { 483 | fn eq(&self, u: &Ustr) -> bool { 484 | *self == Path::new(u) 485 | } 486 | } 487 | 488 | impl PartialEq<Ustr> for OsStr { 489 | fn eq(&self, u: &Ustr) -> bool { 490 | self == OsStr::new(u) 491 | } 492 | } 493 | 494 | impl PartialEq<Ustr> for &OsStr { 495 | fn eq(&self, u: &Ustr) -> bool { 496 | *self == OsStr::new(u) 497 | } 498 | } 499 | 500 | impl Eq for Ustr {} 501 | 502 | impl<T: ?Sized> AsRef<T> for Ustr 503 | where 504 | str: AsRef<T>, 505 | { 506 | fn as_ref(&self) -> &T { 507 | self.as_str().as_ref() 508 | } 509 | } 510 | 511 | impl FromStr for Ustr { 512 | type Err = std::string::ParseError; 513 | 514 | #[inline] 515 | fn from_str(s: &str) -> Result<Self, Self::Err> { 516 | Ok(Ustr::from(s)) 517 | } 518 | } 519 | 520 | impl From<&str> for Ustr { 521 | fn from(s: &str) -> Ustr { 522 | Ustr::from(s) 523 | } 524 | } 525 | 526 | impl From<Ustr> for &'static str { 527 | fn from(s: Ustr) -> &'static str { 528 | s.as_str() 529 | } 530 | } 531 | 532 | impl From<Ustr> for String { 533 | fn from(u: Ustr) -> Self { 534 | String::from(u.as_str()) 535 | } 536 | } 537 | 538 | impl From<Ustr> for Box<str> { 539 | fn from(u: Ustr) -> Self { 540 | Box::from(u.as_str()) 541 | } 542 | } 543 | 544 | impl From<Ustr> for Rc<str> { 545 | fn from(u: Ustr) -> Self { 546 | Rc::from(u.as_str()) 547 | } 548 | } 549 | 550 | impl From<Ustr> for Arc<str> { 551 | fn from(u: Ustr) -> Self { 552 | Arc::from(u.as_str()) 553 | } 554 | } 555 | 556 | impl From<Ustr> for Cow<'static, str> { 557 | fn from(u: Ustr) -> Self { 558 | Cow::Borrowed(u.as_str()) 559 | } 560 | } 561 | 562 | impl From<String> for Ustr { 563 | fn from(s: String) -> Ustr { 564 | Ustr::from(&s) 565 | } 566 | } 567 | 568 | impl From<&String> for Ustr { 569 | fn from(s: &String) -> Ustr { 570 | Ustr::from(&**s) 571 | } 572 | } 573 | 574 | impl From<Box<str>> for Ustr { 575 | fn from(s: Box<str>) -> Ustr { 576 | Ustr::from(&*s) 577 | } 578 | } 579 | 580 | impl From<Rc<str>> for Ustr { 581 | fn from(s: Rc<str>) -> Ustr { 582 | Ustr::from(&*s) 583 | } 584 | } 585 | 586 | impl From<Arc<str>> for Ustr { 587 | fn from(s: Arc<str>) -> Ustr { 588 | Ustr::from(&*s) 589 | } 590 | } 591 | 592 | impl From<Cow<'_, str>> for Ustr { 593 | fn from(s: Cow<'_, str>) -> Ustr { 594 | Ustr::from(&*s) 595 | } 596 | } 597 | 598 | impl Default for Ustr { 599 | fn default() -> Self { 600 | Ustr::from("") 601 | } 602 | } 603 | 604 | impl Deref for Ustr { 605 | type Target = str; 606 | fn deref(&self) -> &Self::Target { 607 | self.as_str() 608 | } 609 | } 610 | 611 | impl fmt::Display for Ustr { 612 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 613 | write!(f, "{}", self.as_str()) 614 | } 615 | } 616 | 617 | impl fmt::Debug for Ustr { 618 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 619 | write!(f, "u!({:?})", self.as_str()) 620 | } 621 | } 622 | 623 | // Just feed the precomputed hash into the Hasher. Note that this will of course 624 | // be terrible unless the Hasher in question is expecting a precomputed hash. 625 | impl Hash for Ustr { 626 | fn hash<H: Hasher>(&self, state: &mut H) { 627 | self.precomputed_hash().hash(state); 628 | } 629 | } 630 | 631 | /// DO NOT CALL THIS. 632 | /// 633 | /// Clears the cache -- used for benchmarking and testing purposes to clear the 634 | /// cache. Calling this will invalidate any previously created `UStr`s and 635 | /// probably cause your house to burn down. DO NOT CALL THIS. 636 | /// 637 | /// # Safety 638 | /// 639 | /// DO NOT CALL THIS. 640 | #[doc(hidden)] 641 | pub unsafe fn _clear_cache() { 642 | for m in STRING_CACHE.0.iter() { 643 | m.lock().clear(); 644 | } 645 | } 646 | 647 | /// Returns the total amount of memory allocated and in use by the cache in 648 | /// bytes. 649 | pub fn total_allocated() -> usize { 650 | STRING_CACHE 651 | .0 652 | .iter() 653 | .map(|sc| { 654 | let t = sc.lock().total_allocated(); 655 | 656 | t 657 | }) 658 | .sum() 659 | } 660 | 661 | /// Returns the total amount of memory reserved by the cache in bytes. 662 | pub fn total_capacity() -> usize { 663 | STRING_CACHE 664 | .0 665 | .iter() 666 | .map(|sc| { 667 | let t = sc.lock().total_capacity(); 668 | t 669 | }) 670 | .sum() 671 | } 672 | 673 | /// Create a new `Ustr` from the given `str`. 674 | /// 675 | /// # Examples 676 | /// 677 | /// ``` 678 | /// use ustr::ustr; 679 | /// # unsafe { ustr::_clear_cache() }; 680 | /// 681 | /// let u1 = ustr("the quick brown fox"); 682 | /// let u2 = ustr("the quick brown fox"); 683 | /// assert_eq!(u1, u2); 684 | /// assert_eq!(ustr::num_entries(), 1); 685 | /// ``` 686 | #[inline] 687 | pub fn ustr(s: &str) -> Ustr { 688 | Ustr::from(s) 689 | } 690 | 691 | /// Create a new `Ustr` from the given `str` but only if it already exists in 692 | /// the string cache. 693 | /// 694 | /// # Examples 695 | /// 696 | /// ``` 697 | /// use ustr::{ustr, existing_ustr}; 698 | /// # unsafe { ustr::_clear_cache() }; 699 | /// 700 | /// let u1 = existing_ustr("the quick brown fox"); 701 | /// let u2 = ustr("the quick brown fox"); 702 | /// let u3 = existing_ustr("the quick brown fox"); 703 | /// assert_eq!(u1, None); 704 | /// assert_eq!(u3, Some(u2)); 705 | /// ``` 706 | #[inline] 707 | pub fn existing_ustr(s: &str) -> Option<Ustr> { 708 | Ustr::from_existing(s) 709 | } 710 | 711 | /// Utility function to get a reference to the main cache object for use with 712 | /// serialization. 713 | /// 714 | /// # Examples 715 | /// 716 | /// ``` 717 | /// # use ustr::{Ustr, ustr, ustr as u}; 718 | /// # #[cfg(feature="serde")] 719 | /// # { 720 | /// # unsafe { ustr::_clear_cache() }; 721 | /// ustr("Send me to JSON and back"); 722 | /// let json = serde_json::to_string(ustr::cache()).unwrap(); 723 | /// # } 724 | pub fn cache() -> &'static Bins { 725 | &STRING_CACHE 726 | } 727 | 728 | /// Returns the number of unique strings in the cache. 729 | /// 730 | /// This may be an underestimate if other threads are writing to the cache 731 | /// concurrently. 732 | /// 733 | /// # Examples 734 | /// 735 | /// ``` 736 | /// use ustr::ustr as u; 737 | /// 738 | /// let _ = u("Hello"); 739 | /// let _ = u(", World!"); 740 | /// assert_eq!(ustr::num_entries(), 2); 741 | /// ``` 742 | pub fn num_entries() -> usize { 743 | STRING_CACHE 744 | .0 745 | .iter() 746 | .map(|sc| { 747 | let t = sc.lock().num_entries(); 748 | t 749 | }) 750 | .sum() 751 | } 752 | 753 | #[doc(hidden)] 754 | pub fn num_entries_per_bin() -> Vec<usize> { 755 | STRING_CACHE 756 | .0 757 | .iter() 758 | .map(|sc| { 759 | let t = sc.lock().num_entries(); 760 | t 761 | }) 762 | .collect::<Vec<_>>() 763 | } 764 | 765 | /// Return an iterator over the entire string cache. 766 | /// 767 | /// If another thread is adding strings concurrently to this call then they 768 | /// might not show up in the view of the cache presented by this iterator. 769 | /// 770 | /// # Safety 771 | /// 772 | /// This returns an iterator to the state of the cache at the time when 773 | /// `string_cache_iter()` was called. It is of course possible that another 774 | /// thread will add more strings to the cache after this, but since we never 775 | /// destroy the strings, they remain valid, meaning it's safe to iterate over 776 | /// them, the list just might not be completely up to date. 777 | pub fn string_cache_iter() -> StringCacheIterator { 778 | let mut allocs = Vec::new(); 779 | for m in STRING_CACHE.0.iter() { 780 | let sc = m.lock(); 781 | // the start of the allocator's data is actually the ptr, start() just 782 | // points to the beginning of the allocated region. The first bytes will 783 | // be uninitialized since we're bumping down 784 | for a in &sc.old_allocs { 785 | allocs.push((a.ptr(), a.end())); 786 | } 787 | let ptr = sc.alloc.ptr(); 788 | let end = sc.alloc.end(); 789 | if ptr != end { 790 | allocs.push((sc.alloc.ptr(), sc.alloc.end())); 791 | } 792 | } 793 | 794 | let current_ptr = 795 | allocs.first().map(|s| s.0).unwrap_or_else(std::ptr::null); 796 | 797 | StringCacheIterator { 798 | allocs, 799 | current_alloc: 0, 800 | current_ptr, 801 | } 802 | } 803 | 804 | /// The type used for the global string cache. 805 | /// 806 | /// This is exposed to allow e.g. serialization of the data returned by the 807 | /// [`cache()`] function. 808 | #[repr(transparent)] 809 | pub struct Bins(pub(crate) [Mutex<StringCache>; NUM_BINS]); 810 | 811 | #[cfg(test)] 812 | lazy_static::lazy_static! { 813 | static ref TEST_LOCK: Mutex<()> = Mutex::new(()); 814 | } 815 | 816 | #[cfg(test)] 817 | mod tests { 818 | use super::TEST_LOCK; 819 | use lazy_static::lazy_static; 820 | use std::ffi::OsStr; 821 | use std::path::Path; 822 | use std::sync::Mutex; 823 | 824 | #[test] 825 | fn it_works() { 826 | let _t = TEST_LOCK.lock(); 827 | use super::ustr as u; 828 | 829 | let u_hello = u("hello"); 830 | assert_eq!(u_hello, "hello"); 831 | let u_world = u("world"); 832 | assert_eq!(u_world, String::from("world")); 833 | } 834 | 835 | #[test] 836 | fn empty_string() { 837 | let _t = TEST_LOCK.lock(); 838 | use super::ustr as u; 839 | 840 | unsafe { 841 | super::_clear_cache(); 842 | } 843 | 844 | let _empty = u(""); 845 | let empty = u(""); 846 | 847 | assert!(empty.as_str().is_empty()); 848 | assert_eq!(super::num_entries(), 1); 849 | } 850 | 851 | #[test] 852 | fn c_str_works() { 853 | let _t = TEST_LOCK.lock(); 854 | use super::ustr as u; 855 | use std::ffi::CStr; 856 | 857 | let s_fox = "The quick brown fox jumps over the lazy dog."; 858 | let u_fox = u(s_fox); 859 | let fox = unsafe { CStr::from_ptr(u_fox.as_char_ptr()) } 860 | .to_string_lossy() 861 | .into_owned(); 862 | assert_eq!(fox, s_fox); 863 | 864 | let s_odys = "Τη γλώσσα μου έδωσαν ελληνική"; 865 | let u_odys = u(s_odys); 866 | let odys = unsafe { CStr::from_ptr(u_odys.as_char_ptr()) } 867 | .to_string_lossy() 868 | .into_owned(); 869 | assert_eq!(odys, s_odys); 870 | } 871 | 872 | #[test] 873 | // We have to disable miri here as it's far too slow unfortunately 874 | #[cfg_attr(miri, ignore)] 875 | fn blns() { 876 | let _t = TEST_LOCK.lock(); 877 | use super::{string_cache_iter, ustr as u}; 878 | use std::collections::HashSet; 879 | 880 | // clear the cache first or our results will be wrong 881 | unsafe { super::_clear_cache() }; 882 | 883 | // let path = 884 | // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap()) 885 | // .join("data") 886 | // .join("blns.txt"); 887 | // let blns = std::fs::read_to_string(path).unwrap(); 888 | let blns = include_str!("../data/blns.txt"); 889 | 890 | let mut hs = HashSet::new(); 891 | for s in blns.split_whitespace() { 892 | hs.insert(s); 893 | } 894 | 895 | let mut us = Vec::new(); 896 | let mut ss = Vec::new(); 897 | 898 | for s in blns.split_whitespace().cycle().take(100_000) { 899 | let u = u(s); 900 | us.push(u); 901 | ss.push(s.to_owned()); 902 | } 903 | 904 | let mut hs_u = HashSet::new(); 905 | for s in string_cache_iter() { 906 | hs_u.insert(s); 907 | } 908 | let diff: HashSet<_> = hs.difference(&hs_u).collect(); 909 | 910 | // check that the number of entries is the same 911 | assert_eq!(super::num_entries(), hs.len()); 912 | 913 | // check that we have the exact same (unique) strings in the cache as in 914 | // the source data 915 | assert_eq!(diff.len(), 0); 916 | 917 | let nbs = super::num_entries_per_bin(); 918 | println!("{:?}", nbs); 919 | 920 | println!("Total allocated: {}", super::total_allocated()); 921 | println!("Total capacity: {}", super::total_capacity()); 922 | 923 | println!( 924 | "size of StringCache: {}", 925 | std::mem::size_of::<super::StringCache>() 926 | ); 927 | } 928 | 929 | #[test] 930 | // We have to disable miri here as it's far too slow unfortunately 931 | #[cfg_attr(miri, ignore)] 932 | fn raft() { 933 | let _t = TEST_LOCK.lock(); 934 | use super::ustr as u; 935 | use std::sync::Arc; 936 | 937 | // let path = 938 | // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap()) 939 | // .join("data") 940 | // .join("raft-large-directories.txt"); 941 | // let raft = std::fs::read_to_string(path).unwrap(); 942 | let raft = include_str!("../data/raft-large-directories.txt"); 943 | let raft = Arc::new( 944 | raft.split_whitespace() 945 | .collect::<Vec<_>>() 946 | .chunks(3) 947 | .map(|s| { 948 | if s.len() == 3 { 949 | format!("{}/{}/{}", s[0], s[1], s[2]) 950 | } else { 951 | s[0].to_owned() 952 | } 953 | }) 954 | .collect::<Vec<_>>(), 955 | ); 956 | 957 | let s = raft.clone(); 958 | for _ in 0..600 { 959 | let mut v = Vec::with_capacity(20_000); 960 | unsafe { super::_clear_cache() }; 961 | for s in s.iter().cycle().take(20_000) { 962 | v.push(u(s)); 963 | } 964 | } 965 | } 966 | 967 | // This test is to have miri check the allocation code paths, but miri 968 | // can't open files so it's not usable right now 969 | // #[test] 970 | // fn words() { 971 | // let _t = TEST_LOCK.lock(); 972 | // use super::ustr as u; 973 | // use std::sync::Arc; 974 | 975 | // let path = std::path::Path::new("/usr/share/dict/words"); 976 | // let wordlist = std::fs::read_to_string(path).unwrap(); 977 | // let wordlist = Arc::new( 978 | // wordlist 979 | // .split_whitespace() 980 | // .collect::<Vec<_>>() 981 | // .chunks(7) 982 | // .cycle() 983 | // .take(4_000_000) 984 | // .enumerate() 985 | // .map(|(i, s)| u(&format!("{}{}", i, s.join("-")))) 986 | // .collect::<Vec<_>>(), 987 | // ); 988 | // } 989 | 990 | #[cfg(all(feature = "serde", not(miri)))] 991 | #[test] 992 | fn serialization() { 993 | let _t = TEST_LOCK.lock(); 994 | use super::{string_cache_iter, ustr as u}; 995 | use std::collections::HashSet; 996 | 997 | // clear the cache first or our results will be wrong 998 | unsafe { super::_clear_cache() }; 999 | 1000 | let path = std::path::Path::new( 1001 | &std::env::var("CARGO_MANIFEST_DIR") 1002 | .expect("CARGO_MANIFEST_DIR not set"), 1003 | ) 1004 | .join("data") 1005 | .join("blns.txt"); 1006 | let blns = std::fs::read_to_string(path).unwrap(); 1007 | 1008 | let mut hs = HashSet::new(); 1009 | for s in blns.split_whitespace() { 1010 | hs.insert(s); 1011 | } 1012 | 1013 | let mut us = Vec::new(); 1014 | let mut ss = Vec::new(); 1015 | 1016 | for s in blns.split_whitespace().cycle().take(100_000) { 1017 | let u = u(s); 1018 | us.push(u); 1019 | ss.push(s.to_owned()); 1020 | } 1021 | 1022 | let json = serde_json::to_string(super::cache()).unwrap(); 1023 | unsafe { 1024 | super::_clear_cache(); 1025 | } 1026 | let _: super::DeserializedCache = serde_json::from_str(&json).unwrap(); 1027 | 1028 | // now check that we've got the same data in the cache still 1029 | let mut hs_u = HashSet::new(); 1030 | for s in string_cache_iter() { 1031 | hs_u.insert(s); 1032 | } 1033 | let diff: HashSet<_> = hs.difference(&hs_u).collect(); 1034 | 1035 | // check that the number of entries is the same 1036 | assert_eq!(super::num_entries(), hs.len()); 1037 | 1038 | // check that we have the exact same (unique) strings in the cache as in 1039 | // the source data 1040 | assert_eq!(diff.len(), 0); 1041 | } 1042 | 1043 | #[cfg(all(feature = "serde", not(miri)))] 1044 | #[test] 1045 | fn serialization_ustr() { 1046 | let _t = TEST_LOCK.lock(); 1047 | 1048 | use super::{ustr, Ustr}; 1049 | 1050 | let u_hello = ustr("hello"); 1051 | 1052 | let json = serde_json::to_string(&u_hello).unwrap(); 1053 | let me_hello: Ustr = serde_json::from_str(&json).unwrap(); 1054 | 1055 | assert_eq!(u_hello, me_hello); 1056 | } 1057 | 1058 | #[test] 1059 | fn partial_ord() { 1060 | let _t = TEST_LOCK.lock(); 1061 | use super::ustr; 1062 | let str_a = ustr("aaa"); 1063 | let str_z = ustr("zzz"); 1064 | let str_k = ustr("kkk"); 1065 | assert!(str_a < str_k); 1066 | assert!(str_k < str_z); 1067 | } 1068 | 1069 | #[test] 1070 | fn ord() { 1071 | let _t = TEST_LOCK.lock(); 1072 | use super::ustr; 1073 | let u_apple = ustr("apple"); 1074 | let u_bravo = ustr("bravo"); 1075 | let u_charlie = ustr("charlie"); 1076 | let u_delta = ustr("delta"); 1077 | 1078 | let mut v = vec![u_delta, u_bravo, u_charlie, u_apple]; 1079 | v.sort(); 1080 | assert_eq!(v, vec![u_apple, u_bravo, u_charlie, u_delta]); 1081 | } 1082 | 1083 | fn takes_into_str<'a, S: Into<&'a str>>(s: S) -> &'a str { 1084 | s.into() 1085 | } 1086 | 1087 | #[test] 1088 | fn test_into_str() { 1089 | let _t = TEST_LOCK.lock(); 1090 | use super::ustr; 1091 | 1092 | assert_eq!("converted", takes_into_str(ustr("converted"))); 1093 | } 1094 | 1095 | #[test] 1096 | fn test_existing_ustr() { 1097 | let _t = TEST_LOCK.lock(); 1098 | use super::{existing_ustr, ustr}; 1099 | assert_eq!(existing_ustr("hello world!"), None); 1100 | let s1 = ustr("hello world!"); 1101 | let s2 = existing_ustr("hello world!"); 1102 | assert_eq!(Some(s1), s2); 1103 | } 1104 | 1105 | #[test] 1106 | fn test_empty_cache() { 1107 | unsafe { super::_clear_cache() }; 1108 | assert_eq!( 1109 | super::string_cache_iter().collect::<Vec<_>>(), 1110 | Vec::<&'static str>::new() 1111 | ); 1112 | } 1113 | 1114 | #[test] 1115 | fn as_refs() { 1116 | let _t = TEST_LOCK.lock(); 1117 | 1118 | let u = super::ustr("test"); 1119 | 1120 | let s: String = u.to_owned(); 1121 | assert_eq!(u, s); 1122 | assert_eq!(s, u); 1123 | 1124 | let p: &Path = u.as_ref(); 1125 | assert_eq!(p, u); 1126 | 1127 | let _: &[u8] = u.as_ref(); 1128 | 1129 | let o: &OsStr = u.as_ref(); 1130 | assert_eq!(p, o); 1131 | assert_eq!(o, p); 1132 | 1133 | let cow = std::borrow::Cow::from(u); 1134 | assert_eq!(cow, u); 1135 | assert_eq!(u, cow); 1136 | 1137 | let boxed: Box<str> = u.into(); 1138 | assert_eq!(boxed, u); 1139 | } 1140 | } 1141 | 1142 | lazy_static::lazy_static! { 1143 | static ref STRING_CACHE: Bins = { 1144 | use std::mem::{self, MaybeUninit}; 1145 | // This deeply unsafe feeling dance allows us to initialize an array of 1146 | // arbitrary size and will have to tide us over until const generics 1147 | // land. See: 1148 | // https://doc.rust-lang.org/beta/std/mem/union.MaybeUninit.html#initializing-an-array-element-by-element 1149 | 1150 | // Create an uninitialized array of `MaybeUninit`. The `assume_init` is 1151 | // safe because the type we are claiming to have initialized here is a 1152 | // bunch of `MaybeUninit`s, which do not require initialization. 1153 | let mut bins: [MaybeUninit<Mutex<StringCache>>; NUM_BINS] = unsafe { 1154 | MaybeUninit::uninit().assume_init() 1155 | }; 1156 | 1157 | // Dropping a `MaybeUninit` does nothing. Thus using raw pointer 1158 | // assignment instead of `ptr::write` does not cause the old 1159 | // uninitialized value to be dropped. Also if there is a panic during 1160 | // this loop, we have a memory leak, but there is no memory safety 1161 | // issue. 1162 | for bin in &mut bins[..] { 1163 | *bin = MaybeUninit::new(Mutex::new(StringCache::default())); 1164 | } 1165 | 1166 | // Everything is initialized. Transmute the array to the 1167 | // initialized type. 1168 | unsafe { mem::transmute::<_, Bins>(bins) } 1169 | }; 1170 | } 1171 | 1172 | // Use the top bits of the hash to choose a bin 1173 | #[inline] 1174 | fn whichbin(hash: u64) -> usize { 1175 | ((hash >> TOP_SHIFT as u64) % NUM_BINS as u64) as usize 1176 | } 1177 | -------------------------------------------------------------------------------- /src/serialization.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | use serde::{ 3 | de::{Deserialize, Deserializer, Error, SeqAccess, Visitor}, 4 | ser::{Serialize, SerializeSeq, Serializer}, 5 | }; 6 | 7 | impl Serialize for Bins { 8 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 9 | where 10 | S: Serializer, 11 | { 12 | let strings: Vec<&'static str> = string_cache_iter().collect(); 13 | let mut seq = serializer.serialize_seq(Some(strings.len()))?; 14 | for s in strings { 15 | match seq.serialize_element(s) { 16 | Ok(_) => (), 17 | Err(e) => { 18 | panic!("Error serializing \"{}\": {}", s, e); 19 | } 20 | } 21 | } 22 | seq.end() 23 | } 24 | } 25 | 26 | pub struct BinsVisitor {} 27 | 28 | impl BinsVisitor { 29 | #[allow(clippy::new_without_default)] 30 | pub fn new() -> Self { 31 | BinsVisitor {} 32 | } 33 | } 34 | 35 | impl<'de> Visitor<'de> for BinsVisitor { 36 | type Value = DeserializedCache; 37 | 38 | fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { 39 | formatter.write_str("a sequence of strings") 40 | } 41 | 42 | fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error> 43 | where 44 | A: SeqAccess<'de>, 45 | { 46 | while let Some(s) = seq.next_element::<String>()? { 47 | ustr(&s); 48 | } 49 | 50 | Ok(DeserializedCache {}) 51 | } 52 | } 53 | 54 | pub struct DeserializedCache {} 55 | 56 | impl<'de> Deserialize<'de> for DeserializedCache { 57 | fn deserialize<D>(deserializer: D) -> Result<DeserializedCache, D::Error> 58 | where 59 | D: Deserializer<'de>, 60 | { 61 | deserializer.deserialize_seq(BinsVisitor::new()) 62 | } 63 | } 64 | 65 | pub struct UstrVisitor {} 66 | impl UstrVisitor { 67 | #[allow(clippy::new_without_default)] 68 | pub fn new() -> Self { 69 | UstrVisitor {} 70 | } 71 | } 72 | 73 | impl<'de> Visitor<'de> for UstrVisitor { 74 | type Value = Ustr; 75 | 76 | fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { 77 | formatter.write_str("a &str") 78 | } 79 | 80 | fn visit_str<E>(self, s: &str) -> Result<Self::Value, E> 81 | where 82 | E: Error, 83 | { 84 | Ok(Ustr::from(s)) 85 | } 86 | } 87 | 88 | impl<'de> Deserialize<'de> for Ustr { 89 | fn deserialize<D>(deserializer: D) -> Result<Ustr, D::Error> 90 | where 91 | D: Deserializer<'de>, 92 | { 93 | deserializer.deserialize_str(UstrVisitor::new()) 94 | } 95 | } 96 | 97 | impl Serialize for Ustr { 98 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 99 | where 100 | S: Serializer, 101 | { 102 | serializer.serialize_str(self.as_str()) 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/stringcache.rs: -------------------------------------------------------------------------------- 1 | use super::bumpalloc::LeakyBumpAlloc; 2 | 3 | // `StringCache` stores a `Vec` of pointers to the `StringCacheEntry` structs. 4 | // The actual memory for the `StringCacheEntry` is stored in the LeakyBumpAlloc, 5 | // and each `Alloc` is rotated out when it's full and a new one twice its size 6 | // is allocated. The Allocator memory is never freed so our strings essentialy 7 | // have a 'static lifetime. 8 | // 9 | // The actual memory representation is as follows. Each `StringCacheEntry` is 10 | // aligned to 8 bytes on a 64-bit system. The 64-bit memoized hash of the string 11 | // is stored first, then a usize length, then the u8 characters, followed by a 12 | // null terminator (not included in len), then x<8 bytes of uninitialized memory 13 | // as padding before the next aligned entry. 14 | // 15 | // hash len H e l l o , W o r l d !\0 16 | // |. . . . . . . .|. . . . . . . .|. . . . . . . .|. . . . . . . .| 17 | // 0 8 16 len 18 | // ^ StringCacheEntry ^ u8 chars ^ null ^ Next entry 19 | // 20 | // Proper alignment is guaranteed when allocating each entry as the alignment 21 | // is baked into the allocator. `StringCache` is responsible for monitoring the 22 | // Allocator and creating a new one when it would overflow -- the `Alloc` itself 23 | // will just `abort()` if it runs out of memory. Note that we abort() rather 24 | // than panic because the behaviour of the spinlock in case of a panic while 25 | // holding the lock is undefined. 26 | // 27 | // Thread safety is ensured because we can only access the `StringCache` through 28 | // the spinlock in the `lazy_static` ref. The initial capacity of the cache is 29 | // divided evenly among a number of 'bins' or shards each with their own lock, 30 | // in order to reduce contention. 31 | #[repr(align(128))] 32 | pub(crate) struct StringCache { 33 | pub(crate) alloc: LeakyBumpAlloc, 34 | pub(crate) old_allocs: Vec<LeakyBumpAlloc>, 35 | entries: Vec<*mut StringCacheEntry>, 36 | num_entries: usize, 37 | mask: usize, 38 | total_allocated: usize, 39 | // Padding and aligning to 128 bytes gives up to 20% performance 40 | // improvement this actually aligns to 256 bytes because of the Mutex 41 | // around it. 42 | _pad: [u32; 3], 43 | } 44 | 45 | // TODO: make these configurable? 46 | // Initial size of the StringCache table 47 | pub(crate) const INITIAL_CAPACITY: usize = 1 << 20; 48 | // Initial size of the allocator storage (in bytes) 49 | pub(crate) const INITIAL_ALLOC: usize = 4 << 20; 50 | // Number of bins (shards) for map 51 | pub(crate) const BIN_SHIFT: usize = 6; 52 | pub(crate) const NUM_BINS: usize = 1 << BIN_SHIFT; 53 | // Shift for top bits to determine bin a hash falls into 54 | pub(crate) const TOP_SHIFT: usize = 55 | 8 * std::mem::size_of::<usize>() - BIN_SHIFT; 56 | 57 | impl StringCache { 58 | /// Create a new StringCache with the given starting capacity 59 | pub fn new() -> StringCache { 60 | let capacity = INITIAL_CAPACITY / NUM_BINS; 61 | let alloc = LeakyBumpAlloc::new( 62 | INITIAL_ALLOC / NUM_BINS, 63 | std::mem::align_of::<StringCacheEntry>(), 64 | ); 65 | StringCache { 66 | // Current allocator. 67 | alloc, 68 | // Old allocators we'll keep around for iteration purposes. 69 | // 16 would mean we've allocated 128GB of string storage since we 70 | // double each time. 71 | old_allocs: Vec::with_capacity(16), 72 | // Vector of pointers to the `StringCacheEntry` headers. 73 | entries: vec![std::ptr::null_mut(); capacity], 74 | num_entries: 0, 75 | mask: capacity - 1, 76 | total_allocated: capacity, 77 | _pad: [0u32; 3], 78 | } 79 | } 80 | 81 | pub(crate) fn get_existing( 82 | &self, 83 | string: &str, 84 | hash: u64, 85 | ) -> Option<*const u8> { 86 | let mut pos = self.mask & hash as usize; 87 | let mut dist = 0; 88 | loop { 89 | let entry = unsafe { self.entries.get_unchecked(pos) }; 90 | if entry.is_null() { 91 | return None; 92 | } 93 | // This is safe as long as entry points to a valid address and the 94 | // layout described in the `StringCache` doc comment holds. 95 | unsafe { 96 | // entry is a `*StringCacheEntry` so offseting by 1 gives us a 97 | // pointer to the end of the entry, aka the beginning of the 98 | // chars. 99 | // As long as the memory is valid and the layout is correct, 100 | // we're safe to create a string slice from the chars since 101 | // they were copied directly from a valid `str`. 102 | let entry_chars = entry.add(1) as *const u8; 103 | // if entry is non-null then it must point to a valid 104 | // StringCacheEntry 105 | let sce = &**entry; 106 | if sce.hash == hash 107 | && sce.len == string.len() 108 | && std::str::from_utf8_unchecked( 109 | std::slice::from_raw_parts(entry_chars, sce.len), 110 | ) == string 111 | { 112 | // found matching string in the cache already, return it 113 | return Some(entry_chars); 114 | } 115 | } 116 | 117 | // Keep looking. 118 | dist += 1; 119 | debug_assert!(dist <= self.mask); 120 | pos = (pos + dist) & self.mask; 121 | } 122 | } 123 | 124 | // Insert the given string with its given hash into the cache. 125 | pub(crate) fn insert(&mut self, string: &str, hash: u64) -> *const u8 { 126 | let mut pos = self.mask & hash as usize; 127 | let mut dist = 0; 128 | loop { 129 | let entry = unsafe { self.entries.get_unchecked(pos) }; 130 | if entry.is_null() { 131 | // found empty slot to insert 132 | break; 133 | } 134 | 135 | // This is safe as long as entry points to a valid address and the 136 | // layout described in the `StringCache` doc comment holds. 137 | unsafe { 138 | // entry is a `*StringCacheEntry` so offseting by 1 gives us a 139 | // pointer to the end of the entry, aka the beginning of the 140 | // chars. 141 | // As long as the memory is valid and the layout is correct, 142 | // we're safe to create a string slice from the chars since 143 | // they were copied directly from a valid `str`. 144 | let entry_chars = entry.add(1) as *const u8; 145 | // If entry is non-null then it must point to a valid 146 | // `StringCacheEntry`. 147 | let sce = &**entry; 148 | if sce.hash == hash 149 | && sce.len == string.len() 150 | && std::str::from_utf8_unchecked( 151 | std::slice::from_raw_parts(entry_chars, sce.len), 152 | ) == string 153 | { 154 | // found matching string in the cache already, return it 155 | return entry_chars; 156 | } 157 | } 158 | 159 | // keep looking 160 | dist += 1; 161 | debug_assert!(dist <= self.mask); 162 | pos = (pos + dist) & self.mask; 163 | } 164 | 165 | // 166 | // Insert the new string. 167 | // 168 | 169 | // We know pos is in bounds as it's &ed with the mask above. 170 | let entry_ptr = unsafe { self.entries.get_unchecked_mut(pos) }; 171 | // Ddd one to length for null byte. 172 | // There's no way we could overflow here in practice since that would 173 | // require having allocated a `u64::MAX`-length string, by which time 174 | // we'll be using 128-bit pointers and we'll need to rewrite this 175 | // crate anyway. 176 | let byte_len = string.len() + 1; 177 | let alloc_size = std::mem::size_of::<StringCacheEntry>() + byte_len; 178 | 179 | // if our new allocation would spill over the allocator, make a new 180 | // allocator and let the old one leak 181 | let capacity = self.alloc.capacity(); 182 | let allocated = self.alloc.allocated(); 183 | if alloc_size 184 | .checked_add(allocated) 185 | .expect("overflowed alloc_size + allocated") 186 | > capacity 187 | { 188 | let new_capacity = capacity 189 | .checked_mul(2) 190 | .expect("capacity * 2 overflowed") 191 | .max(alloc_size); 192 | let old_alloc = std::mem::replace( 193 | &mut self.alloc, 194 | LeakyBumpAlloc::new( 195 | new_capacity, 196 | std::mem::align_of::<StringCacheEntry>(), 197 | ), 198 | ); 199 | self.old_allocs.push(old_alloc); 200 | self.total_allocated += new_capacity; 201 | } 202 | 203 | // This is safe as long as: 204 | // 1. `alloc_size` is calculated correctly. 205 | // 2. there is enough space in the allocator (checked in the block 206 | // above). 207 | // 3. The `StringCacheEntry` layout descibed above holds and the memory 208 | // returned by allocate() is prooperly aligned. 209 | unsafe { 210 | *entry_ptr = 211 | self.alloc.allocate(alloc_size) as *mut StringCacheEntry; 212 | 213 | // Write the header. 214 | // `entry_ptr` is guaranteed to point to a valid `StringCacheEntry`, 215 | // or `alloc.allocate()` would have aborted. 216 | std::ptr::write( 217 | *entry_ptr, 218 | StringCacheEntry { 219 | hash, 220 | len: string.len(), 221 | }, 222 | ); 223 | // Write the characters after the `StringCacheEntry`. 224 | let char_ptr = entry_ptr.add(1) as *mut u8; 225 | std::ptr::copy_nonoverlapping( 226 | string.as_bytes().as_ptr(), 227 | char_ptr, 228 | string.len(), 229 | ); 230 | // Write the trailing null. 231 | let write_ptr = char_ptr.add(string.len()); 232 | std::ptr::write(write_ptr, 0u8); 233 | 234 | self.num_entries += 1; 235 | // We want to keep an 0.5 load factor for the map, so grow if we've 236 | // exceeded that. 237 | if self.num_entries * 2 > self.mask { 238 | self.grow(); 239 | } 240 | 241 | char_ptr 242 | } 243 | } 244 | 245 | // Double the size of the map storage. 246 | // 247 | // This is safe as long as: 248 | // - The in-memory layout of the `StringCacheEntry` is correct. 249 | // 250 | // If there's not enough memory for the new entry table, it will just abort 251 | pub(crate) unsafe fn grow(&mut self) { 252 | let new_mask = self.mask * 2 + 1; 253 | 254 | let mut new_entries: std::vec::Vec<*mut StringCacheEntry> = 255 | vec![std::ptr::null_mut(); new_mask + 1]; 256 | 257 | // copy the existing map into the new map 258 | let mut to_copy = self.num_entries; 259 | for e in self.entries.iter_mut() { 260 | if e.is_null() { 261 | continue; 262 | } 263 | 264 | // Start of the entry is the hash. 265 | let hash = *(*e as *const u64); 266 | let mut pos = (hash as usize) & new_mask; 267 | let mut dist = 0; 268 | loop { 269 | if new_entries[pos].is_null() { 270 | // Here's an empty slot to put the pointer in. 271 | break; 272 | } 273 | 274 | dist += 1; 275 | // This should be impossble as we've allocated twice as many 276 | // slots as we have entries. 277 | debug_assert!(dist <= new_mask, "Probing wrapped around"); 278 | pos = pos.wrapping_add(dist) & new_mask; 279 | } 280 | 281 | new_entries[pos] = *e; 282 | to_copy -= 1; 283 | if to_copy == 0 { 284 | break; 285 | } 286 | } 287 | 288 | self.entries = new_entries; 289 | self.mask = new_mask; 290 | } 291 | 292 | // This is only called by `clear()` during tests to clear the cache between 293 | // runs. **DO NOT CALL THIS**. 294 | pub(crate) unsafe fn clear(&mut self) { 295 | // just zero all the pointers that have already been set 296 | std::ptr::write_bytes(self.entries.as_mut_ptr(), 0, self.mask + 1); 297 | self.num_entries = 0; 298 | self.total_allocated = 0; 299 | for a in self.old_allocs.iter_mut() { 300 | a.clear(); 301 | } 302 | self.old_allocs = Vec::new(); 303 | self.alloc.clear(); 304 | self.alloc = LeakyBumpAlloc::new( 305 | INITIAL_ALLOC / NUM_BINS, 306 | std::mem::align_of::<StringCacheEntry>(), 307 | ); 308 | } 309 | 310 | pub(crate) fn total_allocated(&self) -> usize { 311 | self.alloc.allocated() 312 | + self.old_allocs.iter().map(|a| a.allocated()).sum::<usize>() 313 | } 314 | 315 | pub(crate) fn total_capacity(&self) -> usize { 316 | self.alloc.capacity() 317 | + self.old_allocs.iter().map(|a| a.capacity()).sum::<usize>() 318 | } 319 | 320 | pub(crate) fn num_entries(&self) -> usize { 321 | self.num_entries 322 | } 323 | } 324 | 325 | impl Default for StringCache { 326 | fn default() -> StringCache { 327 | StringCache::new() 328 | } 329 | } 330 | 331 | // We are safe to be `Send` but not `Sync` (we get Sync by wrapping in a mutex). 332 | unsafe impl Send for StringCache {} 333 | 334 | #[doc(hidden)] 335 | pub struct StringCacheIterator { 336 | pub(crate) allocs: Vec<(*const u8, *const u8)>, 337 | pub(crate) current_alloc: usize, 338 | pub(crate) current_ptr: *const u8, 339 | } 340 | 341 | fn round_up_to(n: usize, align: usize) -> usize { 342 | debug_assert!(align.is_power_of_two()); 343 | (n.checked_add(align).expect("round_up_to overflowed") - 1) & !(align - 1) 344 | } 345 | 346 | impl Iterator for StringCacheIterator { 347 | type Item = &'static str; 348 | fn next(&mut self) -> Option<Self::Item> { 349 | // check that the cache is not empty before accessing 350 | if self.allocs.is_empty() { 351 | return None; 352 | } 353 | 354 | let (_, end) = self.allocs[self.current_alloc]; 355 | if self.current_ptr >= end { 356 | // We've reached the end of the current alloc. 357 | if self.current_alloc == self.allocs.len() - 1 { 358 | // We've reached the end. 359 | return None; 360 | } else { 361 | // Advance to the next alloc. 362 | self.current_alloc += 1; 363 | let (current_ptr, _) = self.allocs[self.current_alloc]; 364 | self.current_ptr = current_ptr; 365 | } 366 | } 367 | 368 | // Cast the current ptr to a `StringCacheEntry` and create the next 369 | // string from it. 370 | unsafe { 371 | let sce = &*(self.current_ptr as *const StringCacheEntry); 372 | // The next entry will be the size of the number of bytes in the 373 | // string, +1 for the null byte, rounded up to the alignment (8). 374 | self.current_ptr = sce.next_entry(); 375 | 376 | // We know we're safe not to check here since we put valid UTF-8 in. 377 | let s = std::str::from_utf8_unchecked(std::slice::from_raw_parts( 378 | sce.char_ptr(), 379 | sce.len, 380 | )); 381 | Some(s) 382 | } 383 | } 384 | } 385 | 386 | #[repr(C)] 387 | #[derive(Clone)] 388 | pub(crate) struct StringCacheEntry { 389 | pub(crate) hash: u64, 390 | pub(crate) len: usize, 391 | } 392 | 393 | impl StringCacheEntry { 394 | // Get the pointer to the characters. 395 | pub(crate) fn char_ptr(&self) -> *const u8 { 396 | // We know the chars are always directly after this struct in memory 397 | // because that's the way they're laid out on initialization. 398 | unsafe { (self as *const StringCacheEntry).add(1) as *const u8 } 399 | } 400 | 401 | // Calcualte the address of the next entry in the cache. This is a utility 402 | // function to hide the pointer arithmetic in iterators. 403 | pub(crate) unsafe fn next_entry(&self) -> *const u8 { 404 | #[allow(clippy::ptr_offset_with_cast)] 405 | self.char_ptr().add(round_up_to( 406 | self.len + 1, 407 | std::mem::align_of::<StringCacheEntry>(), 408 | )) 409 | } 410 | } -------------------------------------------------------------------------------- /src/ustr_extern.rs: -------------------------------------------------------------------------------- 1 | use ustr::Ustr; 2 | 3 | #[no_mangle] 4 | pub extern "C" fn ustr(chars: *const std::os::raw::c_char) -> Ustr { 5 | let cs = unsafe { std::ffi::CStr::from_ptr(chars).to_string_lossy() }; 6 | Ustr::from(&cs) 7 | } 8 | 9 | #[no_mangle] 10 | pub extern "C" fn ustr_len(u: Ustr) -> usize { 11 | u.len() 12 | } 13 | 14 | #[no_mangle] 15 | pub extern "C" fn ustr_hash(u: Ustr) -> u64 { 16 | u.precomputed_hash() 17 | } 18 | -------------------------------------------------------------------------------- /ustring_bench_raft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anderslanglands/ustr/4c1fde840807f8af9d4bfd38805c3d89ac51baeb/ustring_bench_raft.png --------------------------------------------------------------------------------