├── .gitignore
├── .travis.yml
├── Cargo.toml
├── LICENSE.md
├── README.md
├── benches
    └── creation.rs
├── data
    ├── blns.txt
    └── raft-large-directories.txt
├── include
    ├── ustr.h
    └── ustr.hpp
├── miri.sh
├── mutex_comparison.png
├── rustfmt.toml
├── src
    ├── bumpalloc.rs
    ├── hash.rs
    ├── lib.rs
    ├── serialization.rs
    ├── stringcache.rs
    └── ustr_extern.rs
└── ustring_bench_raft.png


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | Cargo.lock
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # dist: xenial
 2 | 
 3 | language: rust
 4 | rust: nightly
 5 | 
 6 | matrix:
 7 |   fast_finish: true
 8 |   include:
 9 |   # Miri
10 |     - name: "miri"
11 |       env: TARGET=x86_64-unknown-linux-gnu
12 |       script: sh miri.sh
13 |   # Tier 1 targets:
14 |     - name: "x86_64-unknown-linux-gnu"
15 |       env: TARGET=x86_64-unknown-linux-gnu
16 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
17 |     - name: "x86_64-unknown-linux-gnu (beta)"
18 |       rust: beta
19 |       env: TARGET=x86_64-unknown-linux-gnu
20 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
21 |     - name: "x86_64-unknown-linux-gnu (stable)"
22 |       rust: stable
23 |       env: TARGET=x86_64-unknown-linux-gnu
24 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
25 |     - name: "i686-unknown-linux-gnu"
26 |       env: TARGET=i686-unknown-linux-gnu CROSS=1
27 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
28 |     - name: "x86_64-apple-darwin-10.3"
29 |       env: TARGET=x86_64-apple-darwin
30 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
31 |       os: osx
32 |       osx_image: xcode10.3
33 |     - name: "x86_64-apple-darwin-11.2"
34 |       env: TARGET=x86_64-apple-darwin
35 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
36 |       os: osx
37 |       osx_image: xcode11.2
38 |     - name: "x86_64-pc-windows-msvc"
39 |       env: TARGET=x86_64-pc-windows-msvc
40 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
41 |       os: windows
42 |     - name: "x86_64-pc-windows-gnu"
43 |       env: TARGET=x86_64-pc-windows-gnu CROSS=1
44 |       script: env RUST_TEST_THREADS=1 cargo test --verbose --all --features=serde
45 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "ustr"
 3 | version = "1.1.0"
 4 | authors = ["Anders Langlands <anderslanglands@gmail.com>"]
 5 | edition = "2021"
 6 | license = "BSD-2-Clause-Patent"
 7 | description = "Fast, FFI-friendly string interning."
 8 | documentation = "https://docs.rs/ustr"
 9 | repository = "https://github.com/anderslanglands/ustr"
10 | readme = "README.md"
11 | keywords = ["string", "interning", "FFI"]
12 | categories = ["caching", "data-structures"]
13 | 
14 | [badges]
15 | travis-ci = { repository = "anderslanglands/ustr", branch = "master" }
16 | 
17 | [dependencies]
18 | byteorder = "1.5"
19 | lazy_static = "1.5"
20 | parking_lot = "0.12"
21 | serde = { version = "1", optional = true }
22 | ahash = { version = "0.8.3", default-features = false }
23 | 
24 | 
25 | [dev-dependencies]
26 | criterion = "0.4"
27 | crossbeam-channel = "0.5"
28 | crossbeam-utils = "0.8"
29 | libc = "0.2"
30 | serde_json = "1"
31 | string-interner = "0.13"
32 | string_cache = "0.8"
33 | 
34 | [[bench]]
35 | name = "creation"
36 | harness = false
37 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | BSD+ License
 2 | 
 3 | Copyright (c) 2019 Anders Langlands
 4 | 
 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 6 | 
 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 8 | 
 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
10 | 
11 | Subject to the terms and conditions of this license, each copyright holder and contributor hereby grants to those receiving rights under this license a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except for failure to satisfy the conditions of this license) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer this software, where such license applies only to those patent claims, already acquired or hereafter acquired, licensable by such copyright holder or contributor that are necessarily infringed by:
12 | 
13 | (a) their Contribution(s) (the licensed copyrights of copyright holders and non-copyrightable additions of contributors, in source or binary form) alone; or
14 | 
15 | (b) combination of their Contribution(s) with the work of authorship to which such Contribution(s) was added by such copyright holder or contributor, if, at the time the Contribution is added, such addition causes such combination to be necessarily infringed. The patent license shall not apply to any other combinations which include the Contribution.
16 | 
17 | Except as expressly stated above, no rights or licenses from any copyright holder or contributor is granted under this license, whether expressly, by implication, estoppel or otherwise.
18 | 
19 | DISCLAIMER
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # `ustr`
  2 | 
  3 | Fast, FFI-friendly string interning.
  4 | 
  5 | [![Build Status]][travis] [![Latest Version]][crates.io] [![Docs Badge]][docs.rs]
  6 | 
  7 | [Build Status]: https://img.shields.io/travis/anderslanglands/ustr/master?style=for-the-badge
  8 | [travis]: https://travis-ci.com/anderslanglands/ustr
  9 | [Latest Version]: https://img.shields.io/crates/v/ustr?style=for-the-badge
 10 | [crates.io]: https://crates.io/crates/ustr
 11 | [Docs Badge]: https://img.shields.io/docsrs/ustr?style=for-the-badge
 12 | [docs.rs]:https://docs.rs/ustr
 13 | 
 14 | A `Ustr` (**U**nique **str**) is a lightweight handle representing a static,
 15 | immutable entry in a global string cache, allowing for:
 16 | 
 17 | * Extremely fast string assignment and comparisons.
 18 | 
 19 | * Efficient storage. Only one copy of the string is held in memory, and
 20 |   getting access to it is just a pointer indirection.
 21 | 
 22 | * Fast hashing ‒ the precomputed hash is stored with the string.
 23 | 
 24 | * Fast FFI ‒ the string is stored with a terminating null byte so can be
 25 |   passed to C directly without doing the `CString` dance.
 26 | 
 27 | The downside is no strings are ever freed, so if you're creating lots and lots
 28 | of strings, you might run out of memory. On the other hand, *War and Peace* is
 29 | only 3MB, so it's probably fine.
 30 | 
 31 | This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/)
 32 | (OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h)
 33 | but it is *not* binary-compatible (yet). The underlying hash map implementation
 34 | is directly ported from OIIO.
 35 | 
 36 | ## Usage
 37 | 
 38 | ```rust
 39 | use ustr::{Ustr, ustr};
 40 | 
 41 | // Creation is quick and easy using either `Ustr::from` or the `ustr` short
 42 | // function and only one copy of any string is stored.
 43 | let h1 = Ustr::from("hello");
 44 | let h2 = ustr("hello");
 45 | 
 46 | // Comparisons and copies are extremely cheap.
 47 | let h3 = h1;
 48 | assert_eq!(h2, h3);
 49 | 
 50 | // You can pass straight to FFI.
 51 | let len = unsafe {
 52 |     libc::strlen(h1.as_char_ptr())
 53 | };
 54 | assert_eq!(len, 5);
 55 | 
 56 | // For best performance when using Ustr as key for a HashMap or HashSet,
 57 | // you'll want to use the precomputed hash. To make this easier, just use
 58 | // the UstrMap and UstrSet exports:
 59 | use ustr::UstrMap;
 60 | 
 61 | // Key type is always Ustr.
 62 | let mut map: UstrMap<usize> = UstrMap::default();
 63 | map.insert(h1, 17);
 64 | assert_eq!(*map.get(&h1).unwrap(), 17);
 65 | ```
 66 | 
 67 | By enabling the `"serde"` feature you can serialize individual `Ustr`s or
 68 | the whole cache with serde.
 69 | 
 70 | ```rust
 71 | use ustr::{Ustr, ustr};
 72 | 
 73 | let u_ser = ustr("serialization is fun!");
 74 | let json = serde_json::to_string(&u_ser).unwrap();
 75 | let u_de : Ustr = serde_json::from_str(&json).unwrap();
 76 | 
 77 | assert_eq!(u_ser, u_de);
 78 | ```
 79 | 
 80 | Since the cache is global, use the `ustr::DeserializedCache` dummy object to
 81 | drive the deserialization.
 82 | 
 83 | ```rust
 84 | ustr("Send me to JSON and back");
 85 | let json = serde_json::to_string(ustr::cache()).unwrap();
 86 | 
 87 | // ... some time later ...
 88 | let _: ustr::DeserializedCache = serde_json::from_str(&json).unwrap();
 89 | assert_eq!(ustr::num_entries(), 1);
 90 | assert_eq!(ustr::string_cache_iter().collect::<Vec<_>>(), vec!["Send me to JSON and back"]);
 91 | 
 92 | ```
 93 | 
 94 | ## Calling from C/C++
 95 | 
 96 | If you are writing a library that uses ustr and want users to be able to create
 97 | `Ustr`s to pass to your API from C, add `ustr_extern.rs` to your crate and use
 98 | `include/ustr.h` or `include/ustr.hpp` for function declarations.
 99 | 
100 | ## Changelog
101 | 
102 | ### Changes since 1.0.0
103 | 
104 | * [Add a bunch of trait implementations that make it easier to work with stringy types](https://github.com/anderslanglands/ustr/pull/40). Thanks to @kornelski.
105 | * [Disable unused ahash features by default](https://github.com/anderslanglands/ustr/pull/46) to fix compilation on `wasm-unknown-unknown`. Thanks to @stephanemagnenat.
106 | 
107 | * [Fix panic if string cache is empty](https://github.com/anderslanglands/ustr/pull/44) and assorted miri warnings and errors. Thanks to @orzogc.
108 | 
109 | * [Bump versions of byteorder, lazy_static, ahash](https://github.com/anderslanglands/ustr/pull/49) and add additional documentation. Thanks to @virtualritz.
110 | 
111 | 
112 | ### Changes since 0.10
113 | 
114 | * Actually renamed `serialization` feature to `serde`
115 | 
116 | ### Changes since 0.9
117 | 
118 | * Fixed an [issue](https://github.com/anderslanglands/ustr/issues/33) that
119 |   would stop `Ustr` from working on `wasm32-unknown-unknown` (contributed by bouk)
120 | 
121 | and thanks to virtualritz:
122 | 
123 | * `Ustr::get_cache()` was [renamed](https://rust-lang.github.io/api-guidelines/naming.html#getter-names-follow-rust-convention-c-getter)
124 |   to `cache()`
125 | 
126 | * All dependencies were bumped to latest versions
127 | 
128 | * All features were removed (there are good defaults) except for
129 |   `serialization`
130 | 
131 | * The `serialization` feature was [renamed](https://github.com/rust-lang/api-guidelines/discussions/180)
132 |   to `serde`
133 | 
134 | * `ustr` now uses Rust 2021
135 | 
136 | ### Changes since 0.8
137 | 
138 | * Add `existing_ustr` function (contributed by macprog-guy)
139 | 
140 |   The idea behind this is to allow the creation of a `Ustr` only when that
141 |   `Ustr` already exists. This is particularly useful when `Ustr`s are being
142 |   created using untrusted user input (say from a web server or API). In that
143 |   case, by providing different values at each call we consume more and more
144 |   memory eventually running out (DoS).
145 | 
146 | * Add implementation for `Ord` (contributed by zigazeljko)
147 | 
148 | * Inlined a bunch of simple functions (contributed by g-plane)
149 | 
150 | * Fixed tests to lock rather than relying on `RUST_TEST_THREADS=1` (contributed
151 |   by kornelski)
152 | 
153 | * Fixed tests to handle serialization feature  properly when enabled
154 |   (contributed by kornelski)
155 | 
156 | * Added a check for a potential allocation failure in the allocator
157 |   (contributed by kornelski)
158 | 
159 | * Added `FromStr` impl (contributed by martinmr)
160 | 
161 | * Add `rustfmt.toml` to repo
162 | 
163 | ### Changes since 0.7
164 | 
165 | * Update dependencies
166 | 
167 |   The versions of `parking_lot` and `ahash` have been updated.
168 | 
169 | * Space optimization with `NonNull`
170 | 
171 |   The internal pointer is now a `NonNull` to take advantage of layout
172 |   optimizations in `Option` etc.
173 | 
174 | * Add `as_cstr()` method
175 | 
176 |   Added `as_cstr(&self) -> std::ffi::CStr` to make it easier to interface with
177 |   APIs that rely on `CStr`.
178 | 
179 | ### Changes since 0.6
180 | 
181 | * Derive Ord for Ustr
182 | 
183 |   So now you can sort a `Vec` of `Ustr`s lexicographically.
184 | 
185 | ### Changes since 0.5
186 | 
187 | * Added `From<Ustr>` for `&str`
188 | 
189 |   This `impl` makes it easier to pass a `Ustr` to methods expecting an
190 |   `Into<&str>`.
191 | 
192 | ### Changes since 0.4
193 | 
194 | * 32-bit support added
195 | 
196 |   Removed the restriction to 64-bit systems and fixed a bug relating to pointer
197 |   maths. Thanks to agaussman for [bringing it up](https://github.com/anderslanglands/ustr/issues/8).
198 | 
199 | * Miri leak checks re-enabled
200 | 
201 |   Thanks to RalfJung for pointing out that Miri now ignores ["leaks" from statics](https://github.com/anderslanglands/ustr/pull/9).
202 | 
203 | * `PartialOrd` is now lexicographic
204 | 
205 |   Thanks to macprog-guy for the PR implementing PartialOrd by deferring to
206 |   `&str`. This will be slower than the previous derived implementation which
207 |   just did a pointer comparison, but is much [less surprising](https://github.com/anderslanglands/ustr/pull/10).
208 | 
209 | ### Changes since 0.3
210 | 
211 | * Added Miri to CI tests
212 | 
213 |   Miri sanity-checks the unsafe parts of the code to guard against some types
214 |   of UB.
215 | 
216 | * Switched to [ahash](https://github.com/tkaitchuck/aHash) as the default
217 |   hasher
218 | 
219 |   Ahash is a fast, non-cryptographic pure Rust hasher. Pure Rust is important
220 |   to be able to run Miri, and ahash benchmarked as the fastest I could find.
221 |   The old `fasthash`/`cityhash` is available by enabling `--features=hashcity`.
222 | 
223 | ### Changes since 0.2
224 | 
225 | * Serde support
226 | 
227 |   `Ustr` can now be serialized with Serde when enabling
228 |   `--features=serialization`. The global string cache can also be serialized if
229 |   you really want to.
230 | 
231 | * Switched to `parking_lot::Mutex` as default synchronization
232 | 
233 |   Spinlocks have been getting a bad rap recently so the string cache now uses
234 |   `parking_lot::Mutex` as the default synchronization primitive. `spin::Mutex`
235 |   is still available behind the `--features=spinlock` feature gate if you
236 |   really want that extra 5% speed.
237 | 
238 | * Cleaned up `unsafe`
239 | 
240 |   Did a better job of documenting the invariants for the unsafe blocks and
241 |   replaced some blind additions with checked_add() and friends to avoid
242 |   potential (but very unlikely) overflow.
243 | 
244 | * Compared to `string-cache`
245 | 
246 |   [string-cache](https://github.com/servo/string-cache) provides a global cache
247 |   that can be created at compile time as well as at run time. Dynamic strings
248 |   in the cache appear to be reference-counted so will be freed when they are no
249 |   longer used, while `Ustr`s are never deleted.
250 | 
251 |   Creating a `string_cache::DefaultAtom` is much slower than creating a `Ustr`,
252 |   especially in a multi-threaded context. On the other hand if you can just
253 |   bake all your `Atom`s into your binary at compile-time this wouldn't be an
254 |   issue.
255 | 
256 | * Compared to `string-interner`
257 | 
258 |   [string-interner](https://github.com/robbepop/string-interner) gives you
259 |   individual `Interner` objects to work with rather than a global cache, which
260 |   could be more flexible. It's faster to create than string-cache but still
261 |   significantly slower than `Ustr`.
262 | 
263 | ## Speed
264 | 
265 | `Ustr`s are significantly faster to create than `string-interner` or
266 | `string-cache`. The benchmark below creates 100,000 cycled copies of ~20,000
267 | path strings of the form:
268 | 
269 | ```text
270 | /cgi-bin/images/admin
271 | /modules/templates/cache
272 | /libraries/themes/wp-includes
273 | ... etc.
274 | ```
275 | 
276 | ![raft bench](ustring_bench_raft.png)
277 | 
278 | ## Why?
279 | 
280 | It is common in certain types of applications to use strings as identifiers,
281 | but not really do any processing with them. To paraphrase from OIIO's `ustring`
282 | documentation:
283 | 
284 | Compared to standard strings, `Ustr`s have several advantages:
285 | 
286 | * Each individual `Ustr` is very small -- in fact, we guarantee that a `Ustr`
287 |   is the same size and memory layout as an ordinary `*const u8`.
288 | 
289 | * Storage is frugal, since there is only one allocated copy of each unique
290 |   character sequence, throughout the lifetime of the program.
291 | 
292 | * Assignment from one `Ustr` to another is just a copy of the pointer; no
293 |   allocation, no character copying, no reference counting.
294 | 
295 | * Equality testing (do the strings contain the same characters) is a single
296 |   operation, the comparison of the pointer.
297 | 
298 | * Memory allocation only occurs when a new `Ustr` is constructed from raw
299 |   characters the *first* time ‒ subsequent constructions of the same string
300 |   just find it in the canonical string set, but don't need to allocate new
301 |   storage.  Destruction of a `Ustr` is trivial, there is no de-allocation
302 |   because the canonical version stays in the set.  Also, therefore, no user
303 |   code mistake can lead to memory leaks.
304 | 
305 |   But there are some problems, too.  Canonical strings are never freed from the
306 |   table. So in some sense all the strings "leak", but they only leak one copy
307 |   for each unique string that the program ever comes across. Creating a `Ustr`
308 |   is slower than `String::from()` on a single thread, and performance will be
309 |   worse if trying to create many `Ustr`s in tight loops from multiple threads
310 |   due to lock contention for the global cache.
311 | 
312 | On the whole, `Ustr`s are a really great string representation
313 | 
314 | * if you tend to have (relatively) few unique strings, but many copies of those
315 |   strings;
316 | 
317 | * if you tend to make the same strings over and over again, and if it's
318 |   relatively rare that a single unique character sequence is used only once in
319 |   the entire lifetime of the program;
320 | * if your most common string operations are assignment and equality testing
321 |   and you want them to be as fast as possible;
322 | 
323 | * if you are doing relatively little character-by-character assembly of
324 |   strings, string concatenation, or other "string manipulation" (other than
325 |   equality testing).
326 | 
327 | `Ustr`s are not so hot:
328 | 
329 | * if your program tends to have very few copies of each character sequence over
330 |   the entire lifetime of the program;
331 | 
332 | * if your program tends to generate a huge variety of unique strings over its
333 |   lifetime, each of which is used only a short time and then discarded, never
334 |   to be needed again;
335 | 
336 | * if you don't need to do a lot of string assignment or equality testing, but
337 |   lots of more complex string manipulation.
338 | 
339 | ## Safety and Compatibility
340 | 
341 | This crate contains a significant amount of unsafe but usage has been checked
342 | and is well-documented. It is also run through Miri as part of the CI process.
343 | 
344 | I use it regularly on 64-bit systems, and it has passed Miri on a 32-bit system
345 | as well, but 32-bit is not checked regularly. If you want to use it on 32-bit,
346 | please make sure to run Miri and open an issue if you find any problems.
347 | 
348 | ## License
349 | 
350 | BSD+ License
351 | 
352 | Copyright © 2019—2024 Anders Langlands
353 | 
354 | Redistribution and use in source and binary forms, with or without
355 | modification, are permitted provided that the following conditions are met:
356 | 
357 | 1. Redistributions of source code must retain the above copyright notice, this
358 |    list of conditions and the following disclaimer.
359 | 
360 | 2. Redistributions in binary form must reproduce the above copyright notice,
361 |    this list of conditions and the following disclaimer in the documentation
362 |    and/or other materials provided with the distribution.
363 | 
364 | Subject to the terms and conditions of this license, each copyright holder and
365 | contributor hereby grants to those receiving rights under this license a
366 | perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable
367 | (except for failure to satisfy the conditions of this license) patent license
368 | to make, have made, use, offer to sell, sell, import, and otherwise transfer
369 | this software, where such license applies only to those patent claims, already
370 | acquired or hereafter acquired, licensable by such copyright holder or
371 | contributor that are necessarily infringed by:
372 | 
373 | (a) their Contribution(s) (the licensed copyrights of copyright holders and
374 | non-copyrightable additions of contributors, in source or binary form) alone;
375 | or
376 | 
377 | (b) combination of their Contribution(s) with the work of authorship to which
378 | such Contribution(s) was added by such copyright holder or contributor, if, at
379 | the time the Contribution is added, such addition causes such combination to be
380 | necessarily infringed. The patent license shall not apply to any other
381 | combinations which include the Contribution.
382 | 
383 | Except as expressly stated above, no rights or licenses from any copyright
384 | holder or contributor is granted under this license, whether expressly, by
385 | implication, estoppel or otherwise.
386 | 
387 | DISCLAIMER
388 | 
389 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
390 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
391 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
392 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
393 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
394 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
395 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
396 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
397 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
398 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
399 | 
400 | Contains code ported from [OpenImageIO](https://github.com/OpenImageIO/oiio),
401 | BSD 3-clause license.
402 | 
403 | Contains a copy of Max Woolf's [Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings),
404 | MIT license.
405 | 
406 | Contains some strings from
407 | [SecLists](https://github.com/danielmiessler/SecLists), MIT license.
408 | 


--------------------------------------------------------------------------------
/benches/creation.rs:
--------------------------------------------------------------------------------
  1 | #[macro_use]
  2 | extern crate criterion;
  3 | use criterion::black_box;
  4 | use criterion::Criterion;
  5 | use crossbeam_channel::bounded;
  6 | use crossbeam_utils::thread::scope;
  7 | use std::sync::Arc;
  8 | use string_cache::DefaultAtom;
  9 | use string_interner::StringInterner;
 10 | 
 11 | use ustr::*;
 12 | 
 13 | use parking_lot::Mutex;
 14 | 
 15 | fn criterion_benchmark(c: &mut Criterion) {
 16 |     let path =
 17 |         std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
 18 |             .join("data")
 19 |             .join("raft-large-directories.txt");
 20 |     let raft = std::fs::read_to_string(path).unwrap();
 21 |     let raft = Arc::new(
 22 |         raft.split_whitespace()
 23 |             .collect::<Vec<_>>()
 24 |             .chunks(3)
 25 |             .map(|s| {
 26 |                 if s.len() == 3 {
 27 |                     format!("{}/{}/{}", s[0], s[1], s[2])
 28 |                 } else {
 29 |                     s[0].to_owned()
 30 |                 }
 31 |             })
 32 |             .collect::<Vec<_>>(),
 33 |     );
 34 | 
 35 |     let s = raft.clone();
 36 |     c.bench_function("single raft ustr", move |b| {
 37 |         b.iter(|| {
 38 |             unsafe { ustr::_clear_cache() };
 39 |             for s in s.iter().cycle().take(100_000) {
 40 |                 black_box(ustr(s));
 41 |             }
 42 |         });
 43 |     });
 44 | 
 45 |     let s = raft.clone();
 46 |     c.bench_function("single raft string-interner", move |b| {
 47 |         b.iter(|| {
 48 |             let mut interner = StringInterner::default();
 49 |             for s in s.iter().cycle().take(100_000) {
 50 |                 black_box(interner.get_or_intern(s));
 51 |             }
 52 |         });
 53 |     });
 54 | 
 55 |     let s = raft.clone();
 56 |     c.bench_function("single raft string-cache", move |b| {
 57 |         b.iter(|| {
 58 |             let mut v = Vec::with_capacity(100_000);
 59 |             for s in s.iter().cycle().take(100_000) {
 60 |                 v.push(DefaultAtom::from(s.as_str()));
 61 |             }
 62 |             black_box(v);
 63 |         });
 64 |     });
 65 | 
 66 |     let s = raft.clone();
 67 |     c.bench_function("single raft String", move |b| {
 68 |         b.iter(|| {
 69 |             for s in s.iter().cycle().take(100_000) {
 70 |                 black_box(String::from(s));
 71 |             }
 72 |         });
 73 |     });
 74 | 
 75 |     let num = 100_000;
 76 | 
 77 |     for num_threads in [1, 2, 4, 6, 8, 12].iter() {
 78 |         let num_threads = *num_threads;
 79 | 
 80 |         let s = Arc::clone(&raft);
 81 |         c.bench_function(
 82 |             &format!("raft ustr x {} threads", num_threads),
 83 |             move |b| {
 84 |                 let (tx1, rx1) = bounded(0);
 85 |                 let (tx2, rx2) = bounded(0);
 86 |                 let s = Arc::clone(&s);
 87 |                 scope(|scope| {
 88 |                     for tt in 0..num_threads {
 89 |                         let t = tt;
 90 |                         let rx1 = rx1.clone();
 91 |                         let tx2 = tx2.clone();
 92 |                         let s = Arc::clone(&s);
 93 |                         scope.spawn(move |_| {
 94 |                             while rx1.recv().is_ok() {
 95 |                                 for s in s.iter().cycle().skip(t * 17).take(num)
 96 |                                 {
 97 |                                     black_box(ustr(s));
 98 |                                 }
 99 |                                 tx2.send(()).unwrap();
100 |                             }
101 |                         });
102 |                     }
103 | 
104 |                     b.iter(|| {
105 |                         unsafe { ustr::_clear_cache() };
106 |                         for _ in 0..num_threads {
107 |                             tx1.send(()).unwrap();
108 |                         }
109 | 
110 |                         for _ in 0..num_threads {
111 |                             rx2.recv().unwrap();
112 |                         }
113 |                     });
114 |                     drop(tx1);
115 |                 })
116 |                 .unwrap();
117 |             },
118 |         );
119 | 
120 |         let s = Arc::clone(&raft);
121 |         c.bench_function(
122 |             &format!("raft string-interner x {} threads", num_threads),
123 |             move |b| {
124 |                 let (tx1, rx1) = bounded::<
125 |                     Arc<Mutex<StringInterner<string_interner::DefaultSymbol>>>,
126 |                 >(0);
127 |                 let (tx2, rx2) = bounded(0);
128 |                 scope(|scope| {
129 |                     for tt in 0..num_threads {
130 |                         let t = tt;
131 |                         let rx1 = rx1.clone();
132 |                         let tx2 = tx2.clone();
133 |                         let s = Arc::clone(&s);
134 |                         scope.spawn(move |_| {
135 |                             while let Ok(interner) = rx1.recv() {
136 |                                 for s in s.iter().cycle().skip(t * 17).take(num)
137 |                                 {
138 |                                     let mut int = interner.lock();
139 |                                     black_box(int.get_or_intern(s));
140 |                                 }
141 |                                 tx2.send(()).unwrap();
142 |                             }
143 |                         });
144 |                     }
145 | 
146 |                     b.iter(|| {
147 |                         let interner =
148 |                             Arc::new(Mutex::new(StringInterner::default()));
149 |                         for _ in 0..num_threads {
150 |                             tx1.send(interner.clone()).unwrap();
151 |                         }
152 | 
153 |                         for _ in 0..num_threads {
154 |                             rx2.recv().unwrap();
155 |                         }
156 |                     });
157 |                     drop(tx1);
158 |                 })
159 |                 .unwrap();
160 |             },
161 |         );
162 | 
163 |         let s = Arc::clone(&raft);
164 |         c.bench_function(
165 |             &format!("raft string-cache x {} threads", num_threads),
166 |             move |b| {
167 |                 let (tx1, rx1) = bounded(0);
168 |                 let (tx2, rx2) = bounded(0);
169 |                 scope(|scope| {
170 |                     for tt in 0..num_threads {
171 |                         let t = tt;
172 |                         let rx1 = rx1.clone();
173 |                         let tx2 = tx2.clone();
174 |                         let s = Arc::clone(&s);
175 |                         scope.spawn(move |_| {
176 |                             while rx1.recv().is_ok() {
177 |                                 let mut v = Vec::with_capacity(num);
178 |                                 for s in s.iter().cycle().skip(t * 17).take(num)
179 |                                 {
180 |                                     v.push(DefaultAtom::from(s.as_str()));
181 |                                 }
182 |                                 tx2.send(()).unwrap();
183 |                             }
184 |                         });
185 |                     }
186 | 
187 |                     b.iter(|| {
188 |                         for _ in 0..num_threads {
189 |                             tx1.send(()).unwrap();
190 |                         }
191 | 
192 |                         for _ in 0..num_threads {
193 |                             rx2.recv().unwrap();
194 |                         }
195 |                     });
196 |                     drop(tx1);
197 |                 })
198 |                 .unwrap();
199 |             },
200 |         );
201 | 
202 |         let s = Arc::clone(&raft);
203 |         c.bench_function(
204 |             &format!("raft String::from x {} threads", num_threads),
205 |             move |b| {
206 |                 let (tx1, rx1) = bounded(0);
207 |                 let (tx2, rx2) = bounded(0);
208 |                 scope(|scope| {
209 |                     for tt in 0..num_threads {
210 |                         let t = tt;
211 |                         let rx1 = rx1.clone();
212 |                         let tx2 = tx2.clone();
213 |                         let s = Arc::clone(&s);
214 |                         scope.spawn(move |_| {
215 |                             while rx1.recv().is_ok() {
216 |                                 for s in s.iter().cycle().skip(t * 17).take(num)
217 |                                 {
218 |                                     black_box(String::from(s));
219 |                                 }
220 |                                 tx2.send(()).unwrap();
221 |                             }
222 |                         });
223 |                     }
224 | 
225 |                     b.iter(|| {
226 |                         for _ in 0..num_threads {
227 |                             tx1.send(()).unwrap();
228 |                         }
229 | 
230 |                         for _ in 0..num_threads {
231 |                             rx2.recv().unwrap();
232 |                         }
233 |                     });
234 |                     drop(tx1);
235 |                 })
236 |                 .unwrap();
237 |             },
238 |         );
239 |     }
240 | 
241 |     let path =
242 |         std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
243 |             .join("data")
244 |             .join("raft-large-directories.txt");
245 |     let raft_large = std::fs::read_to_string(path).unwrap();
246 |     let raft_large = Arc::new(
247 |         raft_large
248 |             .split_whitespace()
249 |             .collect::<Vec<_>>()
250 |             .chunks(11)
251 |             .map(|s| {
252 |                 // if s.len() == 3 {
253 |                 //     format!("{}/{}/{}", s[0], s[1], s[2])
254 |                 // } else {
255 |                 //     s[0].to_owned()
256 |                 // }
257 |                 s.join("/")
258 |             })
259 |             .collect::<Vec<_>>(),
260 |     );
261 | 
262 |     let s = raft_large.clone();
263 |     c.bench_function("raft large x1", move |b| {
264 |         b.iter(|| {
265 |             unsafe { ustr::_clear_cache() };
266 |             for s in s.iter().cycle().take(100_000) {
267 |                 black_box(ustr(s));
268 |             }
269 |         });
270 |     });
271 | 
272 |     let num_threads = 6;
273 |     let s = raft_large.clone();
274 |     c.bench_function("raft large x6", move |b| {
275 |         let (tx1, rx1) = bounded(0);
276 |         let (tx2, rx2) = bounded(0);
277 |         let s = Arc::clone(&s);
278 |         scope(|scope| {
279 |             for tt in 0..num_threads {
280 |                 let t = tt;
281 |                 let rx1 = rx1.clone();
282 |                 let tx2 = tx2.clone();
283 |                 let s = Arc::clone(&s);
284 |                 scope.spawn(move |_| {
285 |                     while rx1.recv().is_ok() {
286 |                         for s in s.iter().cycle().skip(t * 17).take(num) {
287 |                             black_box(ustr(s));
288 |                         }
289 |                         tx2.send(()).unwrap();
290 |                     }
291 |                 });
292 |             }
293 | 
294 |             b.iter(|| {
295 |                 unsafe { ustr::_clear_cache() };
296 |                 for _ in 0..num_threads {
297 |                     tx1.send(()).unwrap();
298 |                 }
299 | 
300 |                 for _ in 0..num_threads {
301 |                     rx2.recv().unwrap();
302 |                 }
303 |             });
304 |             drop(tx1);
305 |         })
306 |         .unwrap();
307 |     });
308 | }
309 | 
310 | criterion_group!( // Declares the `benches` group consumed by `criterion_main!` below.
311 |     name = benches;
312 |     config = Criterion::default().sample_size(30); // Default config with sample_size(30).
313 |     targets = criterion_benchmark
314 | );
315 | criterion_main!(benches); // Hands the `benches` group to Criterion's main macro.
316 | 


--------------------------------------------------------------------------------
/data/blns.txt:
--------------------------------------------------------------------------------
  1 | #	Reserved Strings
  2 | #
  3 | #	Strings which may be used elsewhere in code
  4 | 
  5 | undefined
  6 | undef
  7 | null
  8 | NULL
  9 | (null)
 10 | nil
 11 | NIL
 12 | true
 13 | false
 14 | True
 15 | False
 16 | TRUE
 17 | FALSE
 18 | None
 19 | hasOwnProperty
 20 | then
 21 | \
 22 | \\
 23 | 
 24 | #	Numeric Strings
 25 | #
 26 | #	Strings which can be interpreted as numeric
 27 | 
 28 | 0
 29 | 1
 30 | 1.00
 31 | $1.00
 32 | 1/2
 33 | 1E2
 34 | 1E02
 35 | 1E+02
 36 | -1
 37 | -1.00
 38 | -$1.00
 39 | -1/2
 40 | -1E2
 41 | -1E02
 42 | -1E+02
 43 | 1/0
 44 | 0/0
 45 | -2147483648/-1
 46 | -9223372036854775808/-1
 47 | -0
 48 | -0.0
 49 | +0
 50 | +0.0
 51 | 0.00
 52 | 0..0
 53 | .
 54 | 0.0.0
 55 | 0,00
 56 | 0,,0
 57 | ,
 58 | 0,0,0
 59 | 0.0/0
 60 | 1.0/0.0
 61 | 0.0/0.0
 62 | 1,0/0,0
 63 | 0,0/0,0
 64 | --1
 65 | -
 66 | -.
 67 | -,
 68 | 999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999
 69 | NaN
 70 | Infinity
 71 | -Infinity
 72 | INF
 73 | 1#INF
 74 | -1#IND
 75 | 1#QNAN
 76 | 1#SNAN
 77 | 1#IND
 78 | 0x0
 79 | 0xffffffff
 80 | 0xffffffffffffffff
 81 | 0xabad1dea
 82 | 123456789012345678901234567890123456789
 83 | 1,000.00
 84 | 1 000.00
 85 | 1'000.00
 86 | 1,000,000.00
 87 | 1 000 000.00
 88 | 1'000'000.00
 89 | 1.000,00
 90 | 1 000,00
 91 | 1'000,00
 92 | 1.000.000,00
 93 | 1 000 000,00
 94 | 1'000'000,00
 95 | 01000
 96 | 08
 97 | 09
 98 | 2.2250738585072011e-308
 99 | 
100 | #	Special Characters
101 | #
102 | # ASCII punctuation.  All of these characters may need to be escaped in some
103 | # contexts.  Divided into three groups based on (US-layout) keyboard position.
104 | 
105 | ,./;'[]\-=
106 | <>?:"{}|_+
107 | !@#$%^&*()`~
108 | 
109 | # Non-whitespace C0 controls: U+0001 through U+0008, U+000E through U+001F,
110 | # and U+007F (DEL)
111 | # Often forbidden to appear in various text-based file formats (e.g. XML),
112 | # or reused for internal delimiters on the theory that they should never
113 | # appear in input.
114 | # The next line may appear to be blank or mojibake in some viewers.
115 | 
116 | 
117 | # Non-whitespace C1 controls: U+0080 through U+0084 and U+0086 through U+009F.
118 | # Commonly misinterpreted as additional graphic characters.
119 | # The next line may appear to be blank, mojibake, or dingbats in some viewers.
120 | 
121 | 
122 | # Whitespace: all of the characters with category Zs, Zl, or Zp (in Unicode
123 | # version 8.0.0), plus U+0009 (HT), U+000B (VT), U+000C (FF), U+0085 (NEL),
124 | # and U+200B (ZERO WIDTH SPACE), which are in the C categories but are often
125 | # treated as whitespace in some contexts.
126 | # This file unfortunately cannot express strings containing
127 | # U+0000, U+000A, or U+000D (NUL, LF, CR).
128 | # The next line may appear to be blank or mojibake in some viewers.
129 | # The next line may be flagged for "trailing whitespace" in some viewers.
130 | 	              ​    　
131 | 
132 | # Unicode additional control characters: all of the characters with
133 | # general category Cf (in Unicode 8.0.0).
134 | # The next line may appear to be blank or mojibake in some viewers.
135 | ­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪‫‬‭‮⁠⁡⁢⁣⁤⁦⁧⁨⁩⁪⁫⁬⁭⁮⁯﻿￹￺￻𑂽𛲠𛲡𛲢𛲣𝅳𝅴𝅵𝅶𝅷𝅸𝅹𝅺󠀁󠀠󠀡󠀢󠀣󠀤󠀥󠀦󠀧󠀨󠀩󠀪󠀫󠀬󠀭󠀮󠀯󠀰󠀱󠀲󠀳󠀴󠀵󠀶󠀷󠀸󠀹󠀺󠀻󠀼󠀽󠀾󠀿󠁀󠁁󠁂󠁃󠁄󠁅󠁆󠁇󠁈󠁉󠁊󠁋󠁌󠁍󠁎󠁏󠁐󠁑󠁒󠁓󠁔󠁕󠁖󠁗󠁘󠁙󠁚󠁛󠁜󠁝󠁞󠁟󠁠󠁡󠁢󠁣󠁤󠁥󠁦󠁧󠁨󠁩󠁪󠁫󠁬󠁭󠁮󠁯󠁰󠁱󠁲󠁳󠁴󠁵󠁶󠁷󠁸󠁹󠁺󠁻󠁼󠁽󠁾󠁿
136 | 
137 | # "Byte order marks", U+FEFF and U+FFFE, each on its own line.
138 | # The next two lines may appear to be blank or mojibake in some viewers.
139 | ﻿
140 | ￾
141 | 
142 | #	Unicode Symbols
143 | #
144 | #	Strings which contain common unicode symbols (e.g. smart quotes)
145 | 
146 | Ω≈ç√∫˜µ≤≥÷
147 | åß∂ƒ©˙∆˚¬…æ
148 | œ∑´®†¥¨ˆøπ“‘
149 | ¡™£¢∞§¶•ªº–≠
150 | ¸˛Ç◊ı˜Â¯˘¿
151 | ÅÍÎÏ˝ÓÔÒÚÆ☃
152 | Œ„´‰ˇÁ¨ˆØ∏”’
153 | `⁄€‹›ﬁﬂ‡°·‚—±
154 | ⅛⅜⅝⅞
155 | ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя
156 | ٠١٢٣٤٥٦٧٨٩
157 | 
158 | #	Unicode Subscript/Superscript/Accents
159 | #
160 | #	Strings which contain unicode subscripts/superscripts; can cause rendering issues
161 | 
162 | ⁰⁴⁵
163 | ₀₁₂
164 | ⁰⁴⁵₀₁₂
165 | ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็
166 | 
167 | #	Quotation Marks
168 | #
169 | #	Strings which contain misplaced quotation marks; can cause encoding errors
170 | 
171 | '
172 | "
173 | ''
174 | ""
175 | '"'
176 | "''''"'"
177 | "'"'"''''"
178 | <foo val=“bar” />
179 | <foo val=“bar” />
180 | <foo val=”bar“ />
181 | <foo val=`bar' />
182 | 
183 | #	Two-Byte Characters
184 | #
185 | #	Strings which contain two-byte characters: can cause rendering issues or character-length issues
186 | 
187 | 田中さんにあげて下さい
188 | パーティーへ行かないか
189 | 和製漢語
190 | 部落格
191 | 사회과학원 어학연구소
192 | 찦차를 타고 온 펲시맨과 쑛다리 똠방각하
193 | 社會科學院語學研究所
194 | 울란바토르
195 | 𠜎𠜱𠝹𠱓𠱸𠲖𠳏
196 | 
197 | #	Special Unicode Characters Union
198 | #
199 | #	A super string recommended by VMware Inc. Globalization Team: can effectively cause rendering issues or character-length issues to validate product globalization readiness.
200 | #
201 | #	表          CJK_UNIFIED_IDEOGRAPHS (U+8868)
202 | #	ポ          KATAKANA LETTER PO (U+30DD)
203 | #	あ          HIRAGANA LETTER A (U+3042)
204 | #	A           LATIN CAPITAL LETTER A (U+0041)
205 | #	鷗          CJK_UNIFIED_IDEOGRAPHS (U+9DD7)
206 | #	Œ           LATIN CAPITAL LIGATURE OE (U+0152)
207 | #	é           LATIN SMALL LETTER E WITH ACUTE (U+00E9)
208 | #	Ｂ           FULLWIDTH LATIN CAPITAL LETTER B (U+FF22)
209 | #	逍          CJK_UNIFIED_IDEOGRAPHS (U+900D)
210 | #	Ü           LATIN CAPITAL LETTER U WITH DIAERESIS (U+00DC)
211 | #	ß           LATIN SMALL LETTER SHARP S (U+00DF)
212 | #	ª           FEMININE ORDINAL INDICATOR (U+00AA)
213 | #	ą           LATIN SMALL LETTER A WITH OGONEK (U+0105)
214 | #	ñ           LATIN SMALL LETTER N WITH TILDE (U+00F1)
215 | #	丂          CJK_UNIFIED_IDEOGRAPHS (U+4E02)
216 | #	㐀          CJK Ideograph Extension A, First (U+3400)
217 | #	𠀀          CJK Ideograph Extension B, First (U+20000)
218 | 
219 | 表ポあA鷗ŒéＢ逍Üßªąñ丂㐀𠀀
220 | 
221 | #	Changing length when lowercased
222 | #
223 | #	Characters which increase in length (2 to 3 bytes) when lowercased
224 | #	Credit: https://twitter.com/jifa/status/625776454479970304
225 | 
226 | Ⱥ
227 | Ⱦ
228 | 
229 | #	Japanese Emoticons
230 | #
231 | #	Strings which consist of Japanese-style emoticons which are popular on the web
232 | 
233 | ヽ༼ຈل͜ຈ༽ﾉ ヽ༼ຈل͜ຈ༽ﾉ
234 | (｡◕ ∀ ◕｡)
235 | ｀ｨ(´∀｀∩
236 | __ﾛ(,_,*)
237 | ・(￣∀￣)・:*:
238 | ﾟ･✿ヾ╲(｡◕‿◕｡)╱✿･ﾟ
239 | ,。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’
240 | (╯°□°）╯︵ ┻━┻)
241 | (ﾉಥ益ಥ）ﾉ﻿ ┻━┻
242 | ┬─┬ノ( º _ ºノ)
243 | ( ͡° ͜ʖ ͡°)
244 | ¯\_(ツ)_/¯
245 | 
246 | #	Emoji
247 | #
248 | #	Strings which contain Emoji; should be the same behavior as two-byte characters, but not always
249 | 
250 | 😍
251 | 👩🏽
252 | 👾 🙇 💁 🙅 🙆 🙋 🙎 🙍
253 | 🐵 🙈 🙉 🙊
254 | ❤️ 💔 💌 💕 💞 💓 💗 💖 💘 💝 💟 💜 💛 💚 💙
255 | ✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿
256 | 🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧
257 | 0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟
258 | 
259 | #       Regional Indicator Symbols
260 | #
261 | #       Regional Indicator Symbols can be displayed differently across
262 | #       fonts, and have a number of special behaviors
263 | 
264 | 🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸
265 | 🇺🇸🇷🇺🇸🇦🇫🇦🇲
266 | 🇺🇸🇷🇺🇸🇦
267 | 
268 | #	Unicode Numbers
269 | #
270 | #	Strings which contain unicode numbers; if the code is localized, it should see the input as numeric
271 | 
272 | １２３
273 | ١٢٣
274 | 
275 | #	Right-To-Left Strings
276 | #
277 | #	Strings which contain text that should be rendered RTL if possible (e.g. Arabic, Hebrew)
278 | 
279 | ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.
280 | בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ
281 | הָיְתָהtestالصفحات التّحول
282 | ﷽
283 | ﷺ
284 | مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ، 
285 | 
286 | #	Trick Unicode
287 | #
288 | #	Strings which contain unicode with unusual properties (e.g. Right-to-left override) (c.f. http://www.unicode.org/charts/PDF/U2000.pdf)
289 | 
290 | ‪‪test‪
291 | ‫test‫
292 |  test 
293 | test⁠test‫
294 | ⁦test⁧
295 | 
296 | #	Zalgo Text
297 | #
298 | #	Strings which contain "corrupted" text. The corruption will not appear in non-HTML text, however. (via http://www.eeemo.net)
299 | 
300 | Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣
301 | ̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰
302 | ̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟
303 | ̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕
304 | Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮
305 | 
306 | #	Unicode Upsidedown
307 | #
308 | #	Strings which contain unicode with an "upsidedown" effect (via http://www.upsidedowntext.com)
309 | 
310 | ˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥
311 | 00˙Ɩ$-
312 | 
313 | #	Unicode font
314 | #
315 | #	Strings which contain bold/italic/etc. versions of normal characters
316 | 
317 | Ｔｈｅ ｑｕｉｃｋ ｂｒｏｗｎ ｆｏｘ ｊｕｍｐｓ ｏｖｅｒ ｔｈｅ ｌａｚｙ ｄｏｇ
318 | 𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠
319 | 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌
320 | 𝑻𝒉𝒆 𝒒𝒖𝒊𝒄𝒌 𝒃𝒓𝒐𝒘𝒏 𝒇𝒐𝒙 𝒋𝒖𝒎𝒑𝒔 𝒐𝒗𝒆𝒓 𝒕𝒉𝒆 𝒍𝒂𝒛𝒚 𝒅𝒐𝒈
321 | 𝓣𝓱𝓮 𝓺𝓾𝓲𝓬𝓴 𝓫𝓻𝓸𝔀𝓷 𝓯𝓸𝔁 𝓳𝓾𝓶𝓹𝓼 𝓸𝓿𝓮𝓻 𝓽𝓱𝓮 𝓵𝓪𝔃𝔂 𝓭𝓸𝓰
322 | 𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘
323 | 𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐
324 | ⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢
325 | 
326 | #	Script Injection
327 | #
328 | #	Strings which attempt to invoke a benign script injection; shows vulnerability to XSS
329 | 
330 | <script>alert(123)</script>
331 | &lt;script&gt;alert(&#39;123&#39;);&lt;/script&gt;
332 | <img src=x onerror=alert(123) />
333 | <svg><script>123<1>alert(123)</script>
334 | "><script>alert(123)</script>
335 | '><script>alert(123)</script>
336 | ><script>alert(123)</script>
337 | </script><script>alert(123)</script>
338 | < / script >< script >alert(123)< / script >
339 |  onfocus=JaVaSCript:alert(123) autofocus
340 | " onfocus=JaVaSCript:alert(123) autofocus
341 | ' onfocus=JaVaSCript:alert(123) autofocus
342 | ＜script＞alert(123)＜/script＞
343 | <sc<script>ript>alert(123)</sc</script>ript>
344 | --><script>alert(123)</script>
345 | ";alert(123);t="
346 | ';alert(123);t='
347 | JavaSCript:alert(123)
348 | ;alert(123);
349 | src=JaVaSCript:prompt(132)
350 | "><script>alert(123);</script x="
351 | '><script>alert(123);</script x='
352 | ><script>alert(123);</script x=
353 | " autofocus onkeyup="javascript:alert(123)
354 | ' autofocus onkeyup='javascript:alert(123)
355 | <script\x20type="text/javascript">javascript:alert(1);</script>
356 | <script\x3Etype="text/javascript">javascript:alert(1);</script>
357 | <script\x0Dtype="text/javascript">javascript:alert(1);</script>
358 | <script\x09type="text/javascript">javascript:alert(1);</script>
359 | <script\x0Ctype="text/javascript">javascript:alert(1);</script>
360 | <script\x2Ftype="text/javascript">javascript:alert(1);</script>
361 | <script\x0Atype="text/javascript">javascript:alert(1);</script>
362 | '`"><\x3Cscript>javascript:alert(1)</script>
363 | '`"><\x00script>javascript:alert(1)</script>
364 | ABC<div style="x\x3Aexpression(javascript:alert(1)">DEF
365 | ABC<div style="x:expression\x5C(javascript:alert(1)">DEF
366 | ABC<div style="x:expression\x00(javascript:alert(1)">DEF
367 | ABC<div style="x:exp\x00ression(javascript:alert(1)">DEF
368 | ABC<div style="x:exp\x5Cression(javascript:alert(1)">DEF
369 | ABC<div style="x:\x0Aexpression(javascript:alert(1)">DEF
370 | ABC<div style="x:\x09expression(javascript:alert(1)">DEF
371 | ABC<div style="x:\xE3\x80\x80expression(javascript:alert(1)">DEF
372 | ABC<div style="x:\xE2\x80\x84expression(javascript:alert(1)">DEF
373 | ABC<div style="x:\xC2\xA0expression(javascript:alert(1)">DEF
374 | ABC<div style="x:\xE2\x80\x80expression(javascript:alert(1)">DEF
375 | ABC<div style="x:\xE2\x80\x8Aexpression(javascript:alert(1)">DEF
376 | ABC<div style="x:\x0Dexpression(javascript:alert(1)">DEF
377 | ABC<div style="x:\x0Cexpression(javascript:alert(1)">DEF
378 | ABC<div style="x:\xE2\x80\x87expression(javascript:alert(1)">DEF
379 | ABC<div style="x:\xEF\xBB\xBFexpression(javascript:alert(1)">DEF
380 | ABC<div style="x:\x20expression(javascript:alert(1)">DEF
381 | ABC<div style="x:\xE2\x80\x88expression(javascript:alert(1)">DEF
382 | ABC<div style="x:\x00expression(javascript:alert(1)">DEF
383 | ABC<div style="x:\xE2\x80\x8Bexpression(javascript:alert(1)">DEF
384 | ABC<div style="x:\xE2\x80\x86expression(javascript:alert(1)">DEF
385 | ABC<div style="x:\xE2\x80\x85expression(javascript:alert(1)">DEF
386 | ABC<div style="x:\xE2\x80\x82expression(javascript:alert(1)">DEF
387 | ABC<div style="x:\x0Bexpression(javascript:alert(1)">DEF
388 | ABC<div style="x:\xE2\x80\x81expression(javascript:alert(1)">DEF
389 | ABC<div style="x:\xE2\x80\x83expression(javascript:alert(1)">DEF
390 | ABC<div style="x:\xE2\x80\x89expression(javascript:alert(1)">DEF
391 | <a href="\x0Bjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
392 | <a href="\x0Fjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
393 | <a href="\xC2\xA0javascript:javascript:alert(1)" id="fuzzelement1">test</a>
394 | <a href="\x05javascript:javascript:alert(1)" id="fuzzelement1">test</a>
395 | <a href="\xE1\xA0\x8Ejavascript:javascript:alert(1)" id="fuzzelement1">test</a>
396 | <a href="\x18javascript:javascript:alert(1)" id="fuzzelement1">test</a>
397 | <a href="\x11javascript:javascript:alert(1)" id="fuzzelement1">test</a>
398 | <a href="\xE2\x80\x88javascript:javascript:alert(1)" id="fuzzelement1">test</a>
399 | <a href="\xE2\x80\x89javascript:javascript:alert(1)" id="fuzzelement1">test</a>
400 | <a href="\xE2\x80\x80javascript:javascript:alert(1)" id="fuzzelement1">test</a>
401 | <a href="\x17javascript:javascript:alert(1)" id="fuzzelement1">test</a>
402 | <a href="\x03javascript:javascript:alert(1)" id="fuzzelement1">test</a>
403 | <a href="\x0Ejavascript:javascript:alert(1)" id="fuzzelement1">test</a>
404 | <a href="\x1Ajavascript:javascript:alert(1)" id="fuzzelement1">test</a>
405 | <a href="\x00javascript:javascript:alert(1)" id="fuzzelement1">test</a>
406 | <a href="\x10javascript:javascript:alert(1)" id="fuzzelement1">test</a>
407 | <a href="\xE2\x80\x82javascript:javascript:alert(1)" id="fuzzelement1">test</a>
408 | <a href="\x20javascript:javascript:alert(1)" id="fuzzelement1">test</a>
409 | <a href="\x13javascript:javascript:alert(1)" id="fuzzelement1">test</a>
410 | <a href="\x09javascript:javascript:alert(1)" id="fuzzelement1">test</a>
411 | <a href="\xE2\x80\x8Ajavascript:javascript:alert(1)" id="fuzzelement1">test</a>
412 | <a href="\x14javascript:javascript:alert(1)" id="fuzzelement1">test</a>
413 | <a href="\x19javascript:javascript:alert(1)" id="fuzzelement1">test</a>
414 | <a href="\xE2\x80\xAFjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
415 | <a href="\x1Fjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
416 | <a href="\xE2\x80\x81javascript:javascript:alert(1)" id="fuzzelement1">test</a>
417 | <a href="\x1Djavascript:javascript:alert(1)" id="fuzzelement1">test</a>
418 | <a href="\xE2\x80\x87javascript:javascript:alert(1)" id="fuzzelement1">test</a>
419 | <a href="\x07javascript:javascript:alert(1)" id="fuzzelement1">test</a>
420 | <a href="\xE1\x9A\x80javascript:javascript:alert(1)" id="fuzzelement1">test</a>
421 | <a href="\xE2\x80\x83javascript:javascript:alert(1)" id="fuzzelement1">test</a>
422 | <a href="\x04javascript:javascript:alert(1)" id="fuzzelement1">test</a>
423 | <a href="\x01javascript:javascript:alert(1)" id="fuzzelement1">test</a>
424 | <a href="\x08javascript:javascript:alert(1)" id="fuzzelement1">test</a>
425 | <a href="\xE2\x80\x84javascript:javascript:alert(1)" id="fuzzelement1">test</a>
426 | <a href="\xE2\x80\x86javascript:javascript:alert(1)" id="fuzzelement1">test</a>
427 | <a href="\xE3\x80\x80javascript:javascript:alert(1)" id="fuzzelement1">test</a>
428 | <a href="\x12javascript:javascript:alert(1)" id="fuzzelement1">test</a>
429 | <a href="\x0Djavascript:javascript:alert(1)" id="fuzzelement1">test</a>
430 | <a href="\x0Ajavascript:javascript:alert(1)" id="fuzzelement1">test</a>
431 | <a href="\x0Cjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
432 | <a href="\x15javascript:javascript:alert(1)" id="fuzzelement1">test</a>
433 | <a href="\xE2\x80\xA8javascript:javascript:alert(1)" id="fuzzelement1">test</a>
434 | <a href="\x16javascript:javascript:alert(1)" id="fuzzelement1">test</a>
435 | <a href="\x02javascript:javascript:alert(1)" id="fuzzelement1">test</a>
436 | <a href="\x1Bjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
437 | <a href="\x06javascript:javascript:alert(1)" id="fuzzelement1">test</a>
438 | <a href="\xE2\x80\xA9javascript:javascript:alert(1)" id="fuzzelement1">test</a>
439 | <a href="\xE2\x80\x85javascript:javascript:alert(1)" id="fuzzelement1">test</a>
440 | <a href="\x1Ejavascript:javascript:alert(1)" id="fuzzelement1">test</a>
441 | <a href="\xE2\x81\x9Fjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
442 | <a href="\x1Cjavascript:javascript:alert(1)" id="fuzzelement1">test</a>
443 | <a href="javascript\x00:javascript:alert(1)" id="fuzzelement1">test</a>
444 | <a href="javascript\x3A:javascript:alert(1)" id="fuzzelement1">test</a>
445 | <a href="javascript\x09:javascript:alert(1)" id="fuzzelement1">test</a>
446 | <a href="javascript\x0D:javascript:alert(1)" id="fuzzelement1">test</a>
447 | <a href="javascript\x0A:javascript:alert(1)" id="fuzzelement1">test</a>
448 | `"'><img src=xxx:x \x0Aonerror=javascript:alert(1)>
449 | `"'><img src=xxx:x \x22onerror=javascript:alert(1)>
450 | `"'><img src=xxx:x \x0Bonerror=javascript:alert(1)>
451 | `"'><img src=xxx:x \x0Donerror=javascript:alert(1)>
452 | `"'><img src=xxx:x \x2Fonerror=javascript:alert(1)>
453 | `"'><img src=xxx:x \x09onerror=javascript:alert(1)>
454 | `"'><img src=xxx:x \x0Conerror=javascript:alert(1)>
455 | `"'><img src=xxx:x \x00onerror=javascript:alert(1)>
456 | `"'><img src=xxx:x \x27onerror=javascript:alert(1)>
457 | `"'><img src=xxx:x \x20onerror=javascript:alert(1)>
458 | "`'><script>\x3Bjavascript:alert(1)</script>
459 | "`'><script>\x0Djavascript:alert(1)</script>
460 | "`'><script>\xEF\xBB\xBFjavascript:alert(1)</script>
461 | "`'><script>\xE2\x80\x81javascript:alert(1)</script>
462 | "`'><script>\xE2\x80\x84javascript:alert(1)</script>
463 | "`'><script>\xE3\x80\x80javascript:alert(1)</script>
464 | "`'><script>\x09javascript:alert(1)</script>
465 | "`'><script>\xE2\x80\x89javascript:alert(1)</script>
466 | "`'><script>\xE2\x80\x85javascript:alert(1)</script>
467 | "`'><script>\xE2\x80\x88javascript:alert(1)</script>
468 | "`'><script>\x00javascript:alert(1)</script>
469 | "`'><script>\xE2\x80\xA8javascript:alert(1)</script>
470 | "`'><script>\xE2\x80\x8Ajavascript:alert(1)</script>
471 | "`'><script>\xE1\x9A\x80javascript:alert(1)</script>
472 | "`'><script>\x0Cjavascript:alert(1)</script>
473 | "`'><script>\x2Bjavascript:alert(1)</script>
474 | "`'><script>\xF0\x90\x96\x9Ajavascript:alert(1)</script>
475 | "`'><script>-javascript:alert(1)</script>
476 | "`'><script>\x0Ajavascript:alert(1)</script>
477 | "`'><script>\xE2\x80\xAFjavascript:alert(1)</script>
478 | "`'><script>\x7Ejavascript:alert(1)</script>
479 | "`'><script>\xE2\x80\x87javascript:alert(1)</script>
480 | "`'><script>\xE2\x81\x9Fjavascript:alert(1)</script>
481 | "`'><script>\xE2\x80\xA9javascript:alert(1)</script>
482 | "`'><script>\xC2\x85javascript:alert(1)</script>
483 | "`'><script>\xEF\xBF\xAEjavascript:alert(1)</script>
484 | "`'><script>\xE2\x80\x83javascript:alert(1)</script>
485 | "`'><script>\xE2\x80\x8Bjavascript:alert(1)</script>
486 | "`'><script>\xEF\xBF\xBEjavascript:alert(1)</script>
487 | "`'><script>\xE2\x80\x80javascript:alert(1)</script>
488 | "`'><script>\x21javascript:alert(1)</script>
489 | "`'><script>\xE2\x80\x82javascript:alert(1)</script>
490 | "`'><script>\xE2\x80\x86javascript:alert(1)</script>
491 | "`'><script>\xE1\xA0\x8Ejavascript:alert(1)</script>
492 | "`'><script>\x0Bjavascript:alert(1)</script>
493 | "`'><script>\x20javascript:alert(1)</script>
494 | "`'><script>\xC2\xA0javascript:alert(1)</script>
495 | <img \x00src=x onerror="alert(1)">
496 | <img \x47src=x onerror="javascript:alert(1)">
497 | <img \x11src=x onerror="javascript:alert(1)">
498 | <img \x12src=x onerror="javascript:alert(1)">
499 | <img\x47src=x onerror="javascript:alert(1)">
500 | <img\x10src=x onerror="javascript:alert(1)">
501 | <img\x13src=x onerror="javascript:alert(1)">
502 | <img\x32src=x onerror="javascript:alert(1)">
503 | <img\x47src=x onerror="javascript:alert(1)">
504 | <img\x11src=x onerror="javascript:alert(1)">
505 | <img \x47src=x onerror="javascript:alert(1)">
506 | <img \x34src=x onerror="javascript:alert(1)">
507 | <img \x39src=x onerror="javascript:alert(1)">
508 | <img \x00src=x onerror="javascript:alert(1)">
509 | <img src\x09=x onerror="javascript:alert(1)">
510 | <img src\x10=x onerror="javascript:alert(1)">
511 | <img src\x13=x onerror="javascript:alert(1)">
512 | <img src\x32=x onerror="javascript:alert(1)">
513 | <img src\x12=x onerror="javascript:alert(1)">
514 | <img src\x11=x onerror="javascript:alert(1)">
515 | <img src\x00=x onerror="javascript:alert(1)">
516 | <img src\x47=x onerror="javascript:alert(1)">
517 | <img src=x\x09onerror="javascript:alert(1)">
518 | <img src=x\x10onerror="javascript:alert(1)">
519 | <img src=x\x11onerror="javascript:alert(1)">
520 | <img src=x\x12onerror="javascript:alert(1)">
521 | <img src=x\x13onerror="javascript:alert(1)">
522 | <img[a][b][c]src[d]=x[e]onerror=[f]"alert(1)">
523 | <img src=x onerror=\x09"javascript:alert(1)">
524 | <img src=x onerror=\x10"javascript:alert(1)">
525 | <img src=x onerror=\x11"javascript:alert(1)">
526 | <img src=x onerror=\x12"javascript:alert(1)">
527 | <img src=x onerror=\x32"javascript:alert(1)">
528 | <img src=x onerror=\x00"javascript:alert(1)">
529 | <a href=java&#5&#6&#7&#8&#11&#12script:javascript:alert(1)>XXX</a>
530 | <img src="x` `<script>javascript:alert(1)</script>"` `>
531 | <img src onerror /" '"= alt=javascript:alert(1)//">
532 | <title onpropertychange=javascript:alert(1)></title><title title=>
533 | <a href=http://foo.bar/#x=`y></a><img alt="`><img src=x:x onerror=javascript:alert(1)></a>">
534 | <!--[if]><script>javascript:alert(1)</script -->
535 | <!--[if<img src=x onerror=javascript:alert(1)//]> -->
536 | <script src="/\%(jscript)s"></script>
537 | <script src="\\%(jscript)s"></script>
538 | <IMG """><SCRIPT>alert("XSS")</SCRIPT>">
539 | <IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>
540 | <IMG SRC=# onmouseover="alert('xxs')">
541 | <IMG SRC= onmouseover="alert('xxs')">
542 | <IMG onmouseover="alert('xxs')">
543 | <IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>
544 | <IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>
545 | <IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>
546 | <IMG SRC="jav   ascript:alert('XSS');">
547 | <IMG SRC="jav&#x09;ascript:alert('XSS');">
548 | <IMG SRC="jav&#x0A;ascript:alert('XSS');">
549 | <IMG SRC="jav&#x0D;ascript:alert('XSS');">
550 | perl -e 'print "<IMG SRC=java\0script:alert(\"XSS\")>";' > out
551 | <IMG SRC=" &#14;  javascript:alert('XSS');">
552 | <SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>
553 | <BODY onload!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>
554 | <SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>
555 | <<SCRIPT>alert("XSS");//<</SCRIPT>
556 | <SCRIPT SRC=http://ha.ckers.org/xss.js?< B >
557 | <SCRIPT SRC=//ha.ckers.org/.j>
558 | <IMG SRC="javascript:alert('XSS')"
559 | <iframe src=http://ha.ckers.org/scriptlet.html <
560 | \";alert('XSS');//
561 | <u oncopy=alert()> Copy me</u>
562 | <i onwheel=alert(1)> Scroll over me </i>
563 | <plaintext>
564 | http://a/%%30%30
565 | </textarea><script>alert(123)</script>
566 | 
567 | #	SQL Injection
568 | #
569 | #	Strings which can cause a SQL injection if inputs are not sanitized
570 | 
571 | 1;DROP TABLE users
572 | 1'; DROP TABLE users-- 1
573 | ' OR 1=1 -- 1
574 | ' OR '1'='1
575 |  
576 | %
577 | _
578 | 
579 | #	Server Code Injection
580 | #
581 | #	Strings which can cause a user to run code on a server as a privileged user (c.f. https://news.ycombinator.com/item?id=7665153)
582 | 
583 | -
584 | --
585 | --version
586 | --help
587 | $USER
588 | /dev/null; touch /tmp/blns.fail ; echo
589 | `touch /tmp/blns.fail`
590 | $(touch /tmp/blns.fail)
591 | @{[system "touch /tmp/blns.fail"]}
592 | 
593 | #	Command Injection (Ruby)
594 | #
595 | #	Strings which can call system commands within Ruby/Rails applications
596 | 
597 | eval("puts 'hello world'")
598 | System("ls -al /")
599 | `ls -al /`
600 | Kernel.exec("ls -al /")
601 | Kernel.exit(1)
602 | %x('ls -al /')
603 | 
604 | #      XXE Injection (XML)
605 | #
606 | #	String which can reveal system files when parsed by a badly configured XML parser
607 | 
608 | <?xml version="1.0" encoding="ISO-8859-1"?><!DOCTYPE foo [ <!ELEMENT foo ANY ><!ENTITY xxe SYSTEM "file:///etc/passwd" >]><foo>&xxe;</foo>
609 | 
610 | #	Unwanted Interpolation
611 | #
612 | #	Strings which can be accidentally expanded into different strings if evaluated in the wrong context, e.g. used as a printf format string or via Perl or shell eval. Might expose sensitive data from the program doing the interpolation, or might just represent the wrong string.
613 | 
614 | $HOME
615 | $ENV{'HOME'}
616 | %d
617 | %s%s%s%s%s
618 | {0}
619 | %*.*s
620 | %@
621 | %n
622 | File:///
623 | 
624 | #	File Inclusion
625 | #
626 | #	Strings which can cause a user to pull in files that should not be a part of a web server
627 | 
628 | ../../../../../../../../../../../etc/passwd%00
629 | ../../../../../../../../../../../etc/hosts
630 | 
631 | #	Known CVEs and Vulnerabilities
632 | #
633 | #	Strings that test for known vulnerabilities
634 | 
635 | () { 0; }; touch /tmp/blns.shellshock1.fail;
636 | () { _; } >_[$($())] { touch /tmp/blns.shellshock2.fail; }
637 | <<< %s(un='%s') = %u
638 | +++ATH0
639 | 
640 | #	MSDOS/Windows Special Filenames
641 | #
642 | #	Strings which are reserved characters in MSDOS/Windows
643 | 
644 | CON
645 | PRN
646 | AUX
647 | CLOCK$
648 | NUL
649 | A:
650 | ZZ:
651 | COM1
652 | LPT1
653 | LPT2
654 | LPT3
655 | COM2
656 | COM3
657 | COM4
658 | 
659 | #   IRC specific strings
660 | #
661 | #   Strings that may occur on IRC clients that make security products freak out
662 | 
663 | DCC SEND STARTKEYLOGGER 0 0 0
664 | 
665 | #	Scunthorpe Problem
666 | #
667 | #	Innocuous strings which may be blocked by profanity filters (https://en.wikipedia.org/wiki/Scunthorpe_problem)
668 | 
669 | Scunthorpe General Hospital
670 | Penistone Community Church
671 | Lightwater Country Park
672 | Jimmy Clitheroe
673 | Horniman Museum
674 | shitake mushrooms
675 | RomansInSussex.co.uk
676 | http://www.cum.qc.ca/
677 | Craig Cockburn, Software Specialist
678 | Linda Callahan
679 | Dr. Herman I. Libshitz
680 | magna cum laude
681 | Super Bowl XXX
682 | medieval erection of parapets
683 | evaluate
684 | mocha
685 | expression
686 | Arsenal canal
687 | classic
688 | Tyson Gay
689 | Dick Van Dyke
690 | basement
691 | 
692 | #	Human injection
693 | #
694 | #	Strings which may cause human to reinterpret worldview
695 | 
696 | If you're reading this, you've been in a coma for almost 20 years now. We're trying a new technique. We don't know where this message will end up in your dream, but we hope it works. Please wake up, we miss you.
697 | 
698 | #	Terminal escape codes
699 | #
700 | #	Strings which punish the fools who use cat/type on this file
701 | 
702 | Roses are [0;31mred[0m, violets are [0;34mblue. Hope you enjoy terminal hue
703 | But now...[20Cfor my greatest trick...[8m
704 | The quick brown fox... [Beeeep]
705 | 
706 | #	iOS Vulnerabilities
707 | #
708 | #	Strings which crashed iMessage in various versions of iOS
709 | 
710 | Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗
711 | 🏳0🌈️
712 | జ్ఞ‌ా
713 | 


--------------------------------------------------------------------------------
/include/ustr.h:
--------------------------------------------------------------------------------
 1 | #ifndef __USTR_H__
 2 | #define __USTR_H__
 3 | 
 4 | #include <stddef.h>
 5 | #include <stdint.h> /* uint64_t, used by ustr_hash() */
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | /*
12 |     Handle to an interned string: a single pointer to its characters.
13 |     Passed and returned by value.
14 | */
15 | typedef struct {
16 |     const char* ptr;
17 | } ustr_t;
18 | 
19 | /*
20 |     Create a new ustr_t from the given char*.
21 |     It is assumed that `str` is a valid, non-null pointer. Passing anything else
22 |     will result in undefined behaviour.
23 |     Any invalid UTF-8 in `str` will be replaced by U+FFFD REPLACEMENT CHARACTER
24 | */
25 | ustr_t ustr(const char* str);
26 | 
27 | /*
28 |     Returns the length of the given ustr_t in bytes.
29 | */
30 | size_t ustr_len(ustr_t u);
31 | 
32 | /*
33 |     Returns the precomputed hash for the given ustr_t.
34 | */
35 | uint64_t ustr_hash(ustr_t u);
36 | 
37 | #ifdef __cplusplus
38 | }
39 | #endif
40 | 
41 | #endif


--------------------------------------------------------------------------------
/include/ustr.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef __USTR_HPP__
 2 | #define __USTR_HPP__
 3 | 
 4 | #include "ustr.h"
 5 | #include <string>
 6 | 
 7 | /// Thin C++ wrapper around a `ustr_t` handle to an interned string.
 8 | class Ustr {
 9 |     ustr_t _u;
10 | 
11 | public:
12 |     /// Constructs a handle to the empty string.
13 |     Ustr() : _u(ustr("")) {}
14 | 
15 |     /// Constructs a Ustr by passing a C string to `ustr()`.
16 |     /// It is assumed that `ptr` is a valid, non-null pointer. Passing anything
17 |     /// else will result in undefined behaviour.
18 |     /// Any invalid UTF-8 in `ptr` will be replaced by U+FFFD REPLACEMENT
19 |     /// CHARACTER.
20 |     Ustr(const char* ptr) : _u(ustr(ptr)) {}
21 | 
22 |     /// Constructs a Ustr from the `c_str()` of a std::string.
23 |     Ustr(const std::string& s) : _u(ustr(s.c_str())) {}
24 | 
25 |     /// True when the string has zero length.
26 |     bool is_empty() const { return ustr_len(_u) == 0; }
27 | 
28 |     /// Length of the string, in bytes.
29 |     size_t len() const { return ustr_len(_u); }
30 | 
31 |     /// Precomputed hash of the string, as returned by `ustr_hash()`.
32 |     size_t hash() const { return ustr_hash(_u); }
33 | 
34 |     /// Implicit conversion to the underlying C struct.
35 |     operator ustr_t() const { return _u; }
36 | 
37 |     /// Pointer to the interned characters.
38 |     const char* c_str() const { return _u.ptr; }
39 | };
40 | 
41 | #endif


--------------------------------------------------------------------------------
/miri.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | 
 3 | # Runs the test suite under Miri on the most recent nightly that ships it.
 4 | 
 5 | # Abort on the first failing command (-e) and echo each command (-x).
 6 | set -ex
 7 | 
 8 | # Cargo network settings: number of retries and timeout.
 9 | export CARGO_NET_RETRY=5
10 | export CARGO_NET_TIMEOUT=10
11 | 
12 | # Look up the latest nightly that includes Miri and make it the default
13 | # toolchain. NOTE(review): the URL presumably returns that nightly's date.
14 | MIRI_NIGHTLY=nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/miri)
15 | echo "Installing latest nightly with Miri: $MIRI_NIGHTLY"
16 | rustup default "$MIRI_NIGHTLY"
17 | 
18 | # Install the Miri component and run its setup step.
19 | rustup component add miri
20 | cargo miri setup
21 | 
22 | # Run the tests on a single test thread, with the `serde` feature enabled.
23 | export RUST_TEST_THREADS=1
24 | cargo miri test --features=serde
25 | 


--------------------------------------------------------------------------------
/mutex_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anderslanglands/ustr/4c1fde840807f8af9d4bfd38805c3d89ac51baeb/mutex_comparison.png


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | edition = "2018"
2 | max_width = 80
3 | wrap_comments = true
4 | 


--------------------------------------------------------------------------------
/src/bumpalloc.rs:
--------------------------------------------------------------------------------
 1 | use std::alloc::{GlobalAlloc, Layout, System};
 2 | 
 3 | // The world's dumbest allocator. Just keep bumping a pointer until we run out
 4 | // of memory, in which case we abort. StringCache is responsible for creating
 5 | // a new allocator when that's about to happen.
 6 | // This is now bumping downward rather than up, which simplifies the allocate()
 7 | // method and gives a small (5-7%) performance improvement in multithreaded
 8 | // benchmarks
 9 | // See https://fitzgeraldnick.com/2019/11/01/always-bump-downwards.html
10 | /// Bump allocator over a single fixed-size region from the system allocator.
11 | /// Chunks are carved off from the top downwards and never freed individually.
12 | pub(crate) struct LeakyBumpAlloc {
13 |     layout: Layout, // layout of the region; reused by `clear` and `capacity`
14 |     start: *mut u8, // lowest address of the region
15 |     end: *mut u8,   // one past the highest address of the region
16 |     ptr: *mut u8,   // bump cursor; equals `end` while nothing is allocated
17 | }
18 | 
19 | impl LeakyBumpAlloc {
20 |     /// Allocates a region of `capacity` bytes aligned to `alignment`.
21 |     /// Panics on zero `capacity`, an invalid layout, or allocation failure.
22 |     pub fn new(capacity: usize, alignment: usize) -> LeakyBumpAlloc {
23 |         // `GlobalAlloc::alloc` is undefined behaviour for a zero-sized
24 |         // layout, so this safe constructor has to reject it up front.
25 |         assert!(capacity > 0, "LeakyBumpAlloc capacity must be non-zero");
26 |         let layout = Layout::from_size_align(capacity, alignment).unwrap();
27 |         // SAFETY: `layout` has a non-zero size, checked above.
28 |         let start = unsafe { System.alloc(layout) };
29 |         if start.is_null() {
30 |             panic!("oom");
31 |         }
32 |         // SAFETY: `start` is the base of a live allocation of
33 |         // `layout.size()` bytes, so one-past-the-end is still in bounds.
34 |         let end = unsafe { start.add(layout.size()) };
35 |         // Bumping downwards: the cursor starts at the top of the region.
36 |         LeakyBumpAlloc {
37 |             layout,
38 |             start,
39 |             end,
40 |             ptr: end,
41 |         }
42 |     }
43 | 
44 |     #[doc(hidden)]
45 |     // used for resetting the cache between benchmark runs. DO NOT CALL THIS.
46 |     // Hands the region back to the system allocator; the stored pointers
47 |     // are not reset and dangle afterwards.
48 |     pub unsafe fn clear(&mut self) {
49 |         System.dealloc(self.start, self.layout);
50 |     }
51 | 
52 |     // Allocates a new chunk of at least `num_bytes` bytes, aligned to the
53 |     // alignment given to `new`. Aborts if out of memory; panics if
54 |     // subtracting `num_bytes` from the cursor address underflows.
55 |     pub unsafe fn allocate(&mut self, num_bytes: usize) -> *mut u8 {
56 |         // Our new ptr will be offset down the heap by num_bytes bytes.
57 |         let ptr = self.ptr as usize;
58 |         let new_ptr = ptr.checked_sub(num_bytes).expect("ptr sub overflowed");
59 |         // Round down to alignment.
60 |         let new_ptr = new_ptr & !(self.layout.align() - 1);
61 |         // Check we have enough capacity.
62 |         let start = self.start as usize;
63 |         if new_ptr < start {
64 |             eprintln!(
65 |                 "Allocator asked to bump to {} bytes with a capacity of {}",
66 |                 self.end as usize - new_ptr,
67 |                 self.capacity()
68 |             );
69 |             // We have to abort here rather than panic or the mutex may
70 |             // deadlock.
71 |             std::process::abort();
72 |         }
73 | 
74 |         // `start <= new_ptr <= ptr`, so the cursor stays inside the region.
75 |         self.ptr = self.ptr.sub(ptr - new_ptr);
76 |         self.ptr
77 |     }
78 | 
79 |     // Bytes handed out so far, including alignment padding.
80 |     pub fn allocated(&self) -> usize {
81 |         self.end as usize - self.ptr as usize
82 |     }
83 | 
84 |     // Size of the region in bytes.
85 |     pub fn capacity(&self) -> usize {
86 |         self.layout.size()
87 |     }
88 | 
89 |     // One past the highest address of the region.
90 |     pub(crate) fn end(&self) -> *const u8 {
91 |         self.end
92 |     }
93 | 
94 |     // Current bump cursor.
95 |     pub(crate) fn ptr(&self) -> *const u8 {
96 |         self.ptr
97 |     }
98 | }
99 | 


--------------------------------------------------------------------------------
/src/hash.rs:
--------------------------------------------------------------------------------
 1 | use super::Ustr;
 2 | use byteorder::{ByteOrder, NativeEndian};
 3 | use std::{
 4 |     collections::{HashMap, HashSet},
 5 |     hash::{BuildHasherDefault, Hasher},
 6 | };
 7 | 
 8 | /// A standard `HashMap` using `Ustr` as the key type with a custom `Hasher`
 9 | /// that just uses the precomputed hash for speed instead of calculating it.
10 | pub type UstrMap<V> = HashMap<Ustr, V, BuildHasherDefault<IdentityHasher>>;
11 | 
12 | /// A standard `HashSet` using `Ustr` as the key type with a custom `Hasher`
13 | /// that just uses the precomputed hash for speed instead of calculating it.
14 | pub type UstrSet = HashSet<Ustr, BuildHasherDefault<IdentityHasher>>;
15 | 
16 | /// The worst hasher in the world -- the identity hasher.
17 | #[doc(hidden)]
18 | #[derive(Default)]
19 | pub struct IdentityHasher {
20 |     hash: u64, // value handed back by `finish`; stays 0 until a write lands
21 | }
22 | 
23 | impl Hasher for IdentityHasher {
24 |     #[inline]
25 |     fn write(&mut self, bytes: &[u8]) {
26 |         if bytes.len() == 8 { // other lengths are silently ignored
27 |             self.hash = NativeEndian::read_u64(bytes);
28 |         }
29 |     }
30 | 
31 |     #[inline]
32 |     fn finish(&self) -> u64 {
33 |         self.hash // the last 8-byte value written, returned unchanged
34 |     }
35 | }
36 | 
37 | #[test]
38 | fn test_hashing() {
39 |     let _t = super::TEST_LOCK.lock(); // presumably serialises cache tests
40 |     use crate::ustr as u;
41 | 
42 |     use std::hash::Hash;
43 |     let u1 = u("the quick brown fox");
44 |     let u2 = u("jumped over the lazy dog");
45 |     // Hashing a `Ustr` through `IdentityHasher` must yield its stored hash.
46 |     let mut hasher = IdentityHasher::default();
47 |     u1.hash(&mut hasher);
48 |     assert_eq!(hasher.finish(), u1.precomputed_hash());
49 |     // Same check for the second string, with a fresh hasher.
50 |     let mut hasher = IdentityHasher::default();
51 |     u2.hash(&mut hasher);
52 |     assert_eq!(hasher.finish(), u2.precomputed_hash());
53 |     // Both strings must also work as keys of a `UstrMap`.
54 |     let mut hm = UstrMap::<u32>::default();
55 |     hm.insert(u1, 17);
56 |     hm.insert(u2, 42);
57 |     // Lookups must return the values inserted above.
58 |     assert_eq!(hm.get(&u1), Some(&17));
59 |     assert_eq!(hm.get(&u2), Some(&42));
60 | }
61 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
   1 | //! Fast, FFI-friendly string interning. A [`Ustr`] (**U**nique **Str**) is a
   2 | //! lightweight handle representing a static, immutable entry in a global string
   3 | //! cache, allowing for:
   4 | //!
   5 | //! * Extremely fast string assignment and comparisons -- it's just a pointer
   6 | //!   comparison.
   7 | //!
   8 | //! * Efficient storage -- only one copy of the string is held in memory, and
   9 | //!   getting access to it is just a pointer indirection.
  10 | //!
  11 | //! * Fast hashing -- the precomputed hash is stored with the string.
  12 | //!
  13 | //! * Fast FFI -- the string is stored with a terminating null byte so can be
  14 | //!   passed to C directly without doing the `CString` dance.
  15 | //!
  16 | //! The downside is no strings are ever freed, so if you're creating lots and
  17 | //! lots of strings, you might run out of memory. On the other hand, War and
  18 | //! Peace is only 3MB, so it's probably fine.
  19 | //!
  20 | //! This crate is based on [OpenImageIO's](https://openimageio.readthedocs.io/en/v2.4.10.0/)
  21 | //! (OIIO) [`ustring`](https://github.com/OpenImageIO/oiio/blob/master/src/include/OpenImageIO/ustring.h)
  22 | //! but it is *not* binary-compatible (yet). The underlying hash map
  23 | //! implementation is directly ported from OIIO.
  24 | //!
  25 | //! # Usage
  26 | //!
  27 | //! ```
  28 | //! use ustr::{Ustr, ustr, ustr as u};
  29 | //!
  30 | //! # unsafe { ustr::_clear_cache() };
  31 | //! // Creation is quick and easy using either `Ustr::from` or the ustr function
  32 | //! // and only one copy of any string is stored.
  33 | //! let u1 = Ustr::from("the quick brown fox");
  34 | //! let u2 = ustr("the quick brown fox");
  35 | //!
  36 | //! // Comparisons and copies are extremely cheap.
  37 | //! let u3 = u1;
  38 | //! assert_eq!(u2, u3);
  39 | //!
  40 | //! // You can pass straight to FFI.
  41 | //! let len = unsafe {
  42 | //!     libc::strlen(u1.as_char_ptr())
  43 | //! };
  44 | //! assert_eq!(len, 19);
  45 | //!
  46 | //! // Use as_str() to get a `str`.
  47 | //! let words: Vec<&str> = u1.as_str().split_whitespace().collect();
  48 | //! assert_eq!(words, ["the", "quick", "brown", "fox"]);
  49 | //!
  50 | //! // For best performance when using Ustr as key for a HashMap or HashSet,
  51 | //! // you'll want to use the precomputed hash. To make this easier, just use
  52 | //! // the UstrMap and UstrSet exports:
  53 | //! use ustr::UstrMap;
  54 | //!
  55 | //! // Key type is always `Ustr`.
  56 | //! let mut map: UstrMap<usize> = UstrMap::default();
  57 | //! map.insert(u1, 17);
  58 | //! assert_eq!(*map.get(&u1).unwrap(), 17);
  59 | //! ```
  60 | //!
  61 | //! By enabling the `"serde"` feature you can serialize individual `Ustr`s
  62 | //! or the whole cache with serde.
  63 | //!
  64 | //! ```
  65 | //! # #[cfg(feature = "serde")] {
  66 | //! use ustr::{Ustr, ustr};
  67 | //! let u_ser = ustr("serde");
  68 | //! let json = serde_json::to_string(&u_ser).unwrap();
  69 | //! let u_de : Ustr = serde_json::from_str(&json).unwrap();
  70 | //! assert_eq!(u_ser, u_de);
  71 | //! # }
  72 | //! ```
  73 | //!
  74 | //! Since the cache is global, use the `ustr::DeserializedCache` dummy object to
  75 | //! drive the deserialization.
  76 | //!
  77 | //! ```
  78 | //! # #[cfg(feature = "serde")] {
  79 | //! use ustr::{Ustr, ustr};
  80 | //! ustr("Send me to JSON and back");
  81 | //! let json = serde_json::to_string(ustr::cache()).unwrap();
  82 | //!
  83 | //! // ... some time later ...
  84 | //! let _: ustr::DeserializedCache = serde_json::from_str(&json).unwrap();
  85 | //! assert_eq!(ustr::num_entries(), 1);
  86 | //! assert_eq!(ustr::string_cache_iter().collect::<Vec<_>>(), vec!["Send me to JSON and back"]);
  87 | //! # }
  88 | //! ```
  89 | //!
  90 | //! ## Why?
  91 | //!
  92 | //! It is common in certain types of applications to use strings as identifiers,
  93 | //! but not really do any processing with them.
  94 | //! To paraphrase from OIIO's `Ustring` documentation -- compared to standard
  95 | //! strings, `Ustr`s have several advantages:
  96 | //!
  97 | //!   - Each individual `Ustr` is very small -- in fact, we guarantee that a
  98 | //!     `Ustr` is the same size and memory layout as an ordinary `*u8`.
  99 | //!
 100 | //!   - Storage is frugal, since there is only one allocated copy of each unique
 101 | //!     character sequence, throughout the lifetime of the program.
 102 | //!
 103 | //!   - Assignment from one `Ustr` to another is just copy of the pointer; no
 104 | //!     allocation, no character copying, no reference counting.
 105 | //!
 106 | //!   - Equality testing (do the strings contain the same characters) is a
 107 | //!     single operation, the comparison of the pointer.
 108 | //!
 109 | //!   - Memory allocation only occurs when a new `Ustr` is constructed from raw
 110 | //!     characters the FIRST time -- subsequent constructions of the same string
 111 | //!     just find it in the canonical string set, but don't need to allocate
 112 | //!     new storage.  Destruction of a `Ustr` is trivial, there is no
 113 | //!     de-allocation because the canonical version stays in the set.  Also,
 114 | //!     therefore, no user code mistake can lead to memory leaks.
 115 | //!
 116 | //! But there are some problems, too.  Canonical strings are never freed
 117 | //! from the table.  So in some sense all the strings "leak", but they
 118 | //! only leak one copy for each unique string that the program ever comes
 119 | //! across.
 120 | //!
 121 | //! On the whole, `Ustr`s are a really great string representation
 122 | //!
 123 | //!   - if you tend to have (relatively) few unique strings, but many copies of
 124 | //!     those strings;
 125 | //!
 126 | //!   - if the creation of strings from raw characters is relatively rare
 127 | //!     compared to copying or comparing to existing strings;
 128 | //!
 129 | //!   - if you tend to make the same strings over and over again, and if it's
 130 | //!     relatively rare that a single unique character sequence is used only
 131 | //!     once in the entire lifetime of the program;
 132 | //!
 133 | //!   - if your most common string operations are assignment and equality
 134 | //!     testing and you want them to be as fast as possible;
 135 | //!
 136 | //!   - if you are doing relatively little character-by-character assembly of
 137 | //!     strings, string concatenation, or other "string manipulation" (other
 138 | //!     than equality testing).
 139 | //!
 140 | //! `Ustr`s are not so hot
 141 | //!
 142 | //!   - if your program tends to have very few copies of each character sequence
 143 | //!     over the entire lifetime of the program;
 144 | //!
 145 | //!   - if your program tends to generate a huge variety of unique strings over
 146 | //!     its lifetime, each of which is used only a short time and then
 147 | //!     discarded, never to be needed again;
 148 | //!
 149 | //!   - if you don't need to do a lot of string assignment or equality testing,
 150 | //!     but lots of more complex string manipulation.
 151 | //!
 152 | //! ## Safety and Compatibility
 153 | //!
 154 | //! This crate contains a significant amount of unsafe but usage has been
 155 | //! checked and is well-documented. It is also run through Miri as part of the
 156 | //! CI process. I use it regularly on 64-bit systems, and it has passed Miri on
 157 | //! a 32-bit system as well, but 32-bit is not checked regularly. If you want to
 158 | //! use it on 32-bit, please make sure to run Miri and open an issue if you
 159 | //! find any problems.
 160 | use parking_lot::Mutex;
 161 | use std::{
 162 |     borrow::Cow,
 163 |     cmp::Ordering,
 164 |     ffi::{CStr, OsStr},
 165 |     fmt,
 166 |     hash::{Hash, Hasher},
 167 |     ops::Deref,
 168 |     os::raw::c_char,
 169 |     path::Path,
 170 |     ptr::NonNull,
 171 |     rc::Rc,
 172 |     slice, str,
 173 |     str::FromStr,
 174 |     sync::Arc,
 175 | };
 176 | 
 177 | mod hash;
 178 | pub use hash::*;
 179 | mod bumpalloc;
 180 | 
 181 | mod stringcache;
 182 | pub use stringcache::*;
 183 | #[cfg(feature = "serde")]
 184 | pub mod serialization;
 185 | #[cfg(feature = "serde")]
 186 | pub use serialization::DeserializedCache;
 187 | 
 188 | /// A handle representing a string in the global string cache.
 189 | ///
 190 | /// To use, create one using [`Ustr::from`] or the [`ustr`] function. You can
 191 | /// freely copy, destroy or send `Ustr`s to other threads: the underlying string
 192 | /// is always valid in memory (and is never destroyed).
 193 | #[derive(Copy, Clone, PartialEq)]
 194 | #[repr(transparent)]
 195 | pub struct Ustr {
 196 |     char_ptr: NonNull<u8>,
 197 | }
 198 | 
 199 | /// Defer to `str` for ordering.
 200 | ///
 201 | /// Lexicographic ordering will be slower than pointer comparison, but much less
 202 | /// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`.
 203 | impl Ord for Ustr {
 204 |     fn cmp(&self, other: &Self) -> Ordering {
 205 |         self.as_str().cmp(other.as_str())
 206 |     }
 207 | }
 208 | 
 209 | /// Defer to `Ord`, which compares the underlying `str`s.
 210 | ///
 211 | /// Lexicographic ordering will be slower than pointer comparison, but much less
 212 | /// surprising if you use `Ustr`s as keys in e.g. a `BTreeMap`.
 213 | #[allow(clippy::non_canonical_partial_ord_impl)]
 214 | impl PartialOrd for Ustr {
 215 |     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
 216 |         Some(self.cmp(other))
 217 |     }
 218 | }
 219 | 
 220 | impl Ustr {
 221 |     /// Create a new `Ustr` from the given `str`.
 222 |     ///
 223 |     /// You can also use the [`ustr`] function.
 224 |     ///
 225 |     /// # Examples
 226 |     ///
 227 |     /// ```
 228 |     /// use ustr::{Ustr, ustr as u};
 229 |     /// # unsafe { ustr::_clear_cache() };
 230 |     ///
 231 |     /// let u1 = Ustr::from("the quick brown fox");
 232 |     /// let u2 = u("the quick brown fox");
 233 |     /// assert_eq!(u1, u2);
 234 |     /// assert_eq!(ustr::num_entries(), 1);
 235 |     /// ```
 236 |     pub fn from(string: &str) -> Ustr {
 237 |         let hash = {
 238 |             let mut hasher = ahash::AHasher::default();
 239 |             hasher.write(string.as_bytes());
 240 |             hasher.finish()
 241 |         };
 242 |         let mut sc = STRING_CACHE.0[whichbin(hash)].lock();
 243 |         Ustr {
 244 |             // SAFETY: sc.insert does not give back a null pointer
 245 |             char_ptr: unsafe {
 246 |                 NonNull::new_unchecked(sc.insert(string, hash) as *mut _)
 247 |             },
 248 |         }
 249 |     }
 250 |     /// Look up `string` without inserting it; `None` if the lookup fails.
 251 |     pub fn from_existing(string: &str) -> Option<Ustr> {
 252 |         let hash = {
 253 |             let mut hasher = ahash::AHasher::default();
 254 |             hasher.write(string.as_bytes());
 255 |             hasher.finish()
 256 |         };
 257 |         let sc = STRING_CACHE.0[whichbin(hash)].lock();
 258 |         sc.get_existing(string, hash).map(|ptr| Ustr {
 259 |             char_ptr: unsafe { NonNull::new_unchecked(ptr as *mut _) },
 260 |         })
 261 |     }
 262 | 
 263 |     /// Get the cached `Ustr` as a `str`.
 264 |     ///
 265 |     /// # Examples
 266 |     ///
 267 |     /// ```
 268 |     /// use ustr::ustr as u;
 269 |     /// # unsafe { ustr::_clear_cache() };
 270 |     ///
 271 |     /// let u_fox = u("the quick brown fox");
 272 |     /// let words: Vec<&str> = u_fox.as_str().split_whitespace().collect();
 273 |     /// assert_eq!(words, ["the", "quick", "brown", "fox"]);
 274 |     /// ```
 275 |     pub fn as_str(&self) -> &'static str {
 276 |         // This is safe if:
 277 |         // 1) self.char_ptr points to a valid address
 278 |         // 2) len is stored in the `StringCacheEntry` directly before char_ptr
 279 |         // 3) char_ptr points to a valid UTF-8 string of len bytes.
 280 |         // All these are guaranteed by StringCache::insert() and by the fact
 281 |         // we can only construct a Ustr from a valid &str.
 282 |         unsafe {
 283 |             str::from_utf8_unchecked(slice::from_raw_parts(
 284 |                 self.char_ptr.as_ptr(),
 285 |                 self.len(),
 286 |             ))
 287 |         }
 288 |     }
 289 | 
 290 |     /// Get the cached string as a C `char*`.
 291 |     ///
 292 |     /// This includes the null terminator so is safe to pass straight to FFI.
 293 |     ///
 294 |     /// # Examples
 295 |     ///
 296 |     /// ```
 297 |     /// use ustr::ustr as u;
 298 |     /// # unsafe { ustr::_clear_cache() };
 299 |     ///
 300 |     /// let u_fox = u("the quick brown fox");
 301 |     /// let len = unsafe {
 302 |     ///     libc::strlen(u_fox.as_char_ptr())
 303 |     /// };
 304 |     /// assert_eq!(len, 19);
 305 |     /// ```
 306 |     ///
 307 |     /// # Safety
 308 |     ///
 309 |     /// This is just passing a raw byte array with a null terminator to C. If
 310 |     /// your source string contains non-ascii bytes then this will pass them
 311 |     /// straight along with no checking.
 312 |     ///
 313 |     /// The string is **immutable**. That means that if you modify it across the
 314 |     /// FFI boundary then all sorts of terrible things will happen.
 315 |     pub fn as_char_ptr(&self) -> *const c_char {
 316 |         self.char_ptr.as_ptr() as *const c_char
 317 |     }
 318 | 
 319 |     /// Get this `Ustr` as a [`CStr`]
 320 |     ///
 321 |     /// This is useful for passing to APIs (like ash) that use `CStr`.
 322 |     ///
 323 |     /// # Safety
 324 |     ///
 325 |     /// This function by itself is safe as the pointer and length are guaranteed
 326 |     /// to be valid. All the same caveats for the use of the `CStr` as given in
 327 |     /// the `CStr` docs apply.
 328 |     pub fn as_cstr(&self) -> &CStr {
 329 |         unsafe {
 330 |             CStr::from_bytes_with_nul_unchecked(slice::from_raw_parts(
 331 |                 self.as_ptr(),
 332 |                 self.len() + 1,
 333 |             ))
 334 |         }
 335 |     }
 336 | 
 337 |     /// Get a reference to the `StringCacheEntry` stored before the string.
 338 |     #[inline]
 339 |     fn as_string_cache_entry(&self) -> &StringCacheEntry {
 340 |         // SAFETY: the allocator guarantees that the alignment is correct and
 341 |         // that this pointer is non-null
 342 |         unsafe { &*(self.char_ptr.as_ptr().cast::<StringCacheEntry>().sub(1)) }
 343 |     }
 344 | 
 345 |     /// Get the length (in bytes) of this string.
 346 |     #[inline]
 347 |     pub fn len(&self) -> usize {
 348 |         self.as_string_cache_entry().len
 349 |     }
 350 | 
 351 |     /// Returns true if the length is zero.
 352 |     pub fn is_empty(&self) -> bool {
 353 |         self.len() == 0
 354 |     }
 355 | 
 356 |     /// Get the precomputed hash for this string.
 357 |     #[inline]
 358 |     pub fn precomputed_hash(&self) -> u64 {
 359 |         self.as_string_cache_entry().hash
 360 |     }
 361 | 
 362 |     /// Get an owned String copy of this string.
 363 |     pub fn to_owned(&self) -> String {
 364 |         self.as_str().to_owned()
 365 |     }
 366 | }
 367 | 
 368 | // We're safe to impl these because the strings they reference are immutable
 369 | // and for all intents and purposes 'static since they're never deleted after
 370 | // being created
 371 | unsafe impl Send for Ustr {}
 372 | unsafe impl Sync for Ustr {}
 373 | 
 374 | impl PartialEq<str> for Ustr {
 375 |     fn eq(&self, other: &str) -> bool {
 376 |         self.as_str() == other
 377 |     }
 378 | }
 379 | 
 380 | impl PartialEq<Ustr> for str {
 381 |     fn eq(&self, u: &Ustr) -> bool {
 382 |         self == u.as_str()
 383 |     }
 384 | }
 385 | 
 386 | impl PartialEq<&str> for Ustr {
 387 |     fn eq(&self, other: &&str) -> bool {
 388 |         self.as_str() == *other
 389 |     }
 390 | }
 391 | 
 392 | impl PartialEq<Ustr> for &str {
 393 |     fn eq(&self, u: &Ustr) -> bool {
 394 |         *self == u.as_str()
 395 |     }
 396 | }
 397 | 
 398 | impl PartialEq<&&str> for Ustr {
 399 |     fn eq(&self, other: &&&str) -> bool {
 400 |         self.as_str() == **other
 401 |     }
 402 | }
 403 | 
 404 | impl PartialEq<Ustr> for &&str {
 405 |     fn eq(&self, u: &Ustr) -> bool {
 406 |         **self == u.as_str()
 407 |     }
 408 | }
 409 | 
 410 | impl PartialEq<String> for Ustr {
 411 |     fn eq(&self, other: &String) -> bool {
 412 |         self.as_str() == other
 413 |     }
 414 | }
 415 | 
 416 | impl PartialEq<Ustr> for String {
 417 |     fn eq(&self, u: &Ustr) -> bool {
 418 |         self == u.as_str()
 419 |     }
 420 | }
 421 | 
 422 | impl PartialEq<&String> for Ustr {
 423 |     fn eq(&self, other: &&String) -> bool {
 424 |         self.as_str() == *other
 425 |     }
 426 | }
 427 | 
 428 | impl PartialEq<Ustr> for &String {
 429 |     fn eq(&self, u: &Ustr) -> bool {
 430 |         *self == u.as_str()
 431 |     }
 432 | }
 433 | 
 434 | impl PartialEq<Box<str>> for Ustr {
 435 |     fn eq(&self, other: &Box<str>) -> bool {
 436 |         self.as_str() == &**other
 437 |     }
 438 | }
 439 | 
 440 | impl PartialEq<Ustr> for Box<str> {
 441 |     fn eq(&self, u: &Ustr) -> bool {
 442 |         &**self == u.as_str()
 443 |     }
 444 | }
 445 | 
 446 | impl PartialEq<Ustr> for &Box<str> {
 447 |     fn eq(&self, u: &Ustr) -> bool {
 448 |         &***self == u.as_str()
 449 |     }
 450 | }
 451 | 
 452 | impl PartialEq<Cow<'_, str>> for Ustr {
 453 |     fn eq(&self, other: &Cow<'_, str>) -> bool {
 454 |         self.as_str() == &*other
 455 |     }
 456 | }
 457 | 
 458 | impl PartialEq<Ustr> for Cow<'_, str> {
 459 |     fn eq(&self, u: &Ustr) -> bool {
 460 |         &*self == u.as_str()
 461 |     }
 462 | }
 463 | 
 464 | impl PartialEq<&Cow<'_, str>> for Ustr {
 465 |     fn eq(&self, other: &&Cow<'_, str>) -> bool {
 466 |         self.as_str() == &**other
 467 |     }
 468 | }
 469 | 
 470 | impl PartialEq<Ustr> for &Cow<'_, str> {
 471 |     fn eq(&self, u: &Ustr) -> bool {
 472 |         &**self == u.as_str()
 473 |     }
 474 | }
 475 | 
 476 | impl PartialEq<Ustr> for Path {
 477 |     fn eq(&self, u: &Ustr) -> bool {
 478 |         self == Path::new(u)
 479 |     }
 480 | }
 481 | 
 482 | impl PartialEq<Ustr> for &Path {
 483 |     fn eq(&self, u: &Ustr) -> bool {
 484 |         *self == Path::new(u)
 485 |     }
 486 | }
 487 | 
 488 | impl PartialEq<Ustr> for OsStr {
 489 |     fn eq(&self, u: &Ustr) -> bool {
 490 |         self == OsStr::new(u)
 491 |     }
 492 | }
 493 | 
 494 | impl PartialEq<Ustr> for &OsStr {
 495 |     fn eq(&self, u: &Ustr) -> bool {
 496 |         *self == OsStr::new(u)
 497 |     }
 498 | }
 499 | 
 500 | impl Eq for Ustr {}
 501 | 
 502 | impl<T: ?Sized> AsRef<T> for Ustr
 503 | where
 504 |     str: AsRef<T>,
 505 | {
 506 |     fn as_ref(&self) -> &T {
 507 |         self.as_str().as_ref()
 508 |     }
 509 | }
 510 | 
 511 | impl FromStr for Ustr {
 512 |     type Err = std::string::ParseError;
 513 | 
 514 |     #[inline]
 515 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
 516 |         Ok(Ustr::from(s))
 517 |     }
 518 | }
 519 | 
 520 | impl From<&str> for Ustr {
 521 |     fn from(s: &str) -> Ustr {
 522 |         Ustr::from(s)
 523 |     }
 524 | }
 525 | 
 526 | impl From<Ustr> for &'static str {
 527 |     fn from(s: Ustr) -> &'static str {
 528 |         s.as_str()
 529 |     }
 530 | }
 531 | 
 532 | impl From<Ustr> for String {
 533 |     fn from(u: Ustr) -> Self {
 534 |         String::from(u.as_str())
 535 |     }
 536 | }
 537 | 
 538 | impl From<Ustr> for Box<str> {
 539 |     fn from(u: Ustr) -> Self {
 540 |         Box::from(u.as_str())
 541 |     }
 542 | }
 543 | 
 544 | impl From<Ustr> for Rc<str> {
 545 |     fn from(u: Ustr) -> Self {
 546 |         Rc::from(u.as_str())
 547 |     }
 548 | }
 549 | 
 550 | impl From<Ustr> for Arc<str> {
 551 |     fn from(u: Ustr) -> Self {
 552 |         Arc::from(u.as_str())
 553 |     }
 554 | }
 555 | 
 556 | impl From<Ustr> for Cow<'static, str> {
 557 |     fn from(u: Ustr) -> Self {
 558 |         Cow::Borrowed(u.as_str())
 559 |     }
 560 | }
 561 | 
 562 | impl From<String> for Ustr {
 563 |     fn from(s: String) -> Ustr {
 564 |         Ustr::from(&s)
 565 |     }
 566 | }
 567 | 
 568 | impl From<&String> for Ustr {
 569 |     fn from(s: &String) -> Ustr {
 570 |         Ustr::from(&**s)
 571 |     }
 572 | }
 573 | 
 574 | impl From<Box<str>> for Ustr {
 575 |     fn from(s: Box<str>) -> Ustr {
 576 |         Ustr::from(&*s)
 577 |     }
 578 | }
 579 | 
 580 | impl From<Rc<str>> for Ustr {
 581 |     fn from(s: Rc<str>) -> Ustr {
 582 |         Ustr::from(&*s)
 583 |     }
 584 | }
 585 | 
 586 | impl From<Arc<str>> for Ustr {
 587 |     fn from(s: Arc<str>) -> Ustr {
 588 |         Ustr::from(&*s)
 589 |     }
 590 | }
 591 | 
 592 | impl From<Cow<'_, str>> for Ustr {
 593 |     fn from(s: Cow<'_, str>) -> Ustr {
 594 |         Ustr::from(&*s)
 595 |     }
 596 | }
 597 | 
 598 | impl Default for Ustr {
 599 |     fn default() -> Self {
 600 |         Ustr::from("")
 601 |     }
 602 | }
 603 | 
 604 | impl Deref for Ustr {
 605 |     type Target = str;
 606 |     fn deref(&self) -> &Self::Target {
 607 |         self.as_str()
 608 |     }
 609 | }
 610 | 
 611 | impl fmt::Display for Ustr {
 612 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 613 |         write!(f, "{}", self.as_str())
 614 |     }
 615 | }
 616 | 
 617 | impl fmt::Debug for Ustr {
 618 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 619 |         write!(f, "u!({:?})", self.as_str())
 620 |     }
 621 | }
 622 | 
 623 | // Just feed the precomputed hash into the Hasher. Note that this will of course
 624 | // be terrible unless the Hasher in question is expecting a precomputed hash.
 625 | impl Hash for Ustr {
 626 |     fn hash<H: Hasher>(&self, state: &mut H) {
 627 |         self.precomputed_hash().hash(state);
 628 |     }
 629 | }
 630 | 
 631 | /// DO NOT CALL THIS.
 632 | ///
 633 | /// Clears the cache -- used for benchmarking and testing purposes to clear the
 634 | /// cache. Calling this will invalidate any previously created `UStr`s and
 635 | /// probably cause your house to burn down. DO NOT CALL THIS.
 636 | ///
 637 | /// # Safety
 638 | ///
 639 | /// DO NOT CALL THIS.
 640 | #[doc(hidden)]
 641 | pub unsafe fn _clear_cache() {
 642 |     for m in STRING_CACHE.0.iter() {
 643 |         m.lock().clear();
 644 |     }
 645 | }
 646 | 
 647 | /// Returns the total amount of memory allocated and in use by the cache in
 648 | /// bytes.
 649 | pub fn total_allocated() -> usize {
 650 |     STRING_CACHE
 651 |         .0
 652 |         .iter()
 653 |         .map(|sc| {
 654 |             let t = sc.lock().total_allocated();
 655 | 
 656 |             t
 657 |         })
 658 |         .sum()
 659 | }
 660 | 
 661 | /// Returns the total amount of memory reserved by the cache in bytes.
 662 | pub fn total_capacity() -> usize {
 663 |     STRING_CACHE
 664 |         .0
 665 |         .iter()
 666 |         .map(|sc| {
 667 |             let t = sc.lock().total_capacity();
 668 |             t
 669 |         })
 670 |         .sum()
 671 | }
 672 | 
/// Create a new `Ustr` from the given `str`.
///
/// This is a free-function shorthand for `Ustr::from(s)`. Calling it twice
/// with equal strings yields equal `Ustr`s and only one cache entry, as the
/// example below shows.
///
/// # Examples
///
/// ```
/// use ustr::ustr;
/// # unsafe { ustr::_clear_cache() };
///
/// let u1 = ustr("the quick brown fox");
/// let u2 = ustr("the quick brown fox");
/// assert_eq!(u1, u2);
/// assert_eq!(ustr::num_entries(), 1);
/// ```
#[inline]
pub fn ustr(s: &str) -> Ustr {
    Ustr::from(s)
}
 690 | 
/// Create a new `Ustr` from the given `str` but only if it already exists in
/// the string cache.
///
/// This is a free-function shorthand for `Ustr::from_existing(s)`. It returns
/// `None` when the string has not been added to the cache yet.
///
/// # Examples
///
/// ```
/// use ustr::{ustr, existing_ustr};
/// # unsafe { ustr::_clear_cache() };
///
/// let u1 = existing_ustr("the quick brown fox");
/// let u2 = ustr("the quick brown fox");
/// let u3 = existing_ustr("the quick brown fox");
/// assert_eq!(u1, None);
/// assert_eq!(u3, Some(u2));
/// ```
#[inline]
pub fn existing_ustr(s: &str) -> Option<Ustr> {
    Ustr::from_existing(s)
}
 710 | 
/// Utility function to get a reference to the main cache object for use with
/// serialization.
///
/// The returned reference points at the global, lazily initialised cache.
///
/// # Examples
///
/// ```
/// # use ustr::{Ustr, ustr, ustr as u};
/// # #[cfg(feature="serde")]
/// # {
/// # unsafe { ustr::_clear_cache() };
/// ustr("Send me to JSON and back");
/// let json = serde_json::to_string(ustr::cache()).unwrap();
/// # }
/// ```
pub fn cache() -> &'static Bins {
    &STRING_CACHE
}
 727 | 
 728 | /// Returns the number of unique strings in the cache.
 729 | ///
 730 | /// This may be an underestimate if other threads are writing to the cache
 731 | /// concurrently.
 732 | ///
 733 | /// # Examples
 734 | ///
 735 | /// ```
 736 | /// use ustr::ustr as u;
 737 | ///
 738 | /// let _ = u("Hello");
 739 | /// let _ = u(", World!");
 740 | /// assert_eq!(ustr::num_entries(), 2);
 741 | /// ```
 742 | pub fn num_entries() -> usize {
 743 |     STRING_CACHE
 744 |         .0
 745 |         .iter()
 746 |         .map(|sc| {
 747 |             let t = sc.lock().num_entries();
 748 |             t
 749 |         })
 750 |         .sum()
 751 | }
 752 | 
 753 | #[doc(hidden)]
 754 | pub fn num_entries_per_bin() -> Vec<usize> {
 755 |     STRING_CACHE
 756 |         .0
 757 |         .iter()
 758 |         .map(|sc| {
 759 |             let t = sc.lock().num_entries();
 760 |             t
 761 |         })
 762 |         .collect::<Vec<_>>()
 763 | }
 764 | 
 765 | /// Return an iterator over the entire string cache.
 766 | ///
 767 | /// If another thread is adding strings concurrently to this call then they
 768 | /// might not show up in the view of the cache presented by this iterator.
 769 | ///
 770 | /// # Safety
 771 | ///
 772 | /// This returns an iterator to the state of the cache at the time when
 773 | /// `string_cache_iter()` was called. It is of course possible that another
 774 | /// thread will add more strings to the cache after this, but since we never
 775 | /// destroy the strings, they remain valid, meaning it's safe to iterate over
 776 | /// them, the list just might not be completely up to date.
 777 | pub fn string_cache_iter() -> StringCacheIterator {
 778 |     let mut allocs = Vec::new();
 779 |     for m in STRING_CACHE.0.iter() {
 780 |         let sc = m.lock();
 781 |         // the start of the allocator's data is actually the ptr, start() just
 782 |         // points to the beginning of the allocated region. The first bytes will
 783 |         // be uninitialized since we're bumping down
 784 |         for a in &sc.old_allocs {
 785 |             allocs.push((a.ptr(), a.end()));
 786 |         }
 787 |         let ptr = sc.alloc.ptr();
 788 |         let end = sc.alloc.end();
 789 |         if ptr != end {
 790 |             allocs.push((sc.alloc.ptr(), sc.alloc.end()));
 791 |         }
 792 |     }
 793 | 
 794 |     let current_ptr =
 795 |         allocs.first().map(|s| s.0).unwrap_or_else(std::ptr::null);
 796 | 
 797 |     StringCacheIterator {
 798 |         allocs,
 799 |         current_alloc: 0,
 800 |         current_ptr,
 801 |     }
 802 | }
 803 | 
/// The type used for the global string cache.
///
/// This is exposed to allow e.g. serialization of the data returned by the
/// [`cache()`] function.
///
/// It wraps a fixed-size array of `NUM_BINS` independently locked
/// `StringCache` shards.
// `repr(transparent)` matters: the global cache is built by transmuting an
// array of initialised `Mutex<StringCache>` into `Bins`.
#[repr(transparent)]
pub struct Bins(pub(crate) [Mutex<StringCache>; NUM_BINS]);
 810 | 
// Global lock taken by the unit tests. They all share the one global string
// cache and several of them clear it, so they must not run concurrently.
#[cfg(test)]
lazy_static::lazy_static! {
    static ref TEST_LOCK: Mutex<()> = Mutex::new(());
}
 815 | 
 816 | #[cfg(test)]
 817 | mod tests {
 818 |     use super::TEST_LOCK;
 819 |     use lazy_static::lazy_static;
 820 |     use std::ffi::OsStr;
 821 |     use std::path::Path;
 822 |     use std::sync::Mutex;
 823 | 
 824 |     #[test]
 825 |     fn it_works() {
 826 |         let _t = TEST_LOCK.lock();
 827 |         use super::ustr as u;
 828 | 
 829 |         let u_hello = u("hello");
 830 |         assert_eq!(u_hello, "hello");
 831 |         let u_world = u("world");
 832 |         assert_eq!(u_world, String::from("world"));
 833 |     }
 834 | 
 835 |     #[test]
 836 |     fn empty_string() {
 837 |         let _t = TEST_LOCK.lock();
 838 |         use super::ustr as u;
 839 | 
 840 |         unsafe {
 841 |             super::_clear_cache();
 842 |         }
 843 | 
 844 |         let _empty = u("");
 845 |         let empty = u("");
 846 | 
 847 |         assert!(empty.as_str().is_empty());
 848 |         assert_eq!(super::num_entries(), 1);
 849 |     }
 850 | 
 851 |     #[test]
 852 |     fn c_str_works() {
 853 |         let _t = TEST_LOCK.lock();
 854 |         use super::ustr as u;
 855 |         use std::ffi::CStr;
 856 | 
 857 |         let s_fox = "The quick brown fox jumps over the lazy dog.";
 858 |         let u_fox = u(s_fox);
 859 |         let fox = unsafe { CStr::from_ptr(u_fox.as_char_ptr()) }
 860 |             .to_string_lossy()
 861 |             .into_owned();
 862 |         assert_eq!(fox, s_fox);
 863 | 
 864 |         let s_odys = "Τη γλώσσα μου έδωσαν ελληνική";
 865 |         let u_odys = u(s_odys);
 866 |         let odys = unsafe { CStr::from_ptr(u_odys.as_char_ptr()) }
 867 |             .to_string_lossy()
 868 |             .into_owned();
 869 |         assert_eq!(odys, s_odys);
 870 |     }
 871 | 
 872 |     #[test]
 873 |     // We have to disable miri here as it's far too slow unfortunately
 874 |     #[cfg_attr(miri, ignore)]
 875 |     fn blns() {
 876 |         let _t = TEST_LOCK.lock();
 877 |         use super::{string_cache_iter, ustr as u};
 878 |         use std::collections::HashSet;
 879 | 
 880 |         // clear the cache first or our results will be wrong
 881 |         unsafe { super::_clear_cache() };
 882 | 
 883 |         // let path =
 884 |         // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
 885 |         //     .join("data")
 886 |         //     .join("blns.txt");
 887 |         // let blns = std::fs::read_to_string(path).unwrap();
 888 |         let blns = include_str!("../data/blns.txt");
 889 | 
 890 |         let mut hs = HashSet::new();
 891 |         for s in blns.split_whitespace() {
 892 |             hs.insert(s);
 893 |         }
 894 | 
 895 |         let mut us = Vec::new();
 896 |         let mut ss = Vec::new();
 897 | 
 898 |         for s in blns.split_whitespace().cycle().take(100_000) {
 899 |             let u = u(s);
 900 |             us.push(u);
 901 |             ss.push(s.to_owned());
 902 |         }
 903 | 
 904 |         let mut hs_u = HashSet::new();
 905 |         for s in string_cache_iter() {
 906 |             hs_u.insert(s);
 907 |         }
 908 |         let diff: HashSet<_> = hs.difference(&hs_u).collect();
 909 | 
 910 |         // check that the number of entries is the same
 911 |         assert_eq!(super::num_entries(), hs.len());
 912 | 
 913 |         // check that we have the exact same (unique) strings in the cache as in
 914 |         // the source data
 915 |         assert_eq!(diff.len(), 0);
 916 | 
 917 |         let nbs = super::num_entries_per_bin();
 918 |         println!("{:?}", nbs);
 919 | 
 920 |         println!("Total allocated: {}", super::total_allocated());
 921 |         println!("Total capacity: {}", super::total_capacity());
 922 | 
 923 |         println!(
 924 |             "size of StringCache: {}",
 925 |             std::mem::size_of::<super::StringCache>()
 926 |         );
 927 |     }
 928 | 
 929 |     #[test]
 930 |     // We have to disable miri here as it's far too slow unfortunately
 931 |     #[cfg_attr(miri, ignore)]
 932 |     fn raft() {
 933 |         let _t = TEST_LOCK.lock();
 934 |         use super::ustr as u;
 935 |         use std::sync::Arc;
 936 | 
 937 |         // let path =
 938 |         // std::path::Path::new(&std::env::var("CARGO_MANIFEST_DIR").unwrap())
 939 |         //     .join("data")
 940 |         //     .join("raft-large-directories.txt");
 941 |         // let raft = std::fs::read_to_string(path).unwrap();
 942 |         let raft = include_str!("../data/raft-large-directories.txt");
 943 |         let raft = Arc::new(
 944 |             raft.split_whitespace()
 945 |                 .collect::<Vec<_>>()
 946 |                 .chunks(3)
 947 |                 .map(|s| {
 948 |                     if s.len() == 3 {
 949 |                         format!("{}/{}/{}", s[0], s[1], s[2])
 950 |                     } else {
 951 |                         s[0].to_owned()
 952 |                     }
 953 |                 })
 954 |                 .collect::<Vec<_>>(),
 955 |         );
 956 | 
 957 |         let s = raft.clone();
 958 |         for _ in 0..600 {
 959 |             let mut v = Vec::with_capacity(20_000);
 960 |             unsafe { super::_clear_cache() };
 961 |             for s in s.iter().cycle().take(20_000) {
 962 |                 v.push(u(s));
 963 |             }
 964 |         }
 965 |     }
 966 | 
 967 |     // This test is to have miri check the allocation code paths, but miri
 968 |     // can't open files so it's not usable right now
 969 |     // #[test]
 970 |     // fn words() {
 971 |     //     let _t = TEST_LOCK.lock();
 972 |     //     use super::ustr as u;
 973 |     //     use std::sync::Arc;
 974 | 
 975 |     //     let path = std::path::Path::new("/usr/share/dict/words");
 976 |     //     let wordlist = std::fs::read_to_string(path).unwrap();
 977 |     //     let wordlist = Arc::new(
 978 |     //         wordlist
 979 |     //             .split_whitespace()
 980 |     //             .collect::<Vec<_>>()
 981 |     //             .chunks(7)
 982 |     //             .cycle()
 983 |     //             .take(4_000_000)
 984 |     //             .enumerate()
 985 |     //             .map(|(i, s)| u(&format!("{}{}", i, s.join("-"))))
 986 |     //             .collect::<Vec<_>>(),
 987 |     //     );
 988 |     // }
 989 | 
 990 |     #[cfg(all(feature = "serde", not(miri)))]
 991 |     #[test]
 992 |     fn serialization() {
 993 |         let _t = TEST_LOCK.lock();
 994 |         use super::{string_cache_iter, ustr as u};
 995 |         use std::collections::HashSet;
 996 | 
 997 |         // clear the cache first or our results will be wrong
 998 |         unsafe { super::_clear_cache() };
 999 | 
1000 |         let path = std::path::Path::new(
1001 |             &std::env::var("CARGO_MANIFEST_DIR")
1002 |                 .expect("CARGO_MANIFEST_DIR not set"),
1003 |         )
1004 |         .join("data")
1005 |         .join("blns.txt");
1006 |         let blns = std::fs::read_to_string(path).unwrap();
1007 | 
1008 |         let mut hs = HashSet::new();
1009 |         for s in blns.split_whitespace() {
1010 |             hs.insert(s);
1011 |         }
1012 | 
1013 |         let mut us = Vec::new();
1014 |         let mut ss = Vec::new();
1015 | 
1016 |         for s in blns.split_whitespace().cycle().take(100_000) {
1017 |             let u = u(s);
1018 |             us.push(u);
1019 |             ss.push(s.to_owned());
1020 |         }
1021 | 
1022 |         let json = serde_json::to_string(super::cache()).unwrap();
1023 |         unsafe {
1024 |             super::_clear_cache();
1025 |         }
1026 |         let _: super::DeserializedCache = serde_json::from_str(&json).unwrap();
1027 | 
1028 |         // now check that we've got the same data in the cache still
1029 |         let mut hs_u = HashSet::new();
1030 |         for s in string_cache_iter() {
1031 |             hs_u.insert(s);
1032 |         }
1033 |         let diff: HashSet<_> = hs.difference(&hs_u).collect();
1034 | 
1035 |         // check that the number of entries is the same
1036 |         assert_eq!(super::num_entries(), hs.len());
1037 | 
1038 |         // check that we have the exact same (unique) strings in the cache as in
1039 |         // the source data
1040 |         assert_eq!(diff.len(), 0);
1041 |     }
1042 | 
1043 |     #[cfg(all(feature = "serde", not(miri)))]
1044 |     #[test]
1045 |     fn serialization_ustr() {
1046 |         let _t = TEST_LOCK.lock();
1047 | 
1048 |         use super::{ustr, Ustr};
1049 | 
1050 |         let u_hello = ustr("hello");
1051 | 
1052 |         let json = serde_json::to_string(&u_hello).unwrap();
1053 |         let me_hello: Ustr = serde_json::from_str(&json).unwrap();
1054 | 
1055 |         assert_eq!(u_hello, me_hello);
1056 |     }
1057 | 
1058 |     #[test]
1059 |     fn partial_ord() {
1060 |         let _t = TEST_LOCK.lock();
1061 |         use super::ustr;
1062 |         let str_a = ustr("aaa");
1063 |         let str_z = ustr("zzz");
1064 |         let str_k = ustr("kkk");
1065 |         assert!(str_a < str_k);
1066 |         assert!(str_k < str_z);
1067 |     }
1068 | 
1069 |     #[test]
1070 |     fn ord() {
1071 |         let _t = TEST_LOCK.lock();
1072 |         use super::ustr;
1073 |         let u_apple = ustr("apple");
1074 |         let u_bravo = ustr("bravo");
1075 |         let u_charlie = ustr("charlie");
1076 |         let u_delta = ustr("delta");
1077 | 
1078 |         let mut v = vec![u_delta, u_bravo, u_charlie, u_apple];
1079 |         v.sort();
1080 |         assert_eq!(v, vec![u_apple, u_bravo, u_charlie, u_delta]);
1081 |     }
1082 | 
1083 |     fn takes_into_str<'a, S: Into<&'a str>>(s: S) -> &'a str {
1084 |         s.into()
1085 |     }
1086 | 
1087 |     #[test]
1088 |     fn test_into_str() {
1089 |         let _t = TEST_LOCK.lock();
1090 |         use super::ustr;
1091 | 
1092 |         assert_eq!("converted", takes_into_str(ustr("converted")));
1093 |     }
1094 | 
1095 |     #[test]
1096 |     fn test_existing_ustr() {
1097 |         let _t = TEST_LOCK.lock();
1098 |         use super::{existing_ustr, ustr};
1099 |         assert_eq!(existing_ustr("hello world!"), None);
1100 |         let s1 = ustr("hello world!");
1101 |         let s2 = existing_ustr("hello world!");
1102 |         assert_eq!(Some(s1), s2);
1103 |     }
1104 | 
1105 |     #[test]
1106 |     fn test_empty_cache() {
1107 |         unsafe { super::_clear_cache() };
1108 |         assert_eq!(
1109 |             super::string_cache_iter().collect::<Vec<_>>(),
1110 |             Vec::<&'static str>::new()
1111 |         );
1112 |     }
1113 | 
1114 |     #[test]
1115 |     fn as_refs() {
1116 |         let _t = TEST_LOCK.lock();
1117 | 
1118 |         let u = super::ustr("test");
1119 | 
1120 |         let s: String = u.to_owned();
1121 |         assert_eq!(u, s);
1122 |         assert_eq!(s, u);
1123 | 
1124 |         let p: &Path = u.as_ref();
1125 |         assert_eq!(p, u);
1126 | 
1127 |         let _: &[u8] = u.as_ref();
1128 | 
1129 |         let o: &OsStr = u.as_ref();
1130 |         assert_eq!(p, o);
1131 |         assert_eq!(o, p);
1132 | 
1133 |         let cow = std::borrow::Cow::from(u);
1134 |         assert_eq!(cow, u);
1135 |         assert_eq!(u, cow);
1136 | 
1137 |         let boxed: Box<str> = u.into();
1138 |         assert_eq!(boxed, u);
1139 |     }
1140 | }
1141 | 
// The global string cache: `NUM_BINS` independently locked `StringCache`
// shards, built lazily on first access.
lazy_static::lazy_static! {
    static ref STRING_CACHE: Bins = {
        use std::mem::{self, MaybeUninit};
        // This deeply unsafe feeling dance allows us to initialize an array of
        // arbitrary size and will have to tide us over until const generics
        // land. See:
        // https://doc.rust-lang.org/beta/std/mem/union.MaybeUninit.html#initializing-an-array-element-by-element

        // Create an uninitialized array of `MaybeUninit`. The `assume_init` is
        // safe because the type we are claiming to have initialized here is a
        // bunch of `MaybeUninit`s, which do not require initialization.
        let mut bins: [MaybeUninit<Mutex<StringCache>>; NUM_BINS] = unsafe {
            MaybeUninit::uninit().assume_init()
        };

        // Dropping a `MaybeUninit` does nothing, so overwriting each slot by
        // plain assignment (rather than `ptr::write`) does not cause the old
        // uninitialized value to be dropped. Also if there is a panic during
        // this loop, we have a memory leak, but there is no memory safety
        // issue.
        for bin in &mut bins[..] {
            *bin = MaybeUninit::new(Mutex::new(StringCache::default()));
        }

        // Everything is initialized. Transmute the array to the
        // initialized type. `Bins` is `repr(transparent)` over the array of
        // mutexes, so it has the same layout as the array itself.
        unsafe { mem::transmute::<_, Bins>(bins) }
    };
}
1171 | 
1172 | // Use the top bits of the hash to choose a bin
1173 | #[inline]
1174 | fn whichbin(hash: u64) -> usize {
1175 |     ((hash >> TOP_SHIFT as u64) % NUM_BINS as u64) as usize
1176 | }
1177 | 


--------------------------------------------------------------------------------
/src/serialization.rs:
--------------------------------------------------------------------------------
  1 | use super::*;
  2 | use serde::{
  3 |     de::{Deserialize, Deserializer, Error, SeqAccess, Visitor},
  4 |     ser::{Serialize, SerializeSeq, Serializer},
  5 | };
  6 | 
  7 | impl Serialize for Bins {
  8 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
  9 |     where
 10 |         S: Serializer,
 11 |     {
 12 |         let strings: Vec<&'static str> = string_cache_iter().collect();
 13 |         let mut seq = serializer.serialize_seq(Some(strings.len()))?;
 14 |         for s in strings {
 15 |             match seq.serialize_element(s) {
 16 |                 Ok(_) => (),
 17 |                 Err(e) => {
 18 |                     panic!("Error serializing \"{}\": {}", s, e);
 19 |                 }
 20 |             }
 21 |         }
 22 |         seq.end()
 23 |     }
 24 | }
 25 | 
 26 | pub struct BinsVisitor {}
 27 | 
 28 | impl BinsVisitor {
 29 |     #[allow(clippy::new_without_default)]
 30 |     pub fn new() -> Self {
 31 |         BinsVisitor {}
 32 |     }
 33 | }
 34 | 
 35 | impl<'de> Visitor<'de> for BinsVisitor {
 36 |     type Value = DeserializedCache;
 37 | 
 38 |     fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
 39 |         formatter.write_str("a sequence of strings")
 40 |     }
 41 | 
 42 |     fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
 43 |     where
 44 |         A: SeqAccess<'de>,
 45 |     {
 46 |         while let Some(s) = seq.next_element::<String>()? {
 47 |             ustr(&s);
 48 |         }
 49 | 
 50 |         Ok(DeserializedCache {})
 51 |     }
 52 | }
 53 | 
 54 | pub struct DeserializedCache {}
 55 | 
 56 | impl<'de> Deserialize<'de> for DeserializedCache {
 57 |     fn deserialize<D>(deserializer: D) -> Result<DeserializedCache, D::Error>
 58 |     where
 59 |         D: Deserializer<'de>,
 60 |     {
 61 |         deserializer.deserialize_seq(BinsVisitor::new())
 62 |     }
 63 | }
 64 | 
 65 | pub struct UstrVisitor {}
 66 | impl UstrVisitor {
 67 |     #[allow(clippy::new_without_default)]
 68 |     pub fn new() -> Self {
 69 |         UstrVisitor {}
 70 |     }
 71 | }
 72 | 
 73 | impl<'de> Visitor<'de> for UstrVisitor {
 74 |     type Value = Ustr;
 75 | 
 76 |     fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
 77 |         formatter.write_str("a &str")
 78 |     }
 79 | 
 80 |     fn visit_str<E>(self, s: &str) -> Result<Self::Value, E>
 81 |     where
 82 |         E: Error,
 83 |     {
 84 |         Ok(Ustr::from(s))
 85 |     }
 86 | }
 87 | 
 88 | impl<'de> Deserialize<'de> for Ustr {
 89 |     fn deserialize<D>(deserializer: D) -> Result<Ustr, D::Error>
 90 |     where
 91 |         D: Deserializer<'de>,
 92 |     {
 93 |         deserializer.deserialize_str(UstrVisitor::new())
 94 |     }
 95 | }
 96 | 
 97 | impl Serialize for Ustr {
 98 |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
 99 |     where
100 |         S: Serializer,
101 |     {
102 |         serializer.serialize_str(self.as_str())
103 |     }
104 | }
105 | 


--------------------------------------------------------------------------------
/src/stringcache.rs:
--------------------------------------------------------------------------------
  1 | use super::bumpalloc::LeakyBumpAlloc;
  2 | 
// `StringCache` stores a `Vec` of pointers to the `StringCacheEntry` structs.
// The actual memory for the `StringCacheEntry` is stored in the LeakyBumpAlloc,
// and each `Alloc` is rotated out when it's full and a new one twice its size
// is allocated. The Allocator memory is never freed so our strings essentially
// have a 'static lifetime.
//
// The actual memory representation is as follows. Each `StringCacheEntry` is
// aligned to 8 bytes on a 64-bit system. The 64-bit memoized hash of the string
// is stored first, then a usize length, then the u8 characters, followed by a
// null terminator (not included in len), then x<8 bytes of uninitialized memory
// as padding before the next aligned entry.
//
//       hash             len       H e l l o , W o r l d !\0
// |. . . . . . . .|. . . . . . . .|. . . . . . . .|. . . . . . . .|
// 0               8               16                     len
// ^ StringCacheEntry              ^ u8 chars               ^ null ^ Next entry
//
// Proper alignment is guaranteed when allocating each entry as the alignment
// is baked into the allocator. `StringCache` is responsible for monitoring the
// Allocator and creating a new one when it would overflow -- the `Alloc` itself
// will just `abort()` if it runs out of memory. Note that we abort() rather
// than panic because the behaviour of the spinlock in case of a panic while
// holding the lock is undefined.
//
// Thread safety is ensured because we can only access the `StringCache` through
// the spinlock in the `lazy_static` ref. The initial capacity of the cache is
// divided evenly among a number of 'bins' or shards each with their own lock,
// in order to reduce contention.
#[repr(align(128))]
pub(crate) struct StringCache {
    // Allocator that new entries are currently carved out of.
    pub(crate) alloc: LeakyBumpAlloc,
    // Allocators that have been rotated out; kept so their entries can still
    // be iterated.
    pub(crate) old_allocs: Vec<LeakyBumpAlloc>,
    // Hash table of pointers to entry headers; a null pointer marks an empty
    // slot.
    entries: Vec<*mut StringCacheEntry>,
    num_entries: usize,
    // Table length minus one; ANDed with a hash or probe position to get a
    // slot index.
    mask: usize,
    total_allocated: usize,
    // Padding and aligning to 128 bytes gives up to 20% performance
    // improvement this actually aligns to 256 bytes because of the Mutex
    // around it.
    _pad: [u32; 3],
}
 44 | 
// TODO: make these configurable?
// Initial size of the StringCache table, in entries, summed over all bins
// (each bin starts with `INITIAL_CAPACITY / NUM_BINS` slots).
pub(crate) const INITIAL_CAPACITY: usize = 1 << 20;
// Initial size of the allocator storage (in bytes), summed over all bins
// (each bin starts with `INITIAL_ALLOC / NUM_BINS` bytes).
pub(crate) const INITIAL_ALLOC: usize = 4 << 20;
// Number of bins (shards) for map, expressed as a power of two.
pub(crate) const BIN_SHIFT: usize = 6;
pub(crate) const NUM_BINS: usize = 1 << BIN_SHIFT;
// Shift for top bits to determine bin a hash falls into.
// NOTE(review): this is derived from the width of `usize` while hashes are
// `u64`; on a 32-bit target the shift selects bits 26.. of the hash rather
// than its top bits -- confirm this is intended.
pub(crate) const TOP_SHIFT: usize =
    8 * std::mem::size_of::<usize>() - BIN_SHIFT;
 56 | 
 57 | impl StringCache {
 58 |     /// Create a new StringCache with the given starting capacity
 59 |     pub fn new() -> StringCache {
 60 |         let capacity = INITIAL_CAPACITY / NUM_BINS;
 61 |         let alloc = LeakyBumpAlloc::new(
 62 |             INITIAL_ALLOC / NUM_BINS,
 63 |             std::mem::align_of::<StringCacheEntry>(),
 64 |         );
 65 |         StringCache {
 66 |             // Current allocator.
 67 |             alloc,
 68 |             // Old allocators we'll keep around for iteration purposes.
 69 |             // 16 would mean we've allocated 128GB of string storage since we
 70 |             // double each time.
 71 |             old_allocs: Vec::with_capacity(16),
 72 |             // Vector of pointers to the `StringCacheEntry` headers.
 73 |             entries: vec![std::ptr::null_mut(); capacity],
 74 |             num_entries: 0,
 75 |             mask: capacity - 1,
 76 |             total_allocated: capacity,
 77 |             _pad: [0u32; 3],
 78 |         }
 79 |     }
 80 | 
 81 |     pub(crate) fn get_existing(
 82 |         &self,
 83 |         string: &str,
 84 |         hash: u64,
 85 |     ) -> Option<*const u8> {
 86 |         let mut pos = self.mask & hash as usize;
 87 |         let mut dist = 0;
 88 |         loop {
 89 |             let entry = unsafe { self.entries.get_unchecked(pos) };
 90 |             if entry.is_null() {
 91 |                 return None;
 92 |             }
 93 |             // This is safe as long as entry points to a valid address and the
 94 |             // layout described in the `StringCache` doc comment holds.
 95 |             unsafe {
 96 |                 // entry is a `*StringCacheEntry` so offseting by 1 gives us a
 97 |                 // pointer to the end of the entry, aka the beginning of the
 98 |                 // chars.
 99 |                 // As long as the memory is valid and the layout is correct,
100 |                 // we're safe to create a string slice from the chars since
101 |                 // they were copied directly from a valid `str`.
102 |                 let entry_chars = entry.add(1) as *const u8;
103 |                 // if entry is non-null then it must point to a valid
104 |                 // StringCacheEntry
105 |                 let sce = &**entry;
106 |                 if sce.hash == hash
107 |                     && sce.len == string.len()
108 |                     && std::str::from_utf8_unchecked(
109 |                         std::slice::from_raw_parts(entry_chars, sce.len),
110 |                     ) == string
111 |                 {
112 |                     // found matching string in the cache already, return it
113 |                     return Some(entry_chars);
114 |                 }
115 |             }
116 | 
117 |             // Keep looking.
118 |             dist += 1;
119 |             debug_assert!(dist <= self.mask);
120 |             pos = (pos + dist) & self.mask;
121 |         }
122 |     }
123 | 
    // Insert the given string with its given hash into the cache, or find the
    // copy that is already stored.
    //
    // The entry table is probed starting at `hash & mask`. If a slot holds an
    // entry with the same hash, the same length and the same bytes, a pointer
    // to that entry's characters is returned and nothing is allocated.
    // Otherwise the first empty slot on the probe sequence receives a freshly
    // allocated entry: a `StringCacheEntry` header followed directly by the
    // string's bytes and a trailing null byte.
    //
    // Returns a pointer to the first byte of the stored characters.
    //
    // Panics if `alloc_size + allocated` or `capacity * 2` overflows `usize`.
    pub(crate) fn insert(&mut self, string: &str, hash: u64) -> *const u8 {
        // Start probing at the slot selected by the low bits of the hash.
        let mut pos = self.mask & hash as usize;
        let mut dist = 0;
        loop {
            // `pos` is always &ed with the mask, so this is in bounds as long
            // as the table holds `mask + 1` slots (which is how `grow` sizes
            // it).
            let entry = unsafe { self.entries.get_unchecked(pos) };
            if entry.is_null() {
                // found empty slot to insert
                break;
            }

            // This is safe as long as entry points to a valid address and the
            // layout described in the `StringCache` doc comment holds.
            unsafe {
                // entry is a `*StringCacheEntry` so offsetting by 1 gives us a
                // pointer to the end of the entry, aka the beginning of the
                // chars.
                // As long as the memory is valid and the layout is correct,
                // we're safe to create a string slice from the chars since
                // they were copied directly from a valid `str`.
                let entry_chars = entry.add(1) as *const u8;
                // If entry is non-null then it must point to a valid
                // `StringCacheEntry`.
                let sce = &**entry;
                // Compare the cheap fields (hash, length) first; the byte
                // comparison only runs when both already match.
                if sce.hash == hash
                    && sce.len == string.len()
                    && std::str::from_utf8_unchecked(
                        std::slice::from_raw_parts(entry_chars, sce.len),
                    ) == string
                {
                    // found matching string in the cache already, return it
                    return entry_chars;
                }
            }

            // keep looking: the step grows by one on every miss, so the
            // offsets from the home slot are 1, 3, 6, 10, ...
            dist += 1;
            debug_assert!(dist <= self.mask);
            pos = (pos + dist) & self.mask;
        }

        //
        // Insert the new string.
        //

        // We know pos is in bounds as it's &ed with the mask above.
        let entry_ptr = unsafe { self.entries.get_unchecked_mut(pos) };
        // Add one to length for null byte.
        // There's no way we could overflow here in practice since that would
        // require having allocated a `u64::MAX`-length string, by which time
        // we'll be using 128-bit pointers and we'll need to rewrite this
        // crate anyway.
        let byte_len = string.len() + 1;
        let alloc_size = std::mem::size_of::<StringCacheEntry>() + byte_len;

        // If our new allocation would spill over the allocator, make a new
        // allocator and retire the old one into `old_allocs`. The new
        // allocator gets double the old capacity, or `alloc_size` if that is
        // larger.
        let capacity = self.alloc.capacity();
        let allocated = self.alloc.allocated();
        if alloc_size
            .checked_add(allocated)
            .expect("overflowed alloc_size + allocated")
            > capacity
        {
            let new_capacity = capacity
                .checked_mul(2)
                .expect("capacity * 2 overflowed")
                .max(alloc_size);
            let old_alloc = std::mem::replace(
                &mut self.alloc,
                LeakyBumpAlloc::new(
                    new_capacity,
                    std::mem::align_of::<StringCacheEntry>(),
                ),
            );
            self.old_allocs.push(old_alloc);
            self.total_allocated += new_capacity;
        }

        // This is safe as long as:
        // 1. `alloc_size` is calculated correctly.
        // 2. there is enough space in the allocator (checked in the block
        //    above).
        // 3. The `StringCacheEntry` layout described above holds and the
        //    memory returned by allocate() is properly aligned.
        unsafe {
            // Store the new entry's address in the empty table slot found by
            // the probe loop.
            *entry_ptr =
                self.alloc.allocate(alloc_size) as *mut StringCacheEntry;

            // Write the header.
            // `entry_ptr` is guaranteed to point to a valid `StringCacheEntry`,
            // or `alloc.allocate()` would have aborted.
            std::ptr::write(
                *entry_ptr,
                StringCacheEntry {
                    hash,
                    len: string.len(),
                },
            );
            // Write the characters after the `StringCacheEntry`.
            // `entry_ptr` is a reference to the table slot; the method call
            // goes through to the `*mut StringCacheEntry` stored in it, so
            // this is the address one header past the start of the entry.
            let char_ptr = entry_ptr.add(1) as *mut u8;
            std::ptr::copy_nonoverlapping(
                string.as_bytes().as_ptr(),
                char_ptr,
                string.len(),
            );
            // Write the trailing null.
            let write_ptr = char_ptr.add(string.len());
            std::ptr::write(write_ptr, 0u8);

            self.num_entries += 1;
            // We want to keep an 0.5 load factor for the map, so grow if we've
            // exceeded that.
            if self.num_entries * 2 > self.mask {
                self.grow();
            }

            // Growing only rebuilds the table of pointers; `char_ptr` points
            // into allocator memory, which `grow` does not touch.
            char_ptr
        }
    }
244 | 
245 |     // Double the size of the map storage.
246 |     //
247 |     // This is safe as long as:
248 |     // - The in-memory layout of the `StringCacheEntry` is correct.
249 |     //
250 |     // If there's not enough memory for the new entry table, it will just abort
251 |     pub(crate) unsafe fn grow(&mut self) {
252 |         let new_mask = self.mask * 2 + 1;
253 | 
254 |         let mut new_entries: std::vec::Vec<*mut StringCacheEntry> =
255 |             vec![std::ptr::null_mut(); new_mask + 1];
256 | 
257 |         // copy the existing map into the new map
258 |         let mut to_copy = self.num_entries;
259 |         for e in self.entries.iter_mut() {
260 |             if e.is_null() {
261 |                 continue;
262 |             }
263 | 
264 |             // Start of the entry is the hash.
265 |             let hash = *(*e as *const u64);
266 |             let mut pos = (hash as usize) & new_mask;
267 |             let mut dist = 0;
268 |             loop {
269 |                 if new_entries[pos].is_null() {
270 |                     // Here's an empty slot to put the pointer in.
271 |                     break;
272 |                 }
273 | 
274 |                 dist += 1;
275 |                 // This should be impossble as we've allocated twice as many
276 |                 // slots as we have entries.
277 |                 debug_assert!(dist <= new_mask, "Probing wrapped around");
278 |                 pos = pos.wrapping_add(dist) & new_mask;
279 |             }
280 | 
281 |             new_entries[pos] = *e;
282 |             to_copy -= 1;
283 |             if to_copy == 0 {
284 |                 break;
285 |             }
286 |         }
287 | 
288 |         self.entries = new_entries;
289 |         self.mask = new_mask;
290 |     }
291 | 
292 |     // This is only called by `clear()` during tests to clear the cache between
293 |     // runs. **DO NOT CALL THIS**.
294 |     pub(crate) unsafe fn clear(&mut self) {
295 |         // just zero all the pointers that have already been set
296 |         std::ptr::write_bytes(self.entries.as_mut_ptr(), 0, self.mask + 1);
297 |         self.num_entries = 0;
298 |         self.total_allocated = 0;
299 |         for a in self.old_allocs.iter_mut() {
300 |             a.clear();
301 |         }
302 |         self.old_allocs = Vec::new();
303 |         self.alloc.clear();
304 |         self.alloc = LeakyBumpAlloc::new(
305 |             INITIAL_ALLOC / NUM_BINS,
306 |             std::mem::align_of::<StringCacheEntry>(),
307 |         );
308 |     }
309 | 
310 |     pub(crate) fn total_allocated(&self) -> usize {
311 |         self.alloc.allocated()
312 |             + self.old_allocs.iter().map(|a| a.allocated()).sum::<usize>()
313 |     }
314 | 
315 |     pub(crate) fn total_capacity(&self) -> usize {
316 |         self.alloc.capacity()
317 |             + self.old_allocs.iter().map(|a| a.capacity()).sum::<usize>()
318 |     }
319 | 
    // Number of strings currently stored in this cache. The counter is bumped
    // once per newly stored string by `insert` and reset to zero by `clear`.
    pub(crate) fn num_entries(&self) -> usize {
        self.num_entries
    }
323 | }
324 | 
325 | impl Default for StringCache {
326 |     fn default() -> StringCache {
327 |         StringCache::new()
328 |     }
329 | }
330 | 
// We are safe to be `Send` but not `Sync` (we get Sync by wrapping in a mutex).
//
// The impl has to be written by hand because the entry table stores raw
// `*mut StringCacheEntry` pointers, and raw pointers are never `Send`
// automatically.
unsafe impl Send for StringCache {}
333 | 
// Iterator over the strings stored in a set of allocations. Each call to
// `next()` decodes the `StringCacheEntry` at `current_ptr` and then steps to
// the entry that follows it.
#[doc(hidden)]
pub struct StringCacheIterator {
    // One `(start, end)` pointer pair per allocation to walk: entries are read
    // from `start` and the allocation is exhausted once the cursor reaches
    // `end`.
    pub(crate) allocs: Vec<(*const u8, *const u8)>,
    // Index into `allocs` of the allocation currently being walked.
    pub(crate) current_alloc: usize,
    // Address of the next entry header to decode inside the current
    // allocation.
    pub(crate) current_ptr: *const u8,
}
340 | 
// Round `n` up to the nearest multiple of `align`.
//
// `align` must be a power of two; this is only checked in builds with debug
// assertions enabled.
//
// Panics with "round_up_to overflowed" if the rounded value does not fit in a
// `usize`. The check is made on `n + (align - 1)`, which overflows exactly
// when the rounded result is unrepresentable, so inputs near `usize::MAX`
// whose result still fits are rounded rather than rejected.
fn round_up_to(n: usize, align: usize) -> usize {
    debug_assert!(align.is_power_of_two());
    // For a power of two, `align - 1` is a mask of the bits below the
    // alignment: add it to step past the next boundary, then clear those bits.
    let low_bits = align - 1;
    n.checked_add(low_bits).expect("round_up_to overflowed") & !low_bits
}
345 | 
346 | impl Iterator for StringCacheIterator {
347 |     type Item = &'static str;
348 |     fn next(&mut self) -> Option<Self::Item> {
349 |         // check that the cache is not empty before accessing
350 |         if self.allocs.is_empty() {
351 |             return None;
352 |         }
353 | 
354 |         let (_, end) = self.allocs[self.current_alloc];
355 |         if self.current_ptr >= end {
356 |             // We've reached the end of the current alloc.
357 |             if self.current_alloc == self.allocs.len() - 1 {
358 |                 // We've reached the end.
359 |                 return None;
360 |             } else {
361 |                 // Advance to the next alloc.
362 |                 self.current_alloc += 1;
363 |                 let (current_ptr, _) = self.allocs[self.current_alloc];
364 |                 self.current_ptr = current_ptr;
365 |             }
366 |         }
367 | 
368 |         // Cast the current ptr to a `StringCacheEntry` and create the next
369 |         // string from it.
370 |         unsafe {
371 |             let sce = &*(self.current_ptr as *const StringCacheEntry);
372 |             // The next entry will be the size of the number of bytes in the
373 |             // string, +1 for the null byte, rounded up to the alignment (8).
374 |             self.current_ptr = sce.next_entry();
375 | 
376 |             // We know we're safe not to check here since we put valid UTF-8 in.
377 |             let s = std::str::from_utf8_unchecked(std::slice::from_raw_parts(
378 |                 sce.char_ptr(),
379 |                 sce.len,
380 |             ));
381 |             Some(s)
382 |         }
383 |     }
384 | }
385 | 
// Header stored in front of every cached string. The string's bytes follow
// the header directly in memory, then a trailing null byte.
//
// `repr(C)` keeps the fields in declaration order, which matters because
// `StringCache::grow` reads the hash by treating the entry's address as a
// `*const u64`.
#[repr(C)]
#[derive(Clone)]
pub(crate) struct StringCacheEntry {
    // Hash of the string, as supplied by the caller of `insert`.
    pub(crate) hash: u64,
    // Length of the string in bytes, not counting the trailing null byte.
    pub(crate) len: usize,
}
392 | 
393 | impl StringCacheEntry {
394 |     // Get the pointer to the characters.
395 |     pub(crate) fn char_ptr(&self) -> *const u8 {
396 |         // We know the chars are always directly after this struct in memory
397 |         // because that's the way they're laid out on initialization.
398 |         unsafe { (self as *const StringCacheEntry).add(1) as *const u8 }
399 |     }
400 | 
401 |     // Calcualte the address of the next entry in the cache. This is a utility
402 |     // function to hide the pointer arithmetic in iterators.
403 |     pub(crate) unsafe fn next_entry(&self) -> *const u8 {
404 |         #[allow(clippy::ptr_offset_with_cast)]
405 |         self.char_ptr().add(round_up_to(
406 |             self.len + 1,
407 |             std::mem::align_of::<StringCacheEntry>(),
408 |         ))
409 |     }
410 | }


--------------------------------------------------------------------------------
/src/ustr_extern.rs:
--------------------------------------------------------------------------------
 1 | use ustr::Ustr;
 2 | 
 3 | #[no_mangle]
 4 | pub extern "C" fn ustr(chars: *const std::os::raw::c_char) -> Ustr {
 5 |     let cs = unsafe { std::ffi::CStr::from_ptr(chars).to_string_lossy() };
 6 |     Ustr::from(&cs)
 7 | }
 8 | 
// C entry point: return the length reported by `Ustr::len` for `u`.
#[no_mangle]
pub extern "C" fn ustr_len(u: Ustr) -> usize {
    u.len()
}
13 | 
// C entry point: return the value of `Ustr::precomputed_hash` for `u`.
#[no_mangle]
pub extern "C" fn ustr_hash(u: Ustr) -> u64 {
    u.precomputed_hash()
}
18 | 


--------------------------------------------------------------------------------
/ustring_bench_raft.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anderslanglands/ustr/4c1fde840807f8af9d4bfd38805c3d89ac51baeb/ustring_bench_raft.png


--------------------------------------------------------------------------------