├── .gitignore ├── Cargo.toml ├── LICENSE-MIT ├── src ├── lib.rs ├── util.rs ├── buf32.rs ├── utf8_decode.rs ├── bench.rs ├── fmt.rs ├── futf.rs ├── stream.rs └── tendril.rs ├── .github └── workflows │ └── main.yml ├── README.md ├── examples └── fuzz.rs └── LICENSE-APACHE /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tendril" 3 | version = "0.4.3" 4 | authors = ["Keegan McAllister ", 5 | "Simon Sapin ", 6 | "Chris Morgan "] 7 | repository = "https://github.com/servo/tendril" 8 | readme = "README.md" 9 | license = "MIT/Apache-2.0" 10 | description = "Compact buffer/string type for zero-copy parsing" 11 | 12 | [dependencies] 13 | encoding = {version = "0.2", optional = true} 14 | encoding_rs = {version = "0.8.12", optional = true} 15 | mac = "0.1" 16 | new_debug_unreachable = "1.0.2" 17 | utf-8 = "0.7" 18 | 19 | [dev-dependencies] 20 | rand = "0.4" 21 | 22 | [features] 23 | bench = [] 24 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Keegan McAllister 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 
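//! Compact buffer/string type for zero-copy parsing; see `Tendril` and the
//! format marker types in the `fmt` module. (Added crate-level doc comment,
//! wording taken from the package description in `Cargo.toml`.)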
6 | 7 | #![cfg_attr(all(test, feature = "bench"), feature(test))] 8 | //#![cfg_attr(test, deny(warnings))] 9 | 10 | #[macro_use] 11 | extern crate debug_unreachable; 12 | #[cfg(feature = "encoding")] 13 | pub extern crate encoding; 14 | #[cfg(feature = "encoding_rs")] 15 | pub extern crate encoding_rs; 16 | #[cfg(all(test, feature = "bench"))] 17 | extern crate test; 18 | #[macro_use] 19 | extern crate mac; 20 | extern crate utf8; 21 | 22 | pub use fmt::Format; 23 | pub use stream::TendrilSink; 24 | pub use tendril::{Atomic, Atomicity, NonAtomic, SendTendril}; 25 | pub use tendril::{ByteTendril, ReadExt, SliceExt, StrTendril, SubtendrilError, Tendril}; 26 | pub use utf8_decode::IncompleteUtf8; 27 | 28 | pub mod fmt; 29 | pub mod stream; 30 | 31 | mod buf32; 32 | mod futf; 33 | mod tendril; 34 | mod utf8_decode; 35 | mod util; 36 | 37 | static OFLOW: &'static str = "tendril: overflow in buffer arithmetic"; 38 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | workflow_dispatch: 8 | merge_group: 9 | types: [checks_requested] 10 | 11 | jobs: 12 | linux-ci: 13 | name: Linux 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | toolchain: ["stable", "beta", "nightly", "1.36.0"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - name: Install toolchain 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | profile: minimal 25 | toolchain: ${{ matrix.toolchain }} 26 | override: true 27 | 28 | - name: Cargo build 29 | run: cargo build 30 | 31 | - name: Cargo doc 32 | run: cargo doc 33 | 34 | - name: Cargo test 35 | run: cargo test --features 'encoding encoding_rs' 36 | 37 | - name: Cargo bench 38 | if: matrix.toolchain == 'nightly' 39 | run: cargo test --features bench 40 | 41 | build_result: 42 | name: Result 43 | runs-on: ubuntu-latest 44 | needs: 45 | - "linux-ci" 46 | 47 | steps: 48 | - name: Mark the job as successful 49 | run: exit 0 50 | if: success() 51 | - name: Mark the job as unsuccessful 52 | run: exit 1 53 | if: "!success()" 54 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 
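//! Small unsafe helpers used by the `Tendril` implementation: unchecked
//! slicing, raw pointer copies, and lifetime-changing transmutes.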
6 | 7 | use std::mem; 8 | use std::{ptr, slice}; 9 | 10 | #[inline(always)] 11 | pub unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { 12 | debug_assert!(start <= buf.len()); 13 | debug_assert!(new_len <= (buf.len() - start)); 14 | slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) 15 | } 16 | 17 | #[inline(always)] 18 | pub unsafe fn unsafe_slice_mut<'a>( 19 | buf: &'a mut [u8], 20 | start: usize, 21 | new_len: usize, 22 | ) -> &'a mut [u8] { 23 | debug_assert!(start <= buf.len()); 24 | debug_assert!(new_len <= (buf.len() - start)); 25 | slice::from_raw_parts_mut(buf.as_mut_ptr().offset(start as isize), new_len) 26 | } 27 | 28 | #[inline(always)] 29 | pub unsafe fn copy_and_advance(dest: &mut *mut u8, src: &[u8]) { 30 | ptr::copy_nonoverlapping(src.as_ptr(), *dest, src.len()); 31 | *dest = dest.offset(src.len() as isize) 32 | } 33 | 34 | #[inline(always)] 35 | pub unsafe fn copy_lifetime_mut<'a, S: ?Sized, T: ?Sized + 'a>( 36 | _ptr: &'a mut S, 37 | ptr: &mut T, 38 | ) -> &'a mut T { 39 | mem::transmute(ptr) 40 | } 41 | 42 | #[inline(always)] 43 | pub unsafe fn copy_lifetime<'a, S: ?Sized, T: ?Sized + 'a>(_ptr: &'a S, ptr: &T) -> &'a T { 44 | mem::transmute(ptr) 45 | } 46 | -------------------------------------------------------------------------------- /src/buf32.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | //! Provides an unsafe owned buffer type, used in implementing `Tendril`. 8 | 9 | use std::{mem, ptr, slice, u32}; 10 | 11 | use OFLOW; 12 | 13 | pub const MIN_CAP: u32 = 16; 14 | 15 | pub const MAX_LEN: usize = u32::MAX as usize; 16 | 17 | /// A buffer points to a header of type `H`, which is followed by `MIN_CAP` or more 18 | /// bytes of storage. 19 | pub struct Buf32 { 20 | pub ptr: *mut H, 21 | pub len: u32, 22 | pub cap: u32, 23 | } 24 | 25 | #[inline(always)] 26 | fn bytes_to_vec_capacity(x: u32) -> usize { 27 | let header = mem::size_of::(); 28 | debug_assert!(header > 0); 29 | let x = (x as usize).checked_add(header).expect(OFLOW); 30 | // Integer ceil https://stackoverflow.com/a/2745086/1162888 31 | 1 + ((x - 1) / header) 32 | } 33 | 34 | impl Buf32 { 35 | #[inline] 36 | pub unsafe fn with_capacity(mut cap: u32, h: H) -> Buf32 { 37 | if cap < MIN_CAP { 38 | cap = MIN_CAP; 39 | } 40 | 41 | let mut vec = Vec::::with_capacity(bytes_to_vec_capacity::(cap)); 42 | let ptr = vec.as_mut_ptr(); 43 | mem::forget(vec); 44 | ptr::write(ptr, h); 45 | 46 | Buf32 { 47 | ptr: ptr, 48 | len: 0, 49 | cap: cap, 50 | } 51 | } 52 | 53 | #[inline] 54 | pub unsafe fn destroy(self) { 55 | mem::drop(Vec::from_raw_parts( 56 | self.ptr, 57 | 1, 58 | bytes_to_vec_capacity::(self.cap), 59 | )); 60 | } 61 | 62 | #[inline(always)] 63 | pub unsafe fn data_ptr(&self) -> *mut u8 { 64 | (self.ptr as *mut u8).offset(mem::size_of::() as isize) 65 | } 66 | 67 | #[inline(always)] 68 | pub unsafe fn data(&self) -> &[u8] { 69 | slice::from_raw_parts(self.data_ptr(), self.len as usize) 70 | } 71 | 72 | #[inline(always)] 73 | pub unsafe fn data_mut(&mut self) -> &mut [u8] { 74 | slice::from_raw_parts_mut(self.data_ptr(), self.len as usize) 75 | } 76 | 77 | /// Grow the capacity to at least `new_cap`. 78 | /// 79 | /// This will panic if the capacity calculation overflows `u32`. 
80 | #[inline] 81 | pub unsafe fn grow(&mut self, new_cap: u32) { 82 | if new_cap <= self.cap { 83 | return; 84 | } 85 | 86 | let new_cap = new_cap.checked_next_power_of_two().expect(OFLOW); 87 | let mut vec = Vec::from_raw_parts(self.ptr, 0, bytes_to_vec_capacity::(self.cap)); 88 | vec.reserve_exact(bytes_to_vec_capacity::(new_cap)); 89 | self.ptr = vec.as_mut_ptr(); 90 | self.cap = new_cap; 91 | mem::forget(vec); 92 | } 93 | } 94 | 95 | #[cfg(test)] 96 | mod test { 97 | use super::Buf32; 98 | use std::ptr; 99 | 100 | #[test] 101 | fn smoke_test() { 102 | unsafe { 103 | let mut b = Buf32::with_capacity(0, 0u8); 104 | assert_eq!(b"", b.data()); 105 | 106 | b.grow(5); 107 | ptr::copy_nonoverlapping(b"Hello".as_ptr(), b.data_ptr(), 5); 108 | 109 | assert_eq!(b"", b.data()); 110 | b.len = 5; 111 | assert_eq!(b"Hello", b.data()); 112 | 113 | b.grow(1337); 114 | assert!(b.cap >= 1337); 115 | assert_eq!(b"Hello", b.data()); 116 | 117 | b.destroy(); 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/utf8_decode.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | use fmt; 8 | use tendril::{Atomicity, Tendril}; 9 | use utf8; 10 | 11 | pub struct IncompleteUtf8(utf8::Incomplete); 12 | 13 | impl Tendril 14 | where 15 | A: Atomicity, 16 | { 17 | pub fn decode_utf8_lossy(mut self, mut push_utf8: F) -> Option 18 | where 19 | F: FnMut(Tendril), 20 | { 21 | loop { 22 | if self.is_empty() { 23 | return None; 24 | } 25 | let unborrowed_result = match utf8::decode(&self) { 26 | Ok(s) => { 27 | debug_assert!(s.as_ptr() == self.as_ptr()); 28 | debug_assert!(s.len() == self.len()); 29 | Ok(()) 30 | } 31 | Err(utf8::DecodeError::Invalid { 32 | valid_prefix, 33 | invalid_sequence, 34 | .. 
35 | }) => { 36 | debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); 37 | debug_assert!(valid_prefix.len() <= self.len()); 38 | Err(( 39 | valid_prefix.len(), 40 | Err(valid_prefix.len() + invalid_sequence.len()), 41 | )) 42 | } 43 | Err(utf8::DecodeError::Incomplete { 44 | valid_prefix, 45 | incomplete_suffix, 46 | }) => { 47 | debug_assert!(valid_prefix.as_ptr() == self.as_ptr()); 48 | debug_assert!(valid_prefix.len() <= self.len()); 49 | Err((valid_prefix.len(), Ok(incomplete_suffix))) 50 | } 51 | }; 52 | match unborrowed_result { 53 | Ok(()) => { 54 | unsafe { push_utf8(self.reinterpret_without_validating()) } 55 | return None; 56 | } 57 | Err((valid_len, and_then)) => { 58 | if valid_len > 0 { 59 | let subtendril = self.subtendril(0, valid_len as u32); 60 | unsafe { push_utf8(subtendril.reinterpret_without_validating()) } 61 | } 62 | match and_then { 63 | Ok(incomplete) => return Some(IncompleteUtf8(incomplete)), 64 | Err(offset) => { 65 | push_utf8(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); 66 | self.pop_front(offset as u32) 67 | } 68 | } 69 | } 70 | } 71 | } 72 | } 73 | } 74 | 75 | impl IncompleteUtf8 { 76 | pub fn try_complete( 77 | &mut self, 78 | mut input: Tendril, 79 | mut push_utf8: F, 80 | ) -> Result, ()> 81 | where 82 | A: Atomicity, 83 | F: FnMut(Tendril), 84 | { 85 | let resume_at; 86 | match self.0.try_complete(&input) { 87 | None => return Err(()), 88 | Some((result, rest)) => { 89 | push_utf8(Tendril::from_slice( 90 | result.unwrap_or(utf8::REPLACEMENT_CHARACTER), 91 | )); 92 | resume_at = input.len() - rest.len(); 93 | } 94 | } 95 | input.pop_front(resume_at as u32); 96 | Ok(input) 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tendril 2 | 3 | **Warning**: This library is at a very early stage of development, and it 4 | contains a substantial amount of `unsafe` code. Use at your own risk! 5 | 6 | [![Build Status](https://github.com/servo/tendril/workflows/CI/badge.svg)](https://github.com/servo/tendril/actions) 7 | 8 | [API Documentation](https://doc.servo.org/tendril/index.html) 9 | 10 | ## Introduction 11 | 12 | `Tendril` is a compact string/buffer type, optimized for zero-copy parsing. 13 | Tendrils have the semantics of owned strings, but are sometimes views into 14 | shared buffers. When you mutate a tendril, an owned copy is made if necessary. 15 | Further mutations occur in-place until the string becomes shared, e.g. with 16 | `clone()` or `subtendril()`. 17 | 18 | Buffer sharing is accomplished through thread-local (non-atomic) reference 19 | counting, which has very low overhead. The Rust type system will prevent 20 | you at compile time from sending a tendril between threads. (See below 21 | for thoughts on relaxing this restriction.) 22 | 23 | Whereas `String` allocates in the heap for any non-empty string, `Tendril` can 24 | store small strings (up to 8 bytes) in-line, without a heap allocation. 25 | `Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes versus 26 | 24. `Option` is the same size as `Tendril`, thanks to 27 | [`NonZero`][NonZero]. 28 | 29 | The maximum length of a tendril is 4 GB. The library will panic if you attempt 30 | to go over the limit. 31 | 32 | ## Formats and encoding 33 | 34 | `Tendril` uses 35 | [phantom types](https://doc.rust-lang.org/stable/rust-by-example/generics/phantom.html) 36 | to track a buffer's format. 
This determines at compile time which 37 | operations are available on a given tendril. For example, `Tendril` and 38 | `Tendril` can be borrowed as `&str` and `&[u8]` respectively. 39 | 40 | `Tendril` also integrates with 41 | [rust-encoding](https://github.com/lifthrasiir/rust-encoding) and has 42 | preliminary support for [WTF-8][] buffers. 43 | 44 | ## Plans for the future 45 | 46 | ### Ropes 47 | 48 | [html5ever][] will use `Tendril` as a zero-copy text representation. It would 49 | be good to preserve this all the way through to Servo's DOM. This would reduce 50 | memory consumption, and possibly speed up text shaping and painting. However, 51 | DOM text may conceivably be larger than 4 GB, and will anyway not be contiguous 52 | in memory around e.g. a character entity reference. 53 | 54 | *Solution:* Build a **[rope][] on top of these strings** and use that as 55 | Servo's representation of DOM text. We can perhaps do text shaping and/or 56 | painting in parallel for different chunks of a rope. html5ever can additionally 57 | use this rope type as a replacement for `BufferQueue`. 58 | 59 | Because the underlying buffers are reference-counted, the bulk of this rope 60 | is already a [persistent data structure][]. Consider what happens when 61 | appending two ropes to get a "new" rope. A vector-backed rope would copy a 62 | vector of small structs, one for each chunk, and would bump the corresponding 63 | refcounts. But it would not copy any of the string data. 64 | 65 | If we want more sharing, then a [2-3 finger tree][] could be a good choice. 66 | We would probably stick with `VecDeque` for ropes under a certain size. 67 | 68 | ### UTF-16 compatibility 69 | 70 | SpiderMonkey expects text to be in UCS-2 format for the most part. The 71 | semantics of JavaScript strings are difficult to implement on UTF-8. This also 72 | applies to HTML parsing via `document.write`. Also, passing SpiderMonkey a 73 | string that isn't contiguous in memory will incur additional overhead and 74 | complexity, if not a full copy. 75 | 76 | *Solution:* Use **WTF-8 in parsing** and in the DOM. Servo will **convert to 77 | contiguous UTF-16 when necessary**. The conversion can easily be parallelized, 78 | if we find a practical need to convert huge chunks of text all at once. 79 | 80 | ### Source span information 81 | 82 | Some html5ever API consumers want to know the originating location in the HTML 83 | source file(s) of each token or parse error. An example application would be a 84 | command-line HTML validator with diagnostic output similar to `rustc`'s. 85 | 86 | *Solution:* Accept **some metadata along with each input string**. The type of 87 | metadata is chosen by the API consumer; it defaults to `()`, which has size 88 | zero. For any non-inline string, we can provide the associated metadata as well 89 | as a byte offset. 
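## Example

A minimal sketch of the semantics described above. This is hypothetical usage,
not one of this repository's examples or tests, and it assumes only the
`StrTendril`/`ByteTendril` API and `SliceExt` conversions re-exported from
`src/lib.rs`:

```rust
use std::mem::size_of;
use tendril::{ByteTendril, SliceExt, StrTendril};

fn main() {
    // Option<StrTendril> is the same size as StrTendril itself.
    assert_eq!(size_of::<StrTendril>(), size_of::<Option<StrTendril>>());

    // Tendrils have owned-string semantics...
    let mut t: StrTendril = "zero-copy parsing with tendril".to_tendril();

    // ...but subtendril() is a view into the same shared buffer.
    let word = t.subtendril(10, 7);
    assert_eq!(&*word, "parsing");

    // Mutating a shared tendril makes an owned copy first, so existing
    // views are unaffected.
    t.push_slice(" and html5ever");
    assert_eq!(&*word, "parsing");

    // The format phantom type controls borrowing: a UTF8 tendril borrows
    // as &str, a Bytes tendril as &[u8].
    let bytes: ByteTendril = ByteTendril::from_slice(&b"raw bytes"[..]);
    assert_eq!(&*bytes, &b"raw bytes"[..]);
}
```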
90 | 91 | [NonZero]: https://doc.rust-lang.org/core/nonzero/struct.NonZero.html 92 | [html5ever]: https://github.com/servo/html5ever 93 | [WTF-8]: https://simonsapin.github.io/wtf-8/ 94 | [rope]: https://en.wikipedia.org/wiki/Rope_%28data_structure%29 95 | [persistent data structure]: https://en.wikipedia.org/wiki/Persistent_data_structure 96 | [2-3 finger tree]: https://www.staff.city.ac.uk/~ross/papers/FingerTree.html 97 | -------------------------------------------------------------------------------- /examples/fuzz.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | //! A simple fuzz tester for the library. 8 | 9 | #![deny(warnings)] 10 | 11 | extern crate rand; 12 | extern crate tendril; 13 | 14 | use std::borrow::ToOwned; 15 | 16 | use rand::distributions::{IndependentSample, Range}; 17 | use rand::Rng; 18 | use tendril::StrTendril; 19 | 20 | fn fuzz() { 21 | let mut rng = rand::thread_rng(); 22 | let capacity = Range::new(0u32, 1 << 14).ind_sample(&mut rng); 23 | let mut buf_string = String::with_capacity(capacity as usize); 24 | let mut buf_tendril = StrTendril::with_capacity(capacity); 25 | let mut string_slices = vec![]; 26 | let mut tendril_slices = vec![]; 27 | 28 | for _ in 1..100_000 { 29 | if buf_string.len() > (1 << 30) { 30 | buf_string.truncate(0); 31 | buf_tendril.clear(); 32 | } 33 | 34 | let dist_action = Range::new(0, 100); 35 | match dist_action.ind_sample(&mut rng) { 36 | 0..=15 => { 37 | let (start, end) = random_slice(&mut rng, TEXT); 38 | let snip = &TEXT[start..end]; 39 | buf_string.push_str(snip); 40 | buf_tendril.push_slice(snip); 41 | assert_eq!(&*buf_string, &*buf_tendril); 42 | } 43 | 44 | 16..=31 => { 45 | let (start, end) = random_slice(&mut rng, &buf_string); 46 | let snip = &buf_string[start..end].to_owned(); 47 | buf_string.push_str(&snip); 48 | buf_tendril.push_slice(&snip); 49 | assert_eq!(&*buf_string, &*buf_tendril); 50 | } 51 | 52 | 32..=47 => { 53 | let lenstr = format!("[length = {}]", buf_tendril.len()); 54 | buf_string.push_str(&lenstr); 55 | buf_tendril.push_slice(&lenstr); 56 | assert_eq!(&*buf_string, &*buf_tendril); 57 | } 58 | 59 | 48..=63 => { 60 | let n = random_boundary(&mut rng, &buf_string); 61 | buf_tendril.pop_front(n as u32); 62 | buf_string = buf_string[n..].to_owned(); 63 | assert_eq!(&*buf_string, &*buf_tendril); 64 | } 65 | 66 | 64..=79 => { 67 | let new_len = random_boundary(&mut rng, &buf_string); 68 | let n = buf_string.len() - new_len; 69 | buf_string.truncate(new_len); 70 | buf_tendril.pop_back(n as u32); 71 | assert_eq!(&*buf_string, &*buf_tendril); 72 | } 73 | 74 | 80..=90 => { 75 | let (start, end) = random_slice(&mut rng, &buf_string); 76 | buf_string = buf_string[start..end].to_owned(); 77 | buf_tendril = buf_tendril.subtendril(start as u32, (end - start) as u32); 78 | assert_eq!(&*buf_string, &*buf_tendril); 79 | } 80 | 81 | 91..=96 => { 82 | let c = rng.gen(); 83 | buf_string.push(c); 84 | assert!(buf_tendril.try_push_char(c).is_ok()); 85 | assert_eq!(&*buf_string, &*buf_tendril); 86 | } 87 | 88 | 97 => { 89 | buf_string.truncate(0); 90 | buf_tendril.clear(); 91 | assert_eq!(&*buf_string, &*buf_tendril); 92 | } 93 | 94 | _ => { 95 | let (start, end) = random_slice(&mut rng, &buf_string); 96 | string_slices.push(buf_string[start..end].to_owned()); 97 | 
tendril_slices.push(buf_tendril.subtendril(start as u32, (end - start) as u32)); 98 | assert_eq!(string_slices.len(), tendril_slices.len()); 99 | assert!(string_slices 100 | .iter() 101 | .zip(tendril_slices.iter()) 102 | .all(|(s, t)| **s == **t)); 103 | } 104 | } 105 | } 106 | } 107 | 108 | fn random_boundary(rng: &mut R, text: &str) -> usize { 109 | loop { 110 | let i = Range::new(0, text.len() + 1).ind_sample(rng); 111 | if text.is_char_boundary(i) { 112 | return i; 113 | } 114 | } 115 | } 116 | 117 | fn random_slice(rng: &mut R, text: &str) -> (usize, usize) { 118 | loop { 119 | let start = Range::new(0, text.len() + 1).ind_sample(rng); 120 | let end = Range::new(start, text.len() + 1).ind_sample(rng); 121 | if !text.is_char_boundary(start) { 122 | continue; 123 | } 124 | if end < text.len() && !text.is_char_boundary(end) { 125 | continue; 126 | } 127 | return (start, end); 128 | } 129 | } 130 | 131 | static TEXT: &'static str = 132 | "It was from the artists and poets that the pertinent answers came, and I \ 133 | know that panic would have broken loose had they been able to compare notes. \ 134 | As it was, lacking their original letters, I half suspected the compiler of \ 135 | having asked leading questions, or of having edited the correspondence in \ 136 | corroboration of what he had latently resolved to see.\ 137 | \ 138 | ˙ǝǝs oʇ pǝʌʃosǝɹ ʎʃʇuǝʇɐʃ pɐɥ ǝɥ ʇɐɥʍ ɟo uoıʇɐɹoqoɹɹoɔ uı ǝɔuǝpuodsǝɹɹoɔ ǝɥʇ \ 139 | pǝʇıpǝ ƃuıʌɐɥ ɟo ɹo 'suoıʇsǝnb ƃuıpɐǝʃ pǝʞsɐ ƃuıʌɐɥ ɟo ɹǝʃıdɯoɔ ǝɥʇ pǝʇɔǝdsns \ 140 | ɟʃɐɥ I 'sɹǝʇʇǝʃ ʃɐuıƃıɹo ɹıǝɥʇ ƃuıʞɔɐʃ 'sɐʍ ʇı s∀ ˙sǝʇou ǝɹɐdɯoɔ oʇ ǝʃqɐ uǝǝq \ 141 | ʎǝɥʇ pɐɥ ǝsooʃ uǝʞoɹq ǝʌɐɥ pʃnoʍ ɔıuɐd ʇɐɥʇ ʍouʞ I puɐ 'ǝɯɐɔ sɹǝʍsuɐ ʇuǝuıʇɹǝd \ 142 | ǝɥʇ ʇɐɥʇ sʇǝod puɐ sʇsıʇɹɐ ǝɥʇ ɯoɹɟ sɐʍ ʇI"; 143 | 144 | fn main() { 145 | fuzz(); 146 | } 147 | -------------------------------------------------------------------------------- /src/bench.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | use std::borrow::ToOwned; 8 | use std::collections::hash_map::{Entry, HashMap}; 9 | 10 | use tendril::StrTendril; 11 | 12 | fn index_words_string(input: &String) -> HashMap> { 13 | let mut index = HashMap::new(); 14 | for word in input.split(|c| c == ' ') { 15 | if word.len() == 0 { 16 | continue; 17 | } 18 | let word = word.to_owned(); 19 | match index.entry(word.chars().next().unwrap()) { 20 | Entry::Occupied(mut e) => { 21 | let x: &mut Vec = e.get_mut(); 22 | x.push(word); 23 | } 24 | Entry::Vacant(e) => { 25 | e.insert(vec![word]); 26 | } 27 | } 28 | } 29 | index 30 | } 31 | 32 | fn index_words_tendril(input: &StrTendril) -> HashMap> { 33 | let mut index = HashMap::new(); 34 | let mut t = input.clone(); 35 | loop { 36 | match t.pop_front_char_run(|c| c != ' ') { 37 | None => return index, 38 | Some((_, false)) => (), 39 | Some((word, true)) => match index.entry(word.chars().next().unwrap()) { 40 | Entry::Occupied(mut e) => { 41 | e.get_mut().push(word); 42 | } 43 | Entry::Vacant(e) => { 44 | e.insert(vec![word]); 45 | } 46 | }, 47 | } 48 | } 49 | } 50 | 51 | static EN_1: &'static str = "Days turn to nights turn to paper into rocks into plastic"; 52 | 53 | static EN_2: &'static str = 54 | "Here the notes in my laboratory journal cease. I was able to write the last \ 55 | words only with great effort. 
By now it was already clear to me that LSD had \ 56 | been the cause of the remarkable experience of the previous Friday, for the \ 57 | altered perceptions were of the same type as before, only much more intense. I \ 58 | had to struggle to speak intelligibly. I asked my laboratory assistant, who was \ 59 | informed of the self-experiment, to escort me home. We went by bicycle, no \ 60 | automobile being available because of wartime restrictions on their use. On the \ 61 | way home, my condition began to assume threatening forms. Everything in my \ 62 | field of vision wavered and was distorted as if seen in a curved mirror. I also \ 63 | had the sensation of being unable to move from the spot. Nevertheless, my \ 64 | assistant later told me that we had traveled very rapidly. Finally, we arrived \ 65 | at home safe and sound, and I was just barely capable of asking my companion to \ 66 | summon our family doctor and request milk from the neighbors.\n\n\ 67 | In spite of my delirious, bewildered condition, I had brief periods of clear \ 68 | and effective thinking—and chose milk as a nonspecific antidote for poisoning."; 69 | 70 | static KR_1: &'static str = 71 | "러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, 메모리-안전하고 병렬 \ 72 | 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. 아직 \ 73 | 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다."; 74 | 75 | static HTML_KR_1: &'static str = 76 | "

러스트(Rust)는 모질라(mozilla.org)에서 개발하고 있는, \ 78 | 메모리-안전하고 병렬 프로그래밍이 쉬운 차세대 프로그래밍 언어입니다. \ 79 | 아직 개발 단계이며 많은 기능이 구현 중으로, MIT/Apache2 라이선스로 배포됩니다.

"; 80 | 81 | mod index_words { 82 | macro_rules! bench { 83 | ($txt:ident) => { 84 | #[allow(non_snake_case)] 85 | mod $txt { 86 | const SMALL_SIZE: usize = 65536; 87 | const LARGE_SIZE: usize = (1 << 20); 88 | 89 | #[bench] 90 | fn index_words_string(b: &mut ::test::Bencher) { 91 | let mut s = String::new(); 92 | while s.len() < SMALL_SIZE { 93 | s.push_str(::tendril::bench::$txt); 94 | } 95 | b.iter(|| ::tendril::bench::index_words_string(&s)); 96 | } 97 | 98 | #[bench] 99 | fn index_words_tendril(b: &mut ::test::Bencher) { 100 | let mut t = ::tendril::StrTendril::new(); 101 | while t.len() < SMALL_SIZE { 102 | t.push_slice(::tendril::bench::$txt); 103 | } 104 | b.iter(|| ::tendril::bench::index_words_tendril(&t)); 105 | } 106 | 107 | #[bench] 108 | fn index_words_big_string(b: &mut ::test::Bencher) { 109 | let mut s = String::new(); 110 | while s.len() < LARGE_SIZE { 111 | s.push_str(::tendril::bench::$txt); 112 | } 113 | b.iter(|| ::tendril::bench::index_words_string(&s)); 114 | } 115 | 116 | #[bench] 117 | fn index_words_big_tendril(b: &mut ::test::Bencher) { 118 | let mut t = ::tendril::StrTendril::new(); 119 | while t.len() < LARGE_SIZE { 120 | t.push_slice(::tendril::bench::$txt); 121 | } 122 | b.iter(|| ::tendril::bench::index_words_tendril(&t)); 123 | } 124 | 125 | #[test] 126 | fn correctness() { 127 | use std::borrow::ToOwned; 128 | use tendril::bench::{index_words_string, index_words_tendril}; 129 | use tendril::SliceExt; 130 | 131 | let txt = ::tendril::bench::$txt; 132 | let input_string = txt.to_owned(); 133 | let count_s = index_words_string(&input_string); 134 | let mut keys: Vec = count_s.keys().cloned().collect(); 135 | keys.sort(); 136 | 137 | let input_tendril = txt.to_tendril(); 138 | let count_t = index_words_tendril(&input_tendril); 139 | let mut keys_t: Vec = count_t.keys().cloned().collect(); 140 | keys_t.sort(); 141 | 142 | assert_eq!(keys, keys_t); 143 | 144 | for k in &keys { 145 | let vs = &count_s[k]; 146 | let vt = &count_t[k]; 147 | assert_eq!(vs.len(), vt.len()); 148 | assert!(vs.iter().zip(vt.iter()).all(|(s, t)| **s == **t)); 149 | } 150 | } 151 | } 152 | }; 153 | } 154 | 155 | bench!(EN_1); 156 | bench!(EN_2); 157 | bench!(KR_1); 158 | bench!(HTML_KR_1); 159 | } 160 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /src/fmt.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | //! Marker types for formats. 8 | //! 9 | //! This module defines the types and traits used to mark a `Tendril` 10 | //! with the format of data it contains. It includes those formats 11 | //! for which `Tendril` supports at least some operations without 12 | //! conversion. 13 | //! 14 | //! To convert a string tendril to/from a byte tendril in an arbitrary 15 | //! character encoding, see the `encode` and `decode` methods on 16 | //! `Tendril`. 17 | //! 18 | //! `Tendril` operations may become memory-unsafe if data invalid for 19 | //! the format sneaks in. For that reason, these traits require 20 | //! `unsafe impl`. 21 | 22 | use std::default::Default; 23 | use std::{char, mem, str}; 24 | 25 | use futf::{self, Codepoint, Meaning}; 26 | 27 | /// Implementation details. 28 | /// 29 | /// You don't need these unless you are implementing 30 | /// a new format. 31 | pub mod imp { 32 | use std::default::Default; 33 | use std::{iter, mem, slice}; 34 | 35 | /// Describes how to fix up encodings when concatenating. 36 | /// 37 | /// We can drop characters on either side of the splice, 38 | /// and insert up to 4 bytes in the middle. 39 | pub struct Fixup { 40 | pub drop_left: u32, 41 | pub drop_right: u32, 42 | pub insert_len: u32, 43 | pub insert_bytes: [u8; 4], 44 | } 45 | 46 | impl Default for Fixup { 47 | #[inline(always)] 48 | fn default() -> Fixup { 49 | Fixup { 50 | drop_left: 0, 51 | drop_right: 0, 52 | insert_len: 0, 53 | insert_bytes: [0; 4], 54 | } 55 | } 56 | } 57 | 58 | #[inline(always)] 59 | unsafe fn from_u32_unchecked(n: u32) -> char { 60 | mem::transmute(n) 61 | } 62 | 63 | pub struct SingleByteCharIndices<'a> { 64 | inner: iter::Enumerate>, 65 | } 66 | 67 | impl<'a> Iterator for SingleByteCharIndices<'a> { 68 | type Item = (usize, char); 69 | 70 | #[inline] 71 | fn next(&mut self) -> Option<(usize, char)> { 72 | self.inner 73 | .next() 74 | .map(|(i, &b)| unsafe { (i, from_u32_unchecked(b as u32)) }) 75 | } 76 | } 77 | 78 | impl<'a> SingleByteCharIndices<'a> { 79 | #[inline] 80 | pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> { 81 | SingleByteCharIndices { 82 | inner: buf.iter().enumerate(), 83 | } 84 | } 85 | } 86 | } 87 | 88 | /// Trait for format marker types. 89 | /// 90 | /// The type implementing this trait is usually not instantiated. 91 | /// It's used with a phantom type parameter of `Tendril`. 92 | pub unsafe trait Format { 93 | /// Check whether the buffer is valid for this format. 94 | fn validate(buf: &[u8]) -> bool; 95 | 96 | /// Check whether the buffer is valid for this format. 97 | /// 98 | /// You may assume the buffer is a prefix of a valid buffer. 99 | #[inline] 100 | fn validate_prefix(buf: &[u8]) -> bool { 101 | ::validate(buf) 102 | } 103 | 104 | /// Check whether the buffer is valid for this format. 105 | /// 106 | /// You may assume the buffer is a suffix of a valid buffer. 107 | #[inline] 108 | fn validate_suffix(buf: &[u8]) -> bool { 109 | ::validate(buf) 110 | } 111 | 112 | /// Check whether the buffer is valid for this format. 
113 | /// 114 | /// You may assume the buffer is a contiguous subsequence 115 | /// of a valid buffer, but not necessarily a prefix or 116 | /// a suffix. 117 | #[inline] 118 | fn validate_subseq(buf: &[u8]) -> bool { 119 | ::validate(buf) 120 | } 121 | 122 | /// Compute any fixup needed when concatenating buffers. 123 | /// 124 | /// The default is to do nothing. 125 | /// 126 | /// The function is `unsafe` because it may assume the input 127 | /// buffers are already valid for the format. Also, no 128 | /// bounds-checking is performed on the return value! 129 | #[inline(always)] 130 | unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { 131 | Default::default() 132 | } 133 | } 134 | 135 | /// Indicates that one format is a subset of another. 136 | /// 137 | /// The subset format can be converted to the superset format 138 | /// for free. 139 | pub unsafe trait SubsetOf: Format 140 | where 141 | Super: Format, 142 | { 143 | /// Validate the *other* direction of conversion; check if 144 | /// this buffer from the superset format conforms to the 145 | /// subset format. 146 | /// 147 | /// The default calls `Self::validate`, but some conversions 148 | /// may implement a check which is cheaper than validating 149 | /// from scratch. 150 | fn revalidate_subset(x: &[u8]) -> bool { 151 | Self::validate(x) 152 | } 153 | } 154 | 155 | /// Indicates a format which corresponds to a Rust slice type, 156 | /// representing exactly the same invariants. 157 | pub unsafe trait SliceFormat: Format + Sized { 158 | type Slice: ?Sized + Slice; 159 | } 160 | 161 | /// Indicates a format which contains characters from Unicode 162 | /// (all of it, or some proper subset). 163 | pub unsafe trait CharFormat<'a>: Format { 164 | /// Iterator for characters and their byte indices. 165 | type Iter: Iterator; 166 | 167 | /// Iterate over the characters of the string and their byte 168 | /// indices. 169 | /// 170 | /// You may assume the buffer is *already validated* for `Format`. 171 | unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; 172 | 173 | /// Encode the character as bytes and pass them to a continuation. 174 | /// 175 | /// Returns `Err(())` iff the character cannot be represented. 176 | fn encode_char(ch: char, cont: F) -> Result<(), ()> 177 | where 178 | F: FnOnce(&[u8]); 179 | } 180 | 181 | /// Indicates a Rust slice type that is represented in memory as bytes. 182 | pub unsafe trait Slice { 183 | /// Access the raw bytes of the slice. 184 | fn as_bytes(&self) -> &[u8]; 185 | 186 | /// Convert a byte slice to this kind of slice. 187 | /// 188 | /// You may assume the buffer is *already validated* 189 | /// for `Format`. 190 | unsafe fn from_bytes(x: &[u8]) -> &Self; 191 | 192 | /// Convert a byte slice to this kind of slice. 193 | /// 194 | /// You may assume the buffer is *already validated* 195 | /// for `Format`. 196 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self; 197 | } 198 | 199 | /// Marker type for uninterpreted bytes. 200 | /// 201 | /// Validation will never fail for this format. 
202 | #[derive(Copy, Clone, Default, Debug)] 203 | pub struct Bytes; 204 | 205 | unsafe impl Format for Bytes { 206 | #[inline(always)] 207 | fn validate(_: &[u8]) -> bool { 208 | true 209 | } 210 | } 211 | 212 | unsafe impl SliceFormat for Bytes { 213 | type Slice = [u8]; 214 | } 215 | 216 | unsafe impl Slice for [u8] { 217 | #[inline(always)] 218 | fn as_bytes(&self) -> &[u8] { 219 | self 220 | } 221 | 222 | #[inline(always)] 223 | unsafe fn from_bytes(x: &[u8]) -> &[u8] { 224 | x 225 | } 226 | 227 | #[inline(always)] 228 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { 229 | x 230 | } 231 | } 232 | 233 | /// Marker type for ASCII text. 234 | #[derive(Copy, Clone, Default, Debug)] 235 | pub struct ASCII; 236 | 237 | unsafe impl Format for ASCII { 238 | #[inline] 239 | fn validate(buf: &[u8]) -> bool { 240 | buf.iter().all(|&n| n <= 127) 241 | } 242 | 243 | #[inline(always)] 244 | fn validate_prefix(_: &[u8]) -> bool { 245 | true 246 | } 247 | 248 | #[inline(always)] 249 | fn validate_suffix(_: &[u8]) -> bool { 250 | true 251 | } 252 | 253 | #[inline(always)] 254 | fn validate_subseq(_: &[u8]) -> bool { 255 | true 256 | } 257 | } 258 | 259 | unsafe impl SubsetOf for ASCII {} 260 | unsafe impl SubsetOf for ASCII {} 261 | 262 | unsafe impl<'a> CharFormat<'a> for ASCII { 263 | type Iter = imp::SingleByteCharIndices<'a>; 264 | 265 | #[inline] 266 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { 267 | imp::SingleByteCharIndices::new(buf) 268 | } 269 | 270 | #[inline] 271 | fn encode_char(ch: char, cont: F) -> Result<(), ()> 272 | where 273 | F: FnOnce(&[u8]), 274 | { 275 | let n = ch as u32; 276 | if n > 0x7F { 277 | return Err(()); 278 | } 279 | cont(&[n as u8]); 280 | Ok(()) 281 | } 282 | } 283 | 284 | /// Marker type for UTF-8 text. 285 | #[derive(Copy, Clone, Default, Debug)] 286 | pub struct UTF8; 287 | 288 | unsafe impl Format for UTF8 { 289 | #[inline] 290 | fn validate(buf: &[u8]) -> bool { 291 | str::from_utf8(buf).is_ok() 292 | } 293 | 294 | #[inline] 295 | fn validate_prefix(buf: &[u8]) -> bool { 296 | if buf.len() == 0 { 297 | return true; 298 | } 299 | match futf::classify(buf, buf.len() - 1) { 300 | Some(Codepoint { 301 | meaning: Meaning::Whole(_), 302 | .. 303 | }) => true, 304 | _ => false, 305 | } 306 | } 307 | 308 | #[inline] 309 | fn validate_suffix(buf: &[u8]) -> bool { 310 | if buf.len() == 0 { 311 | return true; 312 | } 313 | match futf::classify(buf, 0) { 314 | Some(Codepoint { 315 | meaning: Meaning::Whole(_), 316 | .. 
317 | }) => true, 318 | _ => false, 319 | } 320 | } 321 | 322 | #[inline] 323 | fn validate_subseq(buf: &[u8]) -> bool { 324 | ::validate_prefix(buf) && ::validate_suffix(buf) 325 | } 326 | } 327 | 328 | unsafe impl SubsetOf for UTF8 {} 329 | 330 | unsafe impl SliceFormat for UTF8 { 331 | type Slice = str; 332 | } 333 | 334 | unsafe impl Slice for str { 335 | #[inline(always)] 336 | fn as_bytes(&self) -> &[u8] { 337 | str::as_bytes(self) 338 | } 339 | 340 | #[inline(always)] 341 | unsafe fn from_bytes(x: &[u8]) -> &str { 342 | str::from_utf8_unchecked(x) 343 | } 344 | 345 | #[inline(always)] 346 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { 347 | mem::transmute(x) 348 | } 349 | } 350 | 351 | unsafe impl<'a> CharFormat<'a> for UTF8 { 352 | type Iter = str::CharIndices<'a>; 353 | 354 | #[inline] 355 | unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { 356 | str::from_utf8_unchecked(buf).char_indices() 357 | } 358 | 359 | #[inline] 360 | fn encode_char(ch: char, cont: F) -> Result<(), ()> 361 | where 362 | F: FnOnce(&[u8]), 363 | { 364 | cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); 365 | Ok(()) 366 | } 367 | } 368 | 369 | /// Marker type for WTF-8 text. 370 | /// 371 | /// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). 372 | #[derive(Copy, Clone, Default, Debug)] 373 | pub struct WTF8; 374 | 375 | #[inline] 376 | fn wtf8_meaningful(m: Meaning) -> bool { 377 | match m { 378 | Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, 379 | _ => false, 380 | } 381 | } 382 | 383 | unsafe impl Format for WTF8 { 384 | #[inline] 385 | fn validate(buf: &[u8]) -> bool { 386 | let mut i = 0; 387 | let mut prev_lead = false; 388 | while i < buf.len() { 389 | let codept = unwrap_or_return!(futf::classify(buf, i), false); 390 | if !wtf8_meaningful(codept.meaning) { 391 | return false; 392 | } 393 | i += codept.bytes.len(); 394 | prev_lead = match codept.meaning { 395 | Meaning::TrailSurrogate(_) if prev_lead => return false, 396 | Meaning::LeadSurrogate(_) => true, 397 | _ => false, 398 | }; 399 | } 400 | 401 | true 402 | } 403 | 404 | #[inline] 405 | fn validate_prefix(buf: &[u8]) -> bool { 406 | if buf.len() == 0 { 407 | return true; 408 | } 409 | match futf::classify(buf, buf.len() - 1) { 410 | Some(c) => wtf8_meaningful(c.meaning), 411 | _ => false, 412 | } 413 | } 414 | 415 | #[inline] 416 | fn validate_suffix(buf: &[u8]) -> bool { 417 | if buf.len() == 0 { 418 | return true; 419 | } 420 | match futf::classify(buf, 0) { 421 | Some(c) => wtf8_meaningful(c.meaning), 422 | _ => false, 423 | } 424 | } 425 | 426 | #[inline] 427 | fn validate_subseq(buf: &[u8]) -> bool { 428 | ::validate_prefix(buf) && ::validate_suffix(buf) 429 | } 430 | 431 | #[inline] 432 | unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { 433 | const ERR: &'static str = "WTF8: internal error"; 434 | 435 | if lhs.len() >= 3 && rhs.len() >= 3 { 436 | if let ( 437 | Some(Codepoint { 438 | meaning: Meaning::LeadSurrogate(hi), 439 | .. 440 | }), 441 | Some(Codepoint { 442 | meaning: Meaning::TrailSurrogate(lo), 443 | .. 
444 | }), 445 | ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) 446 | { 447 | let mut fixup = imp::Fixup { 448 | drop_left: 3, 449 | drop_right: 3, 450 | insert_len: 0, 451 | insert_bytes: [0_u8; 4], 452 | }; 453 | 454 | let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); 455 | 456 | let ch = char::from_u32(n).expect(ERR); 457 | fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; 458 | 459 | return fixup; 460 | } 461 | } 462 | 463 | Default::default() 464 | } 465 | } 466 | 467 | /// Marker type for the single-byte encoding of the first 256 Unicode codepoints. 468 | /// 469 | /// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the 470 | /// C0 and C1 control characters from ECMA-48 / ISO 6429. 471 | /// 472 | /// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the 473 | /// many other aliases), which actually stand for Windows-1252. 474 | #[derive(Copy, Clone, Default, Debug)] 475 | pub struct Latin1; 476 | 477 | unsafe impl Format for Latin1 { 478 | #[inline(always)] 479 | fn validate(_: &[u8]) -> bool { 480 | true 481 | } 482 | 483 | #[inline(always)] 484 | fn validate_prefix(_: &[u8]) -> bool { 485 | true 486 | } 487 | 488 | #[inline(always)] 489 | fn validate_suffix(_: &[u8]) -> bool { 490 | true 491 | } 492 | 493 | #[inline(always)] 494 | fn validate_subseq(_: &[u8]) -> bool { 495 | true 496 | } 497 | } 498 | 499 | unsafe impl<'a> CharFormat<'a> for Latin1 { 500 | type Iter = imp::SingleByteCharIndices<'a>; 501 | 502 | #[inline] 503 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { 504 | imp::SingleByteCharIndices::new(buf) 505 | } 506 | 507 | #[inline] 508 | fn encode_char(ch: char, cont: F) -> Result<(), ()> 509 | where 510 | F: FnOnce(&[u8]), 511 | { 512 | let n = ch as u32; 513 | if n > 0xFF { 514 | return Err(()); 515 | } 516 | cont(&[n as u8]); 517 | Ok(()) 518 | } 519 | } 520 | -------------------------------------------------------------------------------- /src/futf.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | use std::{char, slice}; 8 | 9 | /// Meaning of a complete or partial UTF-8 codepoint. 10 | /// 11 | /// Not all checking is performed eagerly. That is, a codepoint `Prefix` or 12 | /// `Suffix` may in reality have no valid completion. 13 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] 14 | pub enum Meaning { 15 | /// We found a whole codepoint. 16 | Whole(char), 17 | 18 | /// We found something that isn't a valid Unicode codepoint, but 19 | /// it *would* correspond to a UTF-16 leading surrogate code unit, 20 | /// i.e. a value in the range `U+D800` - `U+DBFF`. 21 | /// 22 | /// The argument is the code unit's 10-bit index within that range. 23 | /// 24 | /// These are found in UTF-8 variants such as CESU-8 and WTF-8. 25 | LeadSurrogate(u16), 26 | 27 | /// We found something that isn't a valid Unicode codepoint, but 28 | /// it *would* correspond to a UTF-16 trailing surrogate code unit, 29 | /// i.e. a value in the range `U+DC00` - `U+DFFF`. 30 | /// 31 | /// The argument is the code unit's 10-bit index within that range. 32 | /// 33 | /// These are found in UTF-8 variants such as CESU-8 and WTF-8. 
34 | TrailSurrogate(u16), 35 | 36 | /// We found only a prefix of a codepoint before the buffer ended. 37 | /// 38 | /// Includes the number of additional bytes needed. 39 | Prefix(usize), 40 | 41 | /// We found only a suffix of a codepoint before running off the 42 | /// start of the buffer. 43 | /// 44 | /// Up to 3 more bytes may be needed. 45 | Suffix, 46 | } 47 | 48 | /// Represents a complete or partial UTF-8 codepoint. 49 | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] 50 | pub struct Codepoint<'a> { 51 | /// The bytes that make up the partial or full codepoint. 52 | /// 53 | /// For a `Suffix` this depends on `idx`. We don't scan forward 54 | /// for additional continuation bytes after the reverse scan 55 | /// failed to locate a multibyte sequence start. 56 | pub bytes: &'a [u8], 57 | 58 | /// Start of the codepoint in the buffer, expressed as an offset 59 | /// back from `idx`. 60 | pub rewind: usize, 61 | 62 | /// Meaning of the partial or full codepoint. 63 | pub meaning: Meaning, 64 | } 65 | 66 | #[derive(Debug, PartialEq, Eq)] 67 | enum Byte { 68 | Ascii, 69 | Start(usize), 70 | Cont, 71 | } 72 | 73 | impl Byte { 74 | #[inline(always)] 75 | fn classify(x: u8) -> Option { 76 | match x & 0xC0 { 77 | 0xC0 => match x { 78 | x if x & 0b11111_000 == 0b11110_000 => Some(Byte::Start(4)), 79 | x if x & 0b1111_0000 == 0b1110_0000 => Some(Byte::Start(3)), 80 | x if x & 0b111_00000 == 0b110_00000 => Some(Byte::Start(2)), 81 | _ => None, 82 | }, 83 | 0x80 => Some(Byte::Cont), 84 | _ => Some(Byte::Ascii), 85 | } 86 | } 87 | } 88 | 89 | #[inline(always)] 90 | fn all_cont(buf: &[u8]) -> bool { 91 | buf.iter() 92 | .all(|&b| matches!(Byte::classify(b), Some(Byte::Cont))) 93 | } 94 | 95 | // NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: 96 | // a starting byte followed by the correct number of continuation bytes. 97 | #[inline(always)] 98 | unsafe fn decode(buf: &[u8]) -> Option { 99 | debug_assert!(buf.len() >= 2); 100 | debug_assert!(buf.len() <= 4); 101 | let n; 102 | match buf.len() { 103 | 2 => { 104 | n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 105 | | ((*buf.get_unchecked(1) & 0x3F) as u32); 106 | if n < 0x80 { 107 | return None; 108 | } // Overlong 109 | } 110 | 3 => { 111 | n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 112 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 113 | | ((*buf.get_unchecked(2) & 0x3F) as u32); 114 | match n { 115 | 0x0000..=0x07FF => return None, // Overlong 116 | 0xD800..=0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), 117 | 0xDC00..=0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), 118 | _ => {} 119 | } 120 | } 121 | 4 => { 122 | n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 123 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 124 | | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 125 | | ((*buf.get_unchecked(3) & 0x3F) as u32); 126 | if n < 0x1_0000 { 127 | return None; 128 | } // Overlong 129 | } 130 | _ => debug_unreachable!(), 131 | } 132 | 133 | char::from_u32(n).map(Meaning::Whole) 134 | } 135 | 136 | #[inline(always)] 137 | unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] { 138 | debug_assert!(start <= buf.len()); 139 | debug_assert!(new_len <= (buf.len() - start)); 140 | slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len) 141 | } 142 | 143 | macro_rules! 
otry { 144 | ($x:expr) => { 145 | unwrap_or_return!($x, None) 146 | }; 147 | } 148 | 149 | /// Describes the UTF-8 codepoint containing the byte at index `idx` within 150 | /// `buf`. 151 | /// 152 | /// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 153 | /// in the vicinity of `idx`. 154 | #[inline] 155 | pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option> { 156 | if idx >= buf.len() { 157 | return None; 158 | } 159 | 160 | unsafe { 161 | let x = *buf.get_unchecked(idx); 162 | match otry!(Byte::classify(x)) { 163 | Byte::Ascii => Some(Codepoint { 164 | bytes: unsafe_slice(buf, idx, 1), 165 | rewind: 0, 166 | meaning: Meaning::Whole(x as char), 167 | }), 168 | Byte::Start(n) => { 169 | let avail = buf.len() - idx; 170 | if avail >= n { 171 | let bytes = unsafe_slice(buf, idx, n); 172 | if !all_cont(unsafe_slice(bytes, 1, n - 1)) { 173 | return None; 174 | } 175 | let meaning = otry!(decode(bytes)); 176 | Some(Codepoint { 177 | bytes: bytes, 178 | rewind: 0, 179 | meaning: meaning, 180 | }) 181 | } else { 182 | Some(Codepoint { 183 | bytes: unsafe_slice(buf, idx, avail), 184 | rewind: 0, 185 | meaning: Meaning::Prefix(n - avail), 186 | }) 187 | } 188 | } 189 | Byte::Cont => { 190 | let mut start = idx; 191 | let mut checked = 0; 192 | loop { 193 | if start == 0 { 194 | // Whoops, fell off the beginning. 195 | return Some(Codepoint { 196 | bytes: unsafe_slice(buf, 0, idx + 1), 197 | rewind: idx, 198 | meaning: Meaning::Suffix, 199 | }); 200 | } 201 | 202 | start -= 1; 203 | checked += 1; 204 | match otry!(Byte::classify(*buf.get_unchecked(start))) { 205 | Byte::Cont => (), 206 | Byte::Start(n) => { 207 | let avail = buf.len() - start; 208 | if avail >= n { 209 | let bytes = unsafe_slice(buf, start, n); 210 | if checked < n { 211 | if !all_cont(unsafe_slice(bytes, checked, n - checked)) { 212 | return None; 213 | } 214 | } 215 | let meaning = otry!(decode(bytes)); 216 | return Some(Codepoint { 217 | bytes: bytes, 218 | rewind: idx - start, 219 | meaning: meaning, 220 | }); 221 | } else { 222 | return Some(Codepoint { 223 | bytes: unsafe_slice(buf, start, avail), 224 | rewind: idx - start, 225 | meaning: Meaning::Prefix(n - avail), 226 | }); 227 | } 228 | } 229 | _ => return None, 230 | } 231 | 232 | if idx - start >= 3 { 233 | // We looked at 3 bytes before a continuation byte 234 | // and didn't find a start byte. 
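                    // The longest UTF-8 sequence is 4 bytes, so a valid
                    // start byte can sit at most 3 positions before the
                    // byte we were asked about; scanning further back
                    // cannot make this input valid.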
235 | return None; 236 | } 237 | } 238 | } 239 | } 240 | } 241 | } 242 | 243 | #[cfg(all(test, feature = "bench"))] 244 | mod tests { 245 | use super::{all_cont, classify, decode, Byte, Meaning}; 246 | use std::borrow::ToOwned; 247 | use std::io::Write; 248 | use test::Bencher; 249 | 250 | #[test] 251 | fn classify_all_bytes() { 252 | for n in 0x00..0x80 { 253 | assert_eq!(Byte::classify(n), Some(Byte::Ascii)); 254 | } 255 | for n in 0x80..0xC0 { 256 | assert_eq!(Byte::classify(n), Some(Byte::Cont)); 257 | } 258 | for n in 0xC0..0xE0 { 259 | assert_eq!(Byte::classify(n), Some(Byte::Start(2))); 260 | } 261 | for n in 0xE0..0xF0 { 262 | assert_eq!(Byte::classify(n), Some(Byte::Start(3))); 263 | } 264 | for n in 0xF0..0xF8 { 265 | assert_eq!(Byte::classify(n), Some(Byte::Start(4))); 266 | } 267 | for n in 0xF8..0xFF { 268 | assert_eq!(Byte::classify(n), None); 269 | } 270 | assert_eq!(Byte::classify(0xFF), None); 271 | } 272 | 273 | #[test] 274 | fn test_all_cont() { 275 | assert!(all_cont(b"")); 276 | assert!(all_cont(b"\x80")); 277 | assert!(all_cont(b"\xBF")); 278 | assert!(all_cont(b"\x80\xBF\x80\xBF")); 279 | 280 | assert!(!all_cont(b"z")); 281 | assert!(!all_cont(b"\xC0\xBF")); 282 | assert!(!all_cont(b"\xFF")); 283 | assert!(!all_cont(b"\x80\xBFz\x80\xBF")); 284 | assert!(!all_cont(b"\x80\xBF\xC0\x80\xBF")); 285 | assert!(!all_cont(b"\x80\xBF\xFF\x80\xBF")); 286 | assert!(!all_cont(b"\x80\xBF\x80\xBFz")); 287 | assert!(!all_cont(b"\x80\xBF\x80\xBF\xC0")); 288 | assert!(!all_cont(b"z\x80\xBF\x80\xBF")); 289 | assert!(!all_cont(b"\xC0\x80\xBF\x80\xBF")); 290 | } 291 | 292 | #[test] 293 | fn test_decode() { 294 | unsafe { 295 | assert_eq!(Some(Meaning::Whole('ő')), decode(b"\xC5\x91")); 296 | assert_eq!(Some(Meaning::Whole('\u{a66e}')), decode(b"\xEA\x99\xAE")); 297 | assert_eq!( 298 | Some(Meaning::Whole('\u{1f4a9}')), 299 | decode(b"\xF0\x9F\x92\xA9") 300 | ); 301 | assert_eq!( 302 | Some(Meaning::Whole('\u{10ffff}')), 303 | decode(b"\xF4\x8F\xBF\xBF") 304 | ); 305 | 306 | assert_eq!( 307 | Some(Meaning::LeadSurrogate(0x0000)), 308 | decode(b"\xED\xA0\x80") 309 | ); 310 | assert_eq!( 311 | Some(Meaning::LeadSurrogate(0x0001)), 312 | decode(b"\xED\xA0\x81") 313 | ); 314 | assert_eq!( 315 | Some(Meaning::LeadSurrogate(0x03FE)), 316 | decode(b"\xED\xAF\xBE") 317 | ); 318 | assert_eq!( 319 | Some(Meaning::LeadSurrogate(0x03FF)), 320 | decode(b"\xED\xAF\xBF") 321 | ); 322 | 323 | assert_eq!( 324 | Some(Meaning::TrailSurrogate(0x0000)), 325 | decode(b"\xED\xB0\x80") 326 | ); 327 | assert_eq!( 328 | Some(Meaning::TrailSurrogate(0x0001)), 329 | decode(b"\xED\xB0\x81") 330 | ); 331 | assert_eq!( 332 | Some(Meaning::TrailSurrogate(0x03FE)), 333 | decode(b"\xED\xBF\xBE") 334 | ); 335 | assert_eq!( 336 | Some(Meaning::TrailSurrogate(0x03FF)), 337 | decode(b"\xED\xBF\xBF") 338 | ); 339 | 340 | // The last 4-byte UTF-8 sequence. This would be U+1FFFFF, which is out of 341 | // range. 
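            // (0xF7 & 0b111) << 18 | 0x3F << 12 | 0x3F << 6 | 0x3F == 0x1F_FFFF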
342 | assert_eq!(None, decode(b"\xF7\xBF\xBF\xBF")); 343 | 344 | // First otherwise-valid sequence (would be U+110000) that is out of range 345 | assert_eq!(None, decode(b"\xF4\x90\x80\x80")); 346 | 347 | // Overlong sequences 348 | assert_eq!(None, decode(b"\xC0\x80")); 349 | assert_eq!(None, decode(b"\xC1\xBF")); 350 | assert_eq!(None, decode(b"\xE0\x80\x80")); 351 | assert_eq!(None, decode(b"\xE0\x9F\xBF")); 352 | assert_eq!(None, decode(b"\xF0\x80\x80\x80")); 353 | assert_eq!(None, decode(b"\xF0\x8F\xBF\xBF")); 354 | 355 | // For not-overlong sequence for each sequence length 356 | assert_eq!(Some(Meaning::Whole('\u{80}')), decode(b"\xC2\x80")); 357 | assert_eq!(Some(Meaning::Whole('\u{800}')), decode(b"\xE0\xA0\x80")); 358 | assert_eq!( 359 | Some(Meaning::Whole('\u{10000}')), 360 | decode(b"\xF0\x90\x80\x80") 361 | ); 362 | } 363 | } 364 | 365 | static JUNK: &'static [u8] = b"\ 366 | \xf8\x0d\x07\x25\xa6\x7b\x95\xeb\x47\x01\x7f\xee\ 367 | \x3b\x00\x60\x57\x1d\x9e\x5d\x0a\x0b\x0a\x7c\x75\ 368 | \x13\xa1\x82\x46\x27\x34\xe9\x52\x61\x0d\xec\x10\ 369 | \x54\x49\x6e\x54\xdf\x7b\xe1\x31\x8c\x06\x21\x83\ 370 | \x0f\xb5\x1f\x4c\x6a\x71\x52\x42\x74\xe7\x7b\x50\ 371 | \x59\x1f\x6a\xd4\xff\x06\x92\x33\xc4\x34\x97\xff\ 372 | \xcc\xb5\xc4\x00\x7b\xc3\x4a\x7f\x7e\x63\x96\x58\ 373 | \x51\x63\x21\x54\x53\x2f\x03\x8a\x7d\x41\x79\x98\ 374 | \x5b\xcb\xb8\x94\x6b\x73\xf3\x0c\x5a\xd7\xc4\x12\ 375 | \x7a\x2b\x9a\x2e\x67\x62\x2a\x00\x45\x2c\xfe\x7d\ 376 | \x8d\xd6\x51\x4e\x59\x36\x72\x1b\xae\xaa\x06\xe8\ 377 | \x71\x1b\x85\xd3\x35\xb5\xbe\x9e\x16\x96\x72\xd8\ 378 | \x1a\x48\xba\x4d\x55\x4f\x1b\xa2\x77\xfa\x8f\x71\ 379 | \x58\x7d\x03\x93\xa2\x3a\x76\x51\xda\x48\xe2\x3f\ 380 | \xeb\x8d\xda\x89\xae\xf7\xbd\x3d\xb6\x37\x97\xca\ 381 | \x99\xcc\x4a\x8d\x62\x89\x97\xe3\xc0\xd1\x8d\xc1\ 382 | \x26\x11\xbb\x8d\x53\x61\x4f\x76\x03\x00\x30\xd3\ 383 | \x5f\x86\x19\x52\x9c\x3e\x99\x8c\xb7\x21\x48\x1c\ 384 | \x85\xae\xad\xd5\x74\x00\x6c\x3e\xd0\x17\xff\x76\ 385 | \x5c\x32\xc3\xfb\x24\x99\xd4\x4c\xa4\x1f\x66\x46\ 386 | \xe7\x2d\x44\x56\x7d\x14\xd9\x76\x91\x37\x2f\xb7\ 387 | \xcc\x1b\xd3\xc2"; 388 | 389 | #[test] 390 | fn classify_whole() { 391 | assert_eq!(JUNK.len(), 256); 392 | 393 | for &c in &[ 394 | '\0', 395 | '\x01', 396 | 'o', 397 | 'z', 398 | 'ő', 399 | '\u{2764}', 400 | '\u{a66e}', 401 | '\u{1f4a9}', 402 | '\u{1f685}', 403 | ] { 404 | for idx in 0..JUNK.len() - 3 { 405 | let mut buf = JUNK.to_owned(); 406 | let ch = format!("{}", c).into_bytes(); 407 | (&mut buf[idx..]).write_all(&ch).unwrap(); 408 | 409 | for j in 0..ch.len() { 410 | let class = classify(&buf, idx + j).unwrap(); 411 | assert_eq!(class.bytes, &*ch); 412 | assert_eq!(class.rewind, j); 413 | assert_eq!(class.meaning, Meaning::Whole(c)); 414 | } 415 | } 416 | } 417 | } 418 | 419 | #[test] 420 | fn classify_surrogates() { 421 | for &(s, b) in &[ 422 | (Meaning::LeadSurrogate(0x0000), b"\xED\xA0\x80"), 423 | (Meaning::LeadSurrogate(0x0001), b"\xED\xA0\x81"), 424 | (Meaning::LeadSurrogate(0x03FE), b"\xED\xAF\xBE"), 425 | (Meaning::LeadSurrogate(0x03FF), b"\xED\xAF\xBF"), 426 | (Meaning::TrailSurrogate(0x0000), b"\xED\xB0\x80"), 427 | (Meaning::TrailSurrogate(0x0001), b"\xED\xB0\x81"), 428 | (Meaning::TrailSurrogate(0x03FE), b"\xED\xBF\xBE"), 429 | (Meaning::TrailSurrogate(0x03FF), b"\xED\xBF\xBF"), 430 | ] { 431 | for idx in 0..JUNK.len() - 2 { 432 | let mut buf = JUNK.to_owned(); 433 | (&mut buf[idx..]).write_all(b).unwrap(); 434 | 435 | let class = classify(&buf, idx).unwrap(); 436 | assert_eq!(class.bytes, b); 437 | assert_eq!(class.rewind, 0); 
438 | assert_eq!(class.meaning, s); 439 | } 440 | } 441 | } 442 | 443 | #[test] 444 | fn classify_prefix_suffix() { 445 | for &c in &['ő', '\u{a66e}', '\u{1f4a9}'] { 446 | let ch = format!("{}", c).into_bytes(); 447 | for pfx in 1..ch.len() - 1 { 448 | let mut buf = JUNK.to_owned(); 449 | let buflen = buf.len(); 450 | (&mut buf[buflen - pfx..buflen]) 451 | .write_all(&ch[..pfx]) 452 | .unwrap(); 453 | for j in 0..pfx { 454 | let idx = buflen - 1 - j; 455 | let class = classify(&buf, idx).unwrap(); 456 | assert_eq!(class.bytes, &ch[..pfx]); 457 | assert_eq!(class.rewind, pfx - 1 - j); 458 | assert_eq!(class.meaning, Meaning::Prefix(ch.len() - pfx)); 459 | } 460 | } 461 | for sfx in 1..ch.len() - 1 { 462 | let ch_bytes = &ch[ch.len() - sfx..]; 463 | let mut buf = JUNK.to_owned(); 464 | (&mut *buf).write_all(ch_bytes).unwrap(); 465 | for j in 0..sfx { 466 | let class = classify(&buf, j).unwrap(); 467 | assert!(ch_bytes.starts_with(class.bytes)); 468 | assert_eq!(class.rewind, j); 469 | assert_eq!(class.meaning, Meaning::Suffix); 470 | } 471 | } 472 | } 473 | } 474 | 475 | #[test] 476 | fn out_of_bounds() { 477 | assert!(classify(b"", 0).is_none()); 478 | assert!(classify(b"", 7).is_none()); 479 | assert!(classify(b"aaaaaaa", 7).is_none()); 480 | } 481 | 482 | #[test] 483 | fn malformed() { 484 | assert_eq!(None, classify(b"\xFF", 0)); 485 | assert_eq!(None, classify(b"\xC5\xC5", 0)); 486 | assert_eq!(None, classify(b"x\x91", 1)); 487 | assert_eq!(None, classify(b"\x91\x91\x91\x91", 3)); 488 | assert_eq!(None, classify(b"\x91\x91\x91\x91\x91", 4)); 489 | assert_eq!(None, classify(b"\xEA\x91\xFF", 1)); 490 | assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 0)); 491 | assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 1)); 492 | assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 2)); 493 | 494 | for i in 0..4 { 495 | // out of range: U+110000 496 | assert_eq!(None, classify(b"\xF4\x90\x80\x80", i)); 497 | 498 | // out of range: U+1FFFFF 499 | assert_eq!(None, classify(b"\xF7\xBF\xBF\xBF", i)); 500 | 501 | // Overlong sequences 502 | assert_eq!(None, classify(b"\xC0\x80", i)); 503 | assert_eq!(None, classify(b"\xC1\xBF", i)); 504 | assert_eq!(None, classify(b"\xE0\x80\x80", i)); 505 | assert_eq!(None, classify(b"\xE0\x9F\xBF", i)); 506 | assert_eq!(None, classify(b"\xF0\x80\x80\x80", i)); 507 | assert_eq!(None, classify(b"\xF0\x8F\xBF\xBF", i)); 508 | } 509 | } 510 | 511 | static TEXT: &'static str = " 512 | All human beings are born free and equal in dignity and rights. 513 | They are endowed with reason and conscience and should act 514 | towards one another in a spirit of brotherhood. 515 | 516 | Minden emberi lény szabadon születik és egyenlő méltósága és 517 | joga van. Az emberek, ésszel és lelkiismerettel bírván, 518 | egymással szemben testvéri szellemben kell hogy viseltessenek. 519 | 520 | เราทุกคนเกิดมาอย่างอิสระ เราทุกคนมีความคิดและความเข้าใจเป็นของเราเอง 521 | เราทุกคนควรได้รับการปฏิบัติในทางเดียวกัน. 522 | 523 | 모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 524 | 동등하다. 인간은 천부적으로 이성과 양심을 부여받았으며 서로 525 | 형제애의 정신으로 행동하여야 한다. 526 | 527 | ro remna cu se jinzi co zifre je simdu'i be le ry. nilselsi'a 528 | .e lei ry. selcru .i ry. se menli gi'e se sezmarde .i .ei 529 | jeseki'ubo ry. simyzu'e ta'i le tunba 530 | 531 | ᏂᎦᏓ ᎠᏂᏴᏫ ᏂᎨᎫᏓᎸᎾ ᎠᎴ ᎤᏂᏠᏱ ᎤᎾᏕᎿ ᏚᏳᎧᏛ ᎨᏒᎢ. 
ᎨᏥᏁᎳ ᎤᎾᏓᏅᏖᏗ ᎠᎴ ᎤᏃᏟᏍᏗ 532 | ᎠᎴ ᏌᏊ ᎨᏒ ᏧᏂᎸᏫᏍᏓᏁᏗ ᎠᎾᏟᏅᏢ ᎠᏓᏅᏙ ᎬᏗ."; 533 | 534 | // random 535 | static IXES: &'static [usize] = &[ 536 | 778, 156, 87, 604, 1216, 365, 884, 311, 469, 515, 709, 162, 871, 206, 634, 442, 537 | ]; 538 | 539 | static BOUNDARY: &'static [bool] = &[ 540 | false, true, true, false, false, true, true, true, true, false, false, true, true, true, 541 | false, false, 542 | ]; 543 | 544 | #[bench] 545 | fn std_utf8_check(b: &mut Bencher) { 546 | b.iter(|| { 547 | assert!(IXES 548 | .iter() 549 | .zip(BOUNDARY.iter()) 550 | .all(|(&ix, &expect)| { expect == TEXT.is_char_boundary(ix) })); 551 | }); 552 | } 553 | 554 | // We don't expect to be as fast as is_char_boundary, because we provide more 555 | // information. But we shouldn't be tremendously slower, either. A factor of 556 | // 5-10 is expected on this text. 557 | #[bench] 558 | fn futf_check(b: &mut Bencher) { 559 | b.iter(|| { 560 | assert!(IXES.iter().zip(BOUNDARY.iter()).all(|(&ix, &expect)| { 561 | expect == (classify(TEXT.as_bytes(), ix).unwrap().rewind == 0) 562 | })); 563 | }); 564 | } 565 | } 566 | -------------------------------------------------------------------------------- /src/stream.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 6 | 7 | //! Streams of tendrils. 8 | 9 | use fmt; 10 | use tendril::{Atomicity, NonAtomic, Tendril}; 11 | 12 | use std::borrow::Cow; 13 | use std::fs::File; 14 | use std::io; 15 | use std::marker::PhantomData; 16 | use std::path::Path; 17 | 18 | #[cfg(feature = "encoding")] 19 | use encoding; 20 | #[cfg(feature = "encoding_rs")] 21 | use encoding_rs::{self, DecoderResult}; 22 | use utf8; 23 | 24 | /// Trait for types that can process a tendril. 25 | /// 26 | /// This is a "push" interface, unlike the "pull" interface of 27 | /// `Iterator>`. The push interface matches 28 | /// [html5ever][] and other incremental parsers with a similar 29 | /// architecture. 30 | /// 31 | /// [html5ever]: https://github.com/servo/html5ever 32 | pub trait TendrilSink 33 | where 34 | F: fmt::Format, 35 | A: Atomicity, 36 | { 37 | /// Process this tendril. 38 | fn process(&mut self, t: Tendril); 39 | 40 | /// Indicates that an error has occurred. 41 | fn error(&mut self, desc: Cow<'static, str>); 42 | 43 | /// What the overall result of processing is. 44 | type Output; 45 | 46 | /// Indicates the end of the stream. 47 | fn finish(self) -> Self::Output; 48 | 49 | /// Process one tendril and finish. 50 | fn one(mut self, t: T) -> Self::Output 51 | where 52 | Self: Sized, 53 | T: Into>, 54 | { 55 | self.process(t.into()); 56 | self.finish() 57 | } 58 | 59 | /// Consume an iterator of tendrils, processing each item, then finish. 60 | fn from_iter(mut self, i: I) -> Self::Output 61 | where 62 | Self: Sized, 63 | I: IntoIterator, 64 | I::Item: Into>, 65 | { 66 | for t in i { 67 | self.process(t.into()) 68 | } 69 | self.finish() 70 | } 71 | 72 | /// Read from the given stream of bytes until exhaustion and process incrementally, 73 | /// then finish. Return `Err` at the first I/O error. 
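    ///
    /// Illustrative sketch (not part of the original docs); `MySink` stands
    /// in for any user-defined type implementing `TendrilSink<fmt::Bytes>`:
    ///
    /// ```ignore
    /// let mut file = std::fs::File::open("data.bin")?;
    /// let output = MySink::new().read_from(&mut file)?;
    /// ```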
74 | fn read_from(mut self, r: &mut R) -> io::Result 75 | where 76 | Self: Sized, 77 | R: io::Read, 78 | F: fmt::SliceFormat, 79 | { 80 | const BUFFER_SIZE: u32 = 4 * 1024; 81 | loop { 82 | let mut tendril = Tendril::::new(); 83 | // FIXME: this exposes uninitialized bytes to a generic R type 84 | // this is fine for R=File which never reads these bytes, 85 | // but user-defined types might. 86 | // The standard library pushes zeros to `Vec` for that reason. 87 | unsafe { 88 | tendril.push_uninitialized(BUFFER_SIZE); 89 | } 90 | loop { 91 | match r.read(&mut tendril) { 92 | Ok(0) => return Ok(self.finish()), 93 | Ok(n) => { 94 | tendril.pop_back(BUFFER_SIZE - n as u32); 95 | self.process(tendril); 96 | break; 97 | } 98 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 99 | Err(e) => return Err(e), 100 | } 101 | } 102 | } 103 | } 104 | 105 | /// Read from the file at the given path and process incrementally, 106 | /// then finish. Return `Err` at the first I/O error. 107 | fn from_file

(self, path: P) -> io::Result 108 | where 109 | Self: Sized, 110 | P: AsRef, 111 | F: fmt::SliceFormat, 112 | { 113 | self.read_from(&mut File::open(path)?) 114 | } 115 | } 116 | 117 | /// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, 118 | /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, 119 | /// and emits Unicode (`StrTendril`). 120 | /// 121 | /// This does not allocate memory: the output is either subtendrils on the input, 122 | /// on inline tendrils for a single code point. 123 | pub struct Utf8LossyDecoder 124 | where 125 | Sink: TendrilSink, 126 | A: Atomicity, 127 | { 128 | pub inner_sink: Sink, 129 | incomplete: Option, 130 | marker: PhantomData, 131 | } 132 | 133 | impl Utf8LossyDecoder 134 | where 135 | Sink: TendrilSink, 136 | A: Atomicity, 137 | { 138 | /// Create a new incremental UTF-8 decoder. 139 | #[inline] 140 | pub fn new(inner_sink: Sink) -> Self { 141 | Utf8LossyDecoder { 142 | inner_sink: inner_sink, 143 | incomplete: None, 144 | marker: PhantomData, 145 | } 146 | } 147 | } 148 | 149 | impl TendrilSink for Utf8LossyDecoder 150 | where 151 | Sink: TendrilSink, 152 | A: Atomicity, 153 | { 154 | #[inline] 155 | fn process(&mut self, mut t: Tendril) { 156 | // FIXME: remove take() and map() when non-lexical borrows are stable. 157 | if let Some(mut incomplete) = self.incomplete.take() { 158 | let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { 159 | match result { 160 | Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), 161 | Err(_) => { 162 | self.inner_sink.error("invalid byte sequence".into()); 163 | self.inner_sink 164 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); 165 | } 166 | } 167 | t.len() - rest.len() 168 | }); 169 | match resume_at { 170 | None => { 171 | self.incomplete = Some(incomplete); 172 | return; 173 | } 174 | Some(resume_at) => t.pop_front(resume_at as u32), 175 | } 176 | } 177 | while !t.is_empty() { 178 | let unborrowed_result = match utf8::decode(&t) { 179 | Ok(s) => { 180 | debug_assert!(s.as_ptr() == t.as_ptr()); 181 | debug_assert!(s.len() == t.len()); 182 | Ok(()) 183 | } 184 | Err(utf8::DecodeError::Invalid { 185 | valid_prefix, 186 | invalid_sequence, 187 | .. 
188 | }) => { 189 | debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); 190 | debug_assert!(valid_prefix.len() <= t.len()); 191 | Err(( 192 | valid_prefix.len(), 193 | Err(valid_prefix.len() + invalid_sequence.len()), 194 | )) 195 | } 196 | Err(utf8::DecodeError::Incomplete { 197 | valid_prefix, 198 | incomplete_suffix, 199 | }) => { 200 | debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); 201 | debug_assert!(valid_prefix.len() <= t.len()); 202 | Err((valid_prefix.len(), Ok(incomplete_suffix))) 203 | } 204 | }; 205 | match unborrowed_result { 206 | Ok(()) => { 207 | unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } 208 | return; 209 | } 210 | Err((valid_len, and_then)) => { 211 | if valid_len > 0 { 212 | let subtendril = t.subtendril(0, valid_len as u32); 213 | unsafe { 214 | self.inner_sink 215 | .process(subtendril.reinterpret_without_validating()) 216 | } 217 | } 218 | match and_then { 219 | Ok(incomplete) => { 220 | self.incomplete = Some(incomplete); 221 | return; 222 | } 223 | Err(offset) => { 224 | self.inner_sink.error("invalid byte sequence".into()); 225 | self.inner_sink 226 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); 227 | t.pop_front(offset as u32); 228 | } 229 | } 230 | } 231 | } 232 | } 233 | } 234 | 235 | #[inline] 236 | fn error(&mut self, desc: Cow<'static, str>) { 237 | self.inner_sink.error(desc); 238 | } 239 | 240 | type Output = Sink::Output; 241 | 242 | #[inline] 243 | fn finish(mut self) -> Sink::Output { 244 | if self.incomplete.is_some() { 245 | self.inner_sink 246 | .error("incomplete byte sequence at end of stream".into()); 247 | self.inner_sink 248 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); 249 | } 250 | self.inner_sink.finish() 251 | } 252 | } 253 | 254 | /// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding, 255 | /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, 256 | /// and emits Unicode (`StrTendril`). 257 | /// 258 | /// This allocates new tendrils for encodings other than UTF-8. 259 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 260 | pub struct LossyDecoder 261 | where 262 | Sink: TendrilSink, 263 | A: Atomicity, 264 | { 265 | inner: LossyDecoderInner, 266 | } 267 | 268 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 269 | enum LossyDecoderInner 270 | where 271 | Sink: TendrilSink, 272 | A: Atomicity, 273 | { 274 | Utf8(Utf8LossyDecoder), 275 | #[cfg(feature = "encoding")] 276 | Encoding(Box, Sink), 277 | #[cfg(feature = "encoding_rs")] 278 | EncodingRs(encoding_rs::Decoder, Sink), 279 | } 280 | 281 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 282 | impl LossyDecoder 283 | where 284 | Sink: TendrilSink, 285 | A: Atomicity, 286 | { 287 | /// Create a new incremental decoder using the encoding crate. 288 | #[cfg(feature = "encoding")] 289 | #[inline] 290 | pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self { 291 | if encoding.name() == "utf-8" { 292 | LossyDecoder::utf8(sink) 293 | } else { 294 | LossyDecoder { 295 | inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink), 296 | } 297 | } 298 | } 299 | 300 | /// Create a new incremental decoder using the encoding_rs crate. 
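    ///
    /// Illustrative sketch (not part of the original docs); `label` and
    /// `sink` are assumed to exist in the caller:
    ///
    /// ```ignore
    /// let encoding = encoding_rs::Encoding::for_label(label.as_bytes())
    ///     .unwrap_or(encoding_rs::UTF_8);
    /// let decoder = LossyDecoder::new_encoding_rs(encoding, sink);
    /// ```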
301 | #[cfg(feature = "encoding_rs")] 302 | #[inline] 303 | pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self { 304 | if encoding == encoding_rs::UTF_8 { 305 | return Self::utf8(sink); 306 | } 307 | Self { 308 | inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink), 309 | } 310 | } 311 | 312 | /// Create a new incremental decoder for the UTF-8 encoding. 313 | /// 314 | /// This is useful for content that is known at run-time to be UTF-8 315 | /// (whereas `Utf8LossyDecoder` requires knowning at compile-time.) 316 | #[inline] 317 | pub fn utf8(sink: Sink) -> LossyDecoder { 318 | LossyDecoder { 319 | inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)), 320 | } 321 | } 322 | 323 | /// Give a reference to the inner sink. 324 | pub fn inner_sink(&self) -> &Sink { 325 | match self.inner { 326 | LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink, 327 | #[cfg(feature = "encoding")] 328 | LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink, 329 | #[cfg(feature = "encoding_rs")] 330 | LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink, 331 | } 332 | } 333 | 334 | /// Give a mutable reference to the inner sink. 335 | pub fn inner_sink_mut(&mut self) -> &mut Sink { 336 | match self.inner { 337 | LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink, 338 | #[cfg(feature = "encoding")] 339 | LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink, 340 | #[cfg(feature = "encoding_rs")] 341 | LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink, 342 | } 343 | } 344 | } 345 | 346 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 347 | impl TendrilSink for LossyDecoder 348 | where 349 | Sink: TendrilSink, 350 | A: Atomicity, 351 | { 352 | #[inline] 353 | fn process(&mut self, t: Tendril) { 354 | match self.inner { 355 | LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t), 356 | #[cfg(feature = "encoding")] 357 | LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => { 358 | let mut out = Tendril::new(); 359 | let mut t = t; 360 | loop { 361 | match decoder.raw_feed(&*t, &mut out) { 362 | (_, Some(err)) => { 363 | out.push_char('\u{fffd}'); 364 | sink.error(err.cause); 365 | debug_assert!(err.upto >= 0); 366 | t.pop_front(err.upto as u32); 367 | // continue loop and process remainder of t 368 | } 369 | (_, None) => break, 370 | } 371 | } 372 | if out.len() > 0 { 373 | sink.process(out); 374 | } 375 | } 376 | #[cfg(feature = "encoding_rs")] 377 | LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => { 378 | if t.is_empty() { 379 | return; 380 | } 381 | decode_to_sink(t, decoder, sink, false); 382 | } 383 | } 384 | } 385 | 386 | #[inline] 387 | fn error(&mut self, desc: Cow<'static, str>) { 388 | match self.inner { 389 | LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc), 390 | #[cfg(feature = "encoding")] 391 | LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc), 392 | #[cfg(feature = "encoding_rs")] 393 | LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc), 394 | } 395 | } 396 | 397 | type Output = Sink::Output; 398 | 399 | #[inline] 400 | fn finish(self) -> Sink::Output { 401 | match self.inner { 402 | LossyDecoderInner::Utf8(utf8) => return utf8.finish(), 403 | #[cfg(feature = "encoding")] 404 | LossyDecoderInner::Encoding(mut decoder, mut sink) => { 405 | let mut out = Tendril::new(); 406 | if let Some(err) = decoder.raw_finish(&mut out) { 407 | out.push_char('\u{fffd}'); 408 | sink.error(err.cause); 409 | } 410 | if 
out.len() > 0 { 411 | sink.process(out); 412 | } 413 | sink.finish() 414 | } 415 | #[cfg(feature = "encoding_rs")] 416 | LossyDecoderInner::EncodingRs(mut decoder, mut sink) => { 417 | decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true); 418 | sink.finish() 419 | } 420 | } 421 | } 422 | } 423 | 424 | #[cfg(feature = "encoding_rs")] 425 | fn decode_to_sink( 426 | mut t: Tendril, 427 | decoder: &mut encoding_rs::Decoder, 428 | sink: &mut Sink, 429 | last: bool, 430 | ) where 431 | Sink: TendrilSink, 432 | A: Atomicity, 433 | { 434 | loop { 435 | let mut out = >::new(); 436 | let max_len = decoder 437 | .max_utf8_buffer_length_without_replacement(t.len()) 438 | .unwrap_or(8192); 439 | unsafe { 440 | out.push_uninitialized(std::cmp::min(max_len as u32, 8192)); 441 | } 442 | let (result, bytes_read, bytes_written) = 443 | decoder.decode_to_utf8_without_replacement(&t, &mut out, last); 444 | if bytes_written > 0 { 445 | sink.process(unsafe { 446 | out.subtendril(0, bytes_written as u32) 447 | .reinterpret_without_validating() 448 | }); 449 | } 450 | match result { 451 | DecoderResult::InputEmpty => return, 452 | DecoderResult::OutputFull => {} 453 | DecoderResult::Malformed(_, _) => { 454 | sink.error(Cow::Borrowed("invalid sequence")); 455 | sink.process("\u{FFFD}".into()); 456 | } 457 | } 458 | t.pop_front(bytes_read as u32); 459 | if t.is_empty() { 460 | return; 461 | } 462 | } 463 | } 464 | 465 | #[cfg(test)] 466 | mod test { 467 | use super::{TendrilSink, Utf8LossyDecoder}; 468 | use fmt; 469 | use std::borrow::Cow; 470 | use tendril::{Atomicity, NonAtomic, Tendril}; 471 | 472 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 473 | use super::LossyDecoder; 474 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 475 | use tendril::SliceExt; 476 | 477 | #[cfg(feature = "encoding")] 478 | use encoding::all as enc; 479 | #[cfg(feature = "encoding_rs")] 480 | use encoding_rs as enc_rs; 481 | 482 | struct Accumulate 483 | where 484 | A: Atomicity, 485 | { 486 | tendrils: Vec>, 487 | errors: Vec, 488 | } 489 | 490 | impl Accumulate 491 | where 492 | A: Atomicity, 493 | { 494 | fn new() -> Accumulate { 495 | Accumulate { 496 | tendrils: vec![], 497 | errors: vec![], 498 | } 499 | } 500 | } 501 | 502 | impl TendrilSink for Accumulate 503 | where 504 | A: Atomicity, 505 | { 506 | fn process(&mut self, t: Tendril) { 507 | self.tendrils.push(t); 508 | } 509 | 510 | fn error(&mut self, desc: Cow<'static, str>) { 511 | self.errors.push(desc.into_owned()); 512 | } 513 | 514 | type Output = (Vec>, Vec); 515 | 516 | fn finish(self) -> Self::Output { 517 | (self.tendrils, self.errors) 518 | } 519 | } 520 | 521 | fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) { 522 | let decoder = Utf8LossyDecoder::new(Accumulate::::new()); 523 | let (tendrils, errors) = decoder.from_iter(input.iter().cloned()); 524 | assert_eq!( 525 | expected, 526 | &*tendrils.iter().map(|t| &**t).collect::>() 527 | ); 528 | assert_eq!(errs, errors.len()); 529 | } 530 | 531 | #[test] 532 | fn utf8() { 533 | check_utf8(&[], &[], 0); 534 | check_utf8(&[b""], &[], 0); 535 | check_utf8(&[b"xyz"], &["xyz"], 0); 536 | check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0); 537 | 538 | check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0); 539 | check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); 540 | check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0); 541 | check_utf8( 542 | &[b"xy\xEA", b"\x99", b"\xAEzw"], 543 | &["xy", "\u{a66e}z", "w"], 544 | 0, 545 | ); 546 
| check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0); 547 | check_utf8( 548 | &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], 549 | &["\u{a66e}"], 550 | 0, 551 | ); 552 | 553 | check_utf8( 554 | &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], 555 | &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], 556 | 4, 557 | ); 558 | check_utf8( 559 | &[b"xy\xEA\x99", b"\xFFz"], 560 | &["xy", "\u{fffd}", "\u{fffd}", "z"], 561 | 2, 562 | ); 563 | 564 | check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0); 565 | check_utf8( 566 | &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], 567 | &["ő", "ő", "ő"], 568 | 0, 569 | ); 570 | check_utf8( 571 | &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"], 572 | &["ő", "ő", "ő"], 573 | 0, 574 | ); 575 | check_utf8( 576 | &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"], 577 | &["ő", "\u{fffd}", "\u{fffd}", "ő"], 578 | 2, 579 | ); 580 | 581 | // incomplete char at end of input 582 | check_utf8(&[b"\xC0"], &["\u{fffd}"], 1); 583 | check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1); 584 | } 585 | 586 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 587 | fn check_decode( 588 | mut decoder: LossyDecoder>, 589 | input: &[&[u8]], 590 | expected: &str, 591 | errs: usize, 592 | ) { 593 | for x in input { 594 | decoder.process(x.to_tendril()); 595 | } 596 | let (tendrils, errors) = decoder.finish(); 597 | let mut tendril: Tendril = Tendril::new(); 598 | for t in tendrils { 599 | tendril.push_tendril(&t); 600 | } 601 | assert_eq!(expected, &*tendril); 602 | assert_eq!(errs, errors.len()); 603 | } 604 | 605 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 606 | pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)]; 607 | 608 | #[cfg(any(feature = "encoding"))] 609 | const ASCII: Tests = &[ 610 | (&[], "", 0), 611 | (&[b""], "", 0), 612 | (&[b"xyz"], "xyz", 0), 613 | (&[b"xy", b"", b"", b"z"], "xyz", 0), 614 | (&[b"x", b"y", b"z"], "xyz", 0), 615 | (&[b"\xFF"], "\u{fffd}", 1), 616 | (&[b"x\xC0yz"], "x\u{fffd}yz", 1), 617 | (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1), 618 | (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3), 619 | ]; 620 | 621 | #[cfg(feature = "encoding")] 622 | #[test] 623 | fn decode_ascii() { 624 | for &(input, expected, errs) in ASCII { 625 | let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new()); 626 | check_decode(decoder, input, expected, errs); 627 | } 628 | } 629 | 630 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 631 | const UTF_8: Tests = &[ 632 | (&[], "", 0), 633 | (&[b""], "", 0), 634 | (&[b"xyz"], "xyz", 0), 635 | (&[b"x", b"y", b"z"], "xyz", 0), 636 | (&[b"\xEA\x99\xAE"], "\u{a66e}", 0), 637 | (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0), 638 | (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0), 639 | (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0), 640 | (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0), 641 | ( 642 | &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], 643 | "\u{a66e}", 644 | 0, 645 | ), 646 | (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0), 647 | ( 648 | &[b"xy\xEA", b"\xFF", b"\x99\xAEz"], 649 | "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", 650 | 4, 651 | ), 652 | (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2), 653 | // incomplete char at end of input 654 | (&[b"\xC0"], "\u{fffd}", 1), 655 | (&[b"\xEA\x99"], "\u{fffd}", 1), 656 | ]; 657 | 658 | #[cfg(feature = "encoding")] 659 | #[test] 660 | fn decode_utf8() { 661 | for &(input, expected, errs) in UTF_8 { 662 | let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new()); 663 | check_decode(decoder, input, 
expected, errs); 664 | } 665 | } 666 | 667 | #[cfg(feature = "encoding_rs")] 668 | #[test] 669 | fn decode_utf8_encoding_rs() { 670 | for &(input, expected, errs) in UTF_8 { 671 | let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new()); 672 | check_decode(decoder, input, expected, errs); 673 | } 674 | } 675 | 676 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 677 | const KOI8_U: Tests = &[ 678 | (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), 679 | (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0), 680 | (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0), 681 | ( 682 | &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], 683 | "Энергия", 684 | 0, 685 | ), 686 | ]; 687 | 688 | #[cfg(feature = "encoding")] 689 | #[test] 690 | fn decode_koi8_u() { 691 | for &(input, expected, errs) in KOI8_U { 692 | let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new()); 693 | check_decode(decoder, input, expected, errs); 694 | } 695 | } 696 | 697 | #[cfg(feature = "encoding_rs")] 698 | #[test] 699 | fn decode_koi8_u_encoding_rs() { 700 | for &(input, expected, errs) in KOI8_U { 701 | let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new()); 702 | check_decode(decoder, input, expected, errs); 703 | } 704 | } 705 | 706 | #[cfg(any(feature = "encoding", feature = "encoding_rs"))] 707 | const WINDOWS_949: Tests = &[ 708 | (&[], "", 0), 709 | (&[b""], "", 0), 710 | (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0), 711 | (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0), 712 | (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0), 713 | ( 714 | &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], 715 | "안녕하세요", 716 | 0, 717 | ), 718 | (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1), 719 | (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1), 720 | (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1), 721 | ]; 722 | 723 | #[cfg(feature = "encoding")] 724 | #[test] 725 | fn decode_windows_949() { 726 | for &(input, expected, errs) in WINDOWS_949 { 727 | let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new()); 728 | check_decode(decoder, input, expected, errs); 729 | } 730 | } 731 | 732 | #[cfg(feature = "encoding_rs")] 733 | #[test] 734 | fn decode_windows_949_encoding_rs() { 735 | for &(input, expected, errs) in WINDOWS_949 { 736 | let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new()); 737 | check_decode(decoder, input, expected, errs); 738 | } 739 | } 740 | 741 | #[test] 742 | fn read_from() { 743 | let decoder = Utf8LossyDecoder::new(Accumulate::::new()); 744 | let mut bytes: &[u8] = b"foo\xffbar"; 745 | let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap(); 746 | assert_eq!( 747 | &*tendrils.iter().map(|t| &**t).collect::>(), 748 | &["foo", "\u{FFFD}", "bar"] 749 | ); 750 | assert_eq!(errors, &["invalid byte sequence"]); 751 | } 752 | } 753 | -------------------------------------------------------------------------------- /src/tendril.rs: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 or the MIT license 3 | // , at your 4 | // option. This file may not be copied, modified, or distributed 5 | // except according to those terms. 
6 | 7 | use std::borrow::Borrow; 8 | use std::cell::{Cell, UnsafeCell}; 9 | use std::cmp::Ordering; 10 | use std::default::Default; 11 | use std::fmt as strfmt; 12 | use std::iter::FromIterator; 13 | use std::marker::PhantomData; 14 | use std::num::NonZeroUsize; 15 | use std::ops::{Deref, DerefMut}; 16 | use std::sync::atomic::Ordering as AtomicOrdering; 17 | use std::sync::atomic::{self, AtomicUsize}; 18 | use std::{hash, io, mem, ptr, str, u32}; 19 | 20 | #[cfg(feature = "encoding")] 21 | use encoding::{self, DecoderTrap, EncoderTrap, EncodingRef}; 22 | 23 | use buf32::{self, Buf32}; 24 | use fmt::imp::Fixup; 25 | use fmt::{self, Slice}; 26 | use util::{copy_and_advance, copy_lifetime, copy_lifetime_mut, unsafe_slice, unsafe_slice_mut}; 27 | use OFLOW; 28 | 29 | const MAX_INLINE_LEN: usize = 8; 30 | const MAX_INLINE_TAG: usize = 0xF; 31 | const EMPTY_TAG: usize = 0xF; 32 | 33 | #[inline(always)] 34 | fn inline_tag(len: u32) -> NonZeroUsize { 35 | debug_assert!(len <= MAX_INLINE_LEN as u32); 36 | unsafe { NonZeroUsize::new_unchecked(if len == 0 { EMPTY_TAG } else { len as usize }) } 37 | } 38 | 39 | /// The multithreadedness of a tendril. 40 | /// 41 | /// Exactly two types implement this trait: 42 | /// 43 | /// - `Atomic`: use this in your tendril and you will have a `Send` tendril which works 44 | /// across threads; this is akin to `Arc`. 45 | /// 46 | /// - `NonAtomic`: use this in your tendril and you will have a tendril which is neither 47 | /// `Send` nor `Sync` but should be a tad faster; this is akin to `Rc`. 48 | /// 49 | /// The layout of this trait is also mandated to be that of a `usize`, 50 | /// for it is used for reference counting. 51 | pub unsafe trait Atomicity: 'static { 52 | #[doc(hidden)] 53 | fn new() -> Self; 54 | 55 | #[doc(hidden)] 56 | fn increment(&self) -> usize; 57 | 58 | #[doc(hidden)] 59 | fn decrement(&self) -> usize; 60 | 61 | #[doc(hidden)] 62 | fn fence_acquire(); 63 | } 64 | 65 | /// A marker of a non-atomic tendril. 66 | /// 67 | /// This is the default for the second type parameter of a `Tendril` 68 | /// and so doesn't typically need to be written. 69 | /// 70 | /// This is akin to using `Rc` for reference counting. 71 | #[repr(C)] 72 | pub struct NonAtomic(Cell); 73 | 74 | unsafe impl Atomicity for NonAtomic { 75 | #[inline] 76 | fn new() -> Self { 77 | NonAtomic(Cell::new(1)) 78 | } 79 | 80 | #[inline] 81 | fn increment(&self) -> usize { 82 | let value = self.0.get(); 83 | self.0.set(value.checked_add(1).expect(OFLOW)); 84 | value 85 | } 86 | 87 | #[inline] 88 | fn decrement(&self) -> usize { 89 | let value = self.0.get(); 90 | self.0.set(value - 1); 91 | value 92 | } 93 | 94 | #[inline] 95 | fn fence_acquire() {} 96 | } 97 | 98 | /// A marker of an atomic (and hence concurrent) tendril. 99 | /// 100 | /// This is used as the second, optional type parameter of a `Tendril`; 101 | /// `Tendril` thus implements`Send`. 102 | /// 103 | /// This is akin to using `Arc` for reference counting. 104 | pub struct Atomic(AtomicUsize); 105 | 106 | unsafe impl Atomicity for Atomic { 107 | #[inline] 108 | fn new() -> Self { 109 | Atomic(AtomicUsize::new(1)) 110 | } 111 | 112 | #[inline] 113 | fn increment(&self) -> usize { 114 | // Relaxed is OK because we have a reference already. 
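        // The existing reference keeps the allocation alive for the duration
        // of this call, so no ordering with other threads is needed on the
        // increment; the Release decrement plus the Acquire fence taken
        // before destruction (`decrement` / `fence_acquire`) provide the
        // synchronization for the final drop, as in `Arc`.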
115 | self.0.fetch_add(1, AtomicOrdering::Relaxed) 116 | } 117 | 118 | #[inline] 119 | fn decrement(&self) -> usize { 120 | self.0.fetch_sub(1, AtomicOrdering::Release) 121 | } 122 | 123 | #[inline] 124 | fn fence_acquire() { 125 | atomic::fence(AtomicOrdering::Acquire); 126 | } 127 | } 128 | 129 | #[repr(C)] // Preserve field order for cross-atomicity transmutes 130 | struct Header { 131 | refcount: A, 132 | cap: u32, 133 | } 134 | 135 | impl Header 136 | where 137 | A: Atomicity, 138 | { 139 | #[inline(always)] 140 | unsafe fn new() -> Header { 141 | Header { 142 | refcount: A::new(), 143 | cap: 0, 144 | } 145 | } 146 | } 147 | 148 | /// Errors that can occur when slicing a `Tendril`. 149 | #[derive(Copy, Clone, Hash, Debug, PartialEq, Eq)] 150 | pub enum SubtendrilError { 151 | OutOfBounds, 152 | ValidationFailed, 153 | } 154 | 155 | /// Compact string type for zero-copy parsing. 156 | /// 157 | /// `Tendril`s have the semantics of owned strings, but are sometimes views 158 | /// into shared buffers. When you mutate a `Tendril`, an owned copy is made 159 | /// if necessary. Further mutations occur in-place until the string becomes 160 | /// shared, e.g. with `clone()` or `subtendril()`. 161 | /// 162 | /// Buffer sharing is accomplished through thread-local (non-atomic) reference 163 | /// counting, which has very low overhead. The Rust type system will prevent 164 | /// you at compile time from sending a `Tendril` between threads. We plan to 165 | /// relax this restriction in the future; see `README.md`. 166 | /// 167 | /// Whereas `String` allocates in the heap for any non-empty string, `Tendril` 168 | /// can store small strings (up to 8 bytes) in-line, without a heap allocation. 169 | /// `Tendril` is also smaller than `String` on 64-bit platforms — 16 bytes 170 | /// versus 24. 171 | /// 172 | /// The type parameter `F` specifies the format of the tendril, for example 173 | /// UTF-8 text or uninterpreted bytes. The parameter will be instantiated 174 | /// with one of the marker types from `tendril::fmt`. See the `StrTendril` 175 | /// and `ByteTendril` type aliases for two examples. 176 | /// 177 | /// The type parameter `A` indicates the atomicity of the tendril; it is by 178 | /// default `NonAtomic`, but can be specified as `Atomic` to get a tendril 179 | /// which implements `Send` (viz. a thread-safe tendril). 180 | /// 181 | /// The maximum length of a `Tendril` is 4 GB. The library will panic if 182 | /// you attempt to go over the limit. 183 | #[repr(C)] 184 | pub struct Tendril 185 | where 186 | F: fmt::Format, 187 | A: Atomicity, 188 | { 189 | ptr: Cell, 190 | buf: UnsafeCell, 191 | marker: PhantomData<*mut F>, 192 | refcount_marker: PhantomData, 193 | } 194 | 195 | #[repr(C)] 196 | union Buffer { 197 | heap: Heap, 198 | inline: [u8; 8], 199 | } 200 | 201 | #[derive(Copy, Clone)] 202 | #[repr(C)] 203 | struct Heap { 204 | len: u32, 205 | aux: u32, 206 | } 207 | 208 | unsafe impl Send for Tendril 209 | where 210 | F: fmt::Format, 211 | A: Atomicity + Sync, 212 | { 213 | } 214 | 215 | /// `Tendril` for storing native Rust strings. 216 | pub type StrTendril = Tendril; 217 | 218 | /// `Tendril` for storing binary data. 
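///
/// Illustrative example (not part of the original docs; uses the `SliceExt`
/// extension trait re-exported by this crate):
///
/// ```ignore
/// use tendril::{ByteTendril, SliceExt};
/// let t: ByteTendril = b"raw bytes".to_tendril();
/// assert_eq!(t.len32(), 9);
/// ```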
219 | pub type ByteTendril = Tendril; 220 | 221 | impl Clone for Tendril 222 | where 223 | F: fmt::Format, 224 | A: Atomicity, 225 | { 226 | #[inline] 227 | fn clone(&self) -> Tendril { 228 | unsafe { 229 | if self.ptr.get().get() > MAX_INLINE_TAG { 230 | self.make_buf_shared(); 231 | self.incref(); 232 | } 233 | 234 | ptr::read(self) 235 | } 236 | } 237 | } 238 | 239 | impl Drop for Tendril 240 | where 241 | F: fmt::Format, 242 | A: Atomicity, 243 | { 244 | #[inline] 245 | fn drop(&mut self) { 246 | unsafe { 247 | let p = self.ptr.get().get(); 248 | if p <= MAX_INLINE_TAG { 249 | return; 250 | } 251 | 252 | let (buf, shared, _) = self.assume_buf(); 253 | if shared { 254 | let header = self.header(); 255 | if (*header).refcount.decrement() == 1 { 256 | A::fence_acquire(); 257 | buf.destroy(); 258 | } 259 | } else { 260 | buf.destroy(); 261 | } 262 | } 263 | } 264 | } 265 | 266 | macro_rules! from_iter_method { 267 | ($ty:ty) => { 268 | #[inline] 269 | fn from_iter(iterable: I) -> Self 270 | where 271 | I: IntoIterator, 272 | { 273 | let mut output = Self::new(); 274 | output.extend(iterable); 275 | output 276 | } 277 | }; 278 | } 279 | 280 | impl Extend for Tendril 281 | where 282 | A: Atomicity, 283 | { 284 | #[inline] 285 | fn extend(&mut self, iterable: I) 286 | where 287 | I: IntoIterator, 288 | { 289 | let iterator = iterable.into_iter(); 290 | self.force_reserve(iterator.size_hint().0 as u32); 291 | for c in iterator { 292 | self.push_char(c); 293 | } 294 | } 295 | } 296 | 297 | impl FromIterator for Tendril 298 | where 299 | A: Atomicity, 300 | { 301 | from_iter_method!(char); 302 | } 303 | 304 | impl Extend for Tendril 305 | where 306 | A: Atomicity, 307 | { 308 | #[inline] 309 | fn extend(&mut self, iterable: I) 310 | where 311 | I: IntoIterator, 312 | { 313 | let iterator = iterable.into_iter(); 314 | self.force_reserve(iterator.size_hint().0 as u32); 315 | for b in iterator { 316 | self.push_slice(&[b]); 317 | } 318 | } 319 | } 320 | 321 | impl FromIterator for Tendril 322 | where 323 | A: Atomicity, 324 | { 325 | from_iter_method!(u8); 326 | } 327 | 328 | impl<'a, A> Extend<&'a u8> for Tendril 329 | where 330 | A: Atomicity, 331 | { 332 | #[inline] 333 | fn extend(&mut self, iterable: I) 334 | where 335 | I: IntoIterator, 336 | { 337 | let iterator = iterable.into_iter(); 338 | self.force_reserve(iterator.size_hint().0 as u32); 339 | for &b in iterator { 340 | self.push_slice(&[b]); 341 | } 342 | } 343 | } 344 | 345 | impl<'a, A> FromIterator<&'a u8> for Tendril 346 | where 347 | A: Atomicity, 348 | { 349 | from_iter_method!(&'a u8); 350 | } 351 | 352 | impl<'a, A> Extend<&'a str> for Tendril 353 | where 354 | A: Atomicity, 355 | { 356 | #[inline] 357 | fn extend(&mut self, iterable: I) 358 | where 359 | I: IntoIterator, 360 | { 361 | for s in iterable { 362 | self.push_slice(s); 363 | } 364 | } 365 | } 366 | 367 | impl<'a, A> FromIterator<&'a str> for Tendril 368 | where 369 | A: Atomicity, 370 | { 371 | from_iter_method!(&'a str); 372 | } 373 | 374 | impl<'a, A> Extend<&'a [u8]> for Tendril 375 | where 376 | A: Atomicity, 377 | { 378 | #[inline] 379 | fn extend(&mut self, iterable: I) 380 | where 381 | I: IntoIterator, 382 | { 383 | for s in iterable { 384 | self.push_slice(s); 385 | } 386 | } 387 | } 388 | 389 | impl<'a, A> FromIterator<&'a [u8]> for Tendril 390 | where 391 | A: Atomicity, 392 | { 393 | from_iter_method!(&'a [u8]); 394 | } 395 | 396 | impl<'a, F, A> Extend<&'a Tendril> for Tendril 397 | where 398 | F: fmt::Format + 'a, 399 | A: Atomicity, 400 | { 401 | 
#[inline] 402 | fn extend(&mut self, iterable: I) 403 | where 404 | I: IntoIterator>, 405 | { 406 | for t in iterable { 407 | self.push_tendril(t); 408 | } 409 | } 410 | } 411 | 412 | impl<'a, F, A> FromIterator<&'a Tendril> for Tendril 413 | where 414 | F: fmt::Format + 'a, 415 | A: Atomicity, 416 | { 417 | from_iter_method!(&'a Tendril); 418 | } 419 | 420 | impl Deref for Tendril 421 | where 422 | F: fmt::SliceFormat, 423 | A: Atomicity, 424 | { 425 | type Target = F::Slice; 426 | 427 | #[inline] 428 | fn deref(&self) -> &F::Slice { 429 | unsafe { F::Slice::from_bytes(self.as_byte_slice()) } 430 | } 431 | } 432 | 433 | impl DerefMut for Tendril 434 | where 435 | F: fmt::SliceFormat, 436 | A: Atomicity, 437 | { 438 | #[inline] 439 | fn deref_mut(&mut self) -> &mut F::Slice { 440 | unsafe { F::Slice::from_mut_bytes(self.as_mut_byte_slice()) } 441 | } 442 | } 443 | 444 | impl Borrow<[u8]> for Tendril 445 | where 446 | F: fmt::SliceFormat, 447 | A: Atomicity, 448 | { 449 | fn borrow(&self) -> &[u8] { 450 | self.as_byte_slice() 451 | } 452 | } 453 | 454 | // Why not impl Borrow for Tendril? str and [u8] hash differently, 455 | // and so a HashMap would silently break if we indexed by str. Ick. 456 | // https://github.com/rust-lang/rust/issues/27108 457 | 458 | impl PartialEq for Tendril 459 | where 460 | F: fmt::Format, 461 | A: Atomicity, 462 | { 463 | #[inline] 464 | fn eq(&self, other: &Self) -> bool { 465 | self.as_byte_slice() == other.as_byte_slice() 466 | } 467 | 468 | #[inline] 469 | fn ne(&self, other: &Self) -> bool { 470 | self.as_byte_slice() != other.as_byte_slice() 471 | } 472 | } 473 | 474 | impl Eq for Tendril 475 | where 476 | F: fmt::Format, 477 | A: Atomicity, 478 | { 479 | } 480 | 481 | impl PartialOrd for Tendril 482 | where 483 | F: fmt::SliceFormat, 484 | ::Slice: PartialOrd, 485 | A: Atomicity, 486 | { 487 | #[inline] 488 | fn partial_cmp(&self, other: &Self) -> Option { 489 | PartialOrd::partial_cmp(&**self, &**other) 490 | } 491 | } 492 | 493 | impl Ord for Tendril 494 | where 495 | F: fmt::SliceFormat, 496 | ::Slice: Ord, 497 | A: Atomicity, 498 | { 499 | #[inline] 500 | fn cmp(&self, other: &Self) -> Ordering { 501 | Ord::cmp(&**self, &**other) 502 | } 503 | } 504 | 505 | impl Default for Tendril 506 | where 507 | F: fmt::Format, 508 | A: Atomicity, 509 | { 510 | #[inline(always)] 511 | fn default() -> Tendril { 512 | Tendril::new() 513 | } 514 | } 515 | 516 | impl strfmt::Debug for Tendril 517 | where 518 | F: fmt::SliceFormat + Default + strfmt::Debug, 519 | ::Slice: strfmt::Debug, 520 | A: Atomicity, 521 | { 522 | #[inline] 523 | fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { 524 | let kind = match self.ptr.get().get() { 525 | p if p <= MAX_INLINE_TAG => "inline", 526 | p if p & 1 == 1 => "shared", 527 | _ => "owned", 528 | }; 529 | 530 | write!(f, "Tendril<{:?}>({}: ", ::default(), kind)?; 531 | <::Slice as strfmt::Debug>::fmt(&**self, f)?; 532 | write!(f, ")") 533 | } 534 | } 535 | 536 | impl hash::Hash for Tendril 537 | where 538 | F: fmt::Format, 539 | A: Atomicity, 540 | { 541 | #[inline] 542 | fn hash(&self, hasher: &mut H) { 543 | self.as_byte_slice().hash(hasher) 544 | } 545 | } 546 | 547 | impl Tendril 548 | where 549 | F: fmt::Format, 550 | A: Atomicity, 551 | { 552 | /// Create a new, empty `Tendril` in any format. 553 | #[inline(always)] 554 | pub fn new() -> Tendril { 555 | unsafe { Tendril::inline(&[]) } 556 | } 557 | 558 | /// Create a new, empty `Tendril` with a specified capacity. 
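    ///
    /// Requests of `MAX_INLINE_LEN` (8) bytes or fewer keep the inline
    /// representation and do not allocate. Illustrative sketch:
    ///
    /// ```ignore
    /// let mut t = StrTendril::with_capacity(64);
    /// t.push_slice("hello");
    /// ```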
559 | #[inline] 560 | pub fn with_capacity(capacity: u32) -> Tendril { 561 | let mut t: Tendril = Tendril::new(); 562 | if capacity > MAX_INLINE_LEN as u32 { 563 | unsafe { 564 | t.make_owned_with_capacity(capacity); 565 | } 566 | } 567 | t 568 | } 569 | 570 | /// Reserve space for additional bytes. 571 | /// 572 | /// This is only a suggestion. There are cases where `Tendril` will 573 | /// decline to allocate until the buffer is actually modified. 574 | #[inline] 575 | pub fn reserve(&mut self, additional: u32) { 576 | if !self.is_shared() { 577 | // Don't grow a shared tendril because we'd have to copy 578 | // right away. 579 | self.force_reserve(additional); 580 | } 581 | } 582 | 583 | /// Reserve space for additional bytes, even for shared buffers. 584 | #[inline] 585 | fn force_reserve(&mut self, additional: u32) { 586 | let new_len = self.len32().checked_add(additional).expect(OFLOW); 587 | if new_len > MAX_INLINE_LEN as u32 { 588 | unsafe { 589 | self.make_owned_with_capacity(new_len); 590 | } 591 | } 592 | } 593 | 594 | /// Get the length of the `Tendril`. 595 | /// 596 | /// This is named not to conflict with `len()` on the underlying 597 | /// slice, if any. 598 | #[inline(always)] 599 | pub fn len32(&self) -> u32 { 600 | match self.ptr.get().get() { 601 | EMPTY_TAG => 0, 602 | n if n <= MAX_INLINE_LEN => n as u32, 603 | _ => unsafe { self.raw_len() }, 604 | } 605 | } 606 | 607 | /// Is the backing buffer shared? 608 | #[inline] 609 | pub fn is_shared(&self) -> bool { 610 | let n = self.ptr.get().get(); 611 | 612 | (n > MAX_INLINE_TAG) && ((n & 1) == 1) 613 | } 614 | 615 | /// Is the backing buffer shared with this other `Tendril`? 616 | #[inline] 617 | pub fn is_shared_with(&self, other: &Tendril) -> bool { 618 | let n = self.ptr.get().get(); 619 | 620 | (n > MAX_INLINE_TAG) && (n == other.ptr.get().get()) 621 | } 622 | 623 | /// Truncate to length 0 without discarding any owned storage. 624 | #[inline] 625 | pub fn clear(&mut self) { 626 | if self.ptr.get().get() <= MAX_INLINE_TAG { 627 | self.ptr 628 | .set(unsafe { NonZeroUsize::new_unchecked(EMPTY_TAG) }); 629 | } else { 630 | let (_, shared, _) = unsafe { self.assume_buf() }; 631 | if shared { 632 | // No need to keep a reference alive for a 0-size slice. 633 | *self = Tendril::new(); 634 | } else { 635 | unsafe { self.set_len(0) }; 636 | } 637 | } 638 | } 639 | 640 | /// Build a `Tendril` by copying a byte slice, if it conforms to the format. 641 | #[inline] 642 | pub fn try_from_byte_slice(x: &[u8]) -> Result, ()> { 643 | match F::validate(x) { 644 | true => Ok(unsafe { Tendril::from_byte_slice_without_validating(x) }), 645 | false => Err(()), 646 | } 647 | } 648 | 649 | /// View as uninterpreted bytes. 650 | #[inline(always)] 651 | pub fn as_bytes(&self) -> &Tendril { 652 | unsafe { mem::transmute(self) } 653 | } 654 | 655 | /// Convert into uninterpreted bytes. 656 | #[inline(always)] 657 | pub fn into_bytes(self) -> Tendril { 658 | unsafe { mem::transmute(self) } 659 | } 660 | 661 | /// Convert `self` into a type which is `Send`. 662 | /// 663 | /// If the tendril is owned or inline, this is free, 664 | /// but if it's shared this will entail a copy of the contents. 665 | #[inline] 666 | pub fn into_send(mut self) -> SendTendril { 667 | self.make_owned(); 668 | SendTendril { 669 | // This changes the header.refcount from A to NonAtomic, but that's 670 | // OK because we have defined the format of A as a usize. 
671 | tendril: unsafe { mem::transmute(self) }, 672 | } 673 | } 674 | 675 | /// View as a superset format, for free. 676 | #[inline(always)] 677 | pub fn as_superset(&self) -> &Tendril 678 | where 679 | F: fmt::SubsetOf, 680 | Super: fmt::Format, 681 | { 682 | unsafe { mem::transmute(self) } 683 | } 684 | 685 | /// Convert into a superset format, for free. 686 | #[inline(always)] 687 | pub fn into_superset(self) -> Tendril 688 | where 689 | F: fmt::SubsetOf, 690 | Super: fmt::Format, 691 | { 692 | unsafe { mem::transmute(self) } 693 | } 694 | 695 | /// View as a subset format, if the `Tendril` conforms to that subset. 696 | #[inline] 697 | pub fn try_as_subset(&self) -> Result<&Tendril, ()> 698 | where 699 | Sub: fmt::SubsetOf, 700 | { 701 | match Sub::revalidate_subset(self.as_byte_slice()) { 702 | true => Ok(unsafe { mem::transmute(self) }), 703 | false => Err(()), 704 | } 705 | } 706 | 707 | /// Convert into a subset format, if the `Tendril` conforms to that subset. 708 | #[inline] 709 | pub fn try_into_subset(self) -> Result, Self> 710 | where 711 | Sub: fmt::SubsetOf, 712 | { 713 | match Sub::revalidate_subset(self.as_byte_slice()) { 714 | true => Ok(unsafe { mem::transmute(self) }), 715 | false => Err(self), 716 | } 717 | } 718 | 719 | /// View as another format, if the bytes of the `Tendril` are valid for 720 | /// that format. 721 | #[inline] 722 | pub fn try_reinterpret_view(&self) -> Result<&Tendril, ()> 723 | where 724 | Other: fmt::Format, 725 | { 726 | match Other::validate(self.as_byte_slice()) { 727 | true => Ok(unsafe { mem::transmute(self) }), 728 | false => Err(()), 729 | } 730 | } 731 | 732 | /// Convert into another format, if the `Tendril` conforms to that format. 733 | /// 734 | /// This only re-validates the existing bytes under the new format. It 735 | /// will *not* change the byte content of the tendril! 736 | /// 737 | /// See the `encode` and `decode` methods for character encoding conversion. 738 | #[inline] 739 | pub fn try_reinterpret(self) -> Result, Self> 740 | where 741 | Other: fmt::Format, 742 | { 743 | match Other::validate(self.as_byte_slice()) { 744 | true => Ok(unsafe { mem::transmute(self) }), 745 | false => Err(self), 746 | } 747 | } 748 | 749 | /// Push some bytes onto the end of the `Tendril`, if they conform to the 750 | /// format. 751 | #[inline] 752 | pub fn try_push_bytes(&mut self, buf: &[u8]) -> Result<(), ()> { 753 | match F::validate(buf) { 754 | true => unsafe { 755 | self.push_bytes_without_validating(buf); 756 | Ok(()) 757 | }, 758 | false => Err(()), 759 | } 760 | } 761 | 762 | /// Push another `Tendril` onto the end of this one. 763 | #[inline] 764 | pub fn push_tendril(&mut self, other: &Tendril) { 765 | let new_len = self.len32().checked_add(other.len32()).expect(OFLOW); 766 | 767 | unsafe { 768 | if (self.ptr.get().get() > MAX_INLINE_TAG) && (other.ptr.get().get() > MAX_INLINE_TAG) { 769 | let (self_buf, self_shared, _) = self.assume_buf(); 770 | let (other_buf, other_shared, _) = other.assume_buf(); 771 | 772 | if self_shared 773 | && other_shared 774 | && (self_buf.data_ptr() == other_buf.data_ptr()) 775 | && other.aux() == self.aux() + self.raw_len() 776 | { 777 | self.set_len(new_len); 778 | return; 779 | } 780 | } 781 | 782 | self.push_bytes_without_validating(other.as_byte_slice()) 783 | } 784 | } 785 | 786 | /// Attempt to slice this `Tendril` as a new `Tendril`. 787 | /// 788 | /// This will share the buffer when possible. Mutating a shared buffer 789 | /// will copy the contents. 
790 | /// 791 | /// The offset and length are in bytes. The function will return 792 | /// `Err` if these are out of bounds, or if the resulting slice 793 | /// does not conform to the format. 794 | #[inline] 795 | pub fn try_subtendril( 796 | &self, 797 | offset: u32, 798 | length: u32, 799 | ) -> Result, SubtendrilError> { 800 | let self_len = self.len32(); 801 | if offset > self_len || length > (self_len - offset) { 802 | return Err(SubtendrilError::OutOfBounds); 803 | } 804 | 805 | unsafe { 806 | let byte_slice = unsafe_slice(self.as_byte_slice(), offset as usize, length as usize); 807 | if !F::validate_subseq(byte_slice) { 808 | return Err(SubtendrilError::ValidationFailed); 809 | } 810 | 811 | Ok(self.unsafe_subtendril(offset, length)) 812 | } 813 | } 814 | 815 | /// Slice this `Tendril` as a new `Tendril`. 816 | /// 817 | /// Panics on bounds or validity check failure. 818 | #[inline] 819 | pub fn subtendril(&self, offset: u32, length: u32) -> Tendril { 820 | self.try_subtendril(offset, length).unwrap() 821 | } 822 | 823 | /// Try to drop `n` bytes from the front. 824 | /// 825 | /// Returns `Err` if the bytes are not available, or the suffix fails 826 | /// validation. 827 | #[inline] 828 | pub fn try_pop_front(&mut self, n: u32) -> Result<(), SubtendrilError> { 829 | if n == 0 { 830 | return Ok(()); 831 | } 832 | let old_len = self.len32(); 833 | if n > old_len { 834 | return Err(SubtendrilError::OutOfBounds); 835 | } 836 | let new_len = old_len - n; 837 | 838 | unsafe { 839 | if !F::validate_suffix(unsafe_slice( 840 | self.as_byte_slice(), 841 | n as usize, 842 | new_len as usize, 843 | )) { 844 | return Err(SubtendrilError::ValidationFailed); 845 | } 846 | 847 | self.unsafe_pop_front(n); 848 | Ok(()) 849 | } 850 | } 851 | 852 | /// Drop `n` bytes from the front. 853 | /// 854 | /// Panics if the bytes are not available, or the suffix fails 855 | /// validation. 856 | #[inline] 857 | pub fn pop_front(&mut self, n: u32) { 858 | self.try_pop_front(n).unwrap() 859 | } 860 | 861 | /// Drop `n` bytes from the back. 862 | /// 863 | /// Returns `Err` if the bytes are not available, or the prefix fails 864 | /// validation. 865 | #[inline] 866 | pub fn try_pop_back(&mut self, n: u32) -> Result<(), SubtendrilError> { 867 | if n == 0 { 868 | return Ok(()); 869 | } 870 | let old_len = self.len32(); 871 | if n > old_len { 872 | return Err(SubtendrilError::OutOfBounds); 873 | } 874 | let new_len = old_len - n; 875 | 876 | unsafe { 877 | if !F::validate_prefix(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)) { 878 | return Err(SubtendrilError::ValidationFailed); 879 | } 880 | 881 | self.unsafe_pop_back(n); 882 | Ok(()) 883 | } 884 | } 885 | 886 | /// Drop `n` bytes from the back. 887 | /// 888 | /// Panics if the bytes are not available, or the prefix fails 889 | /// validation. 890 | #[inline] 891 | pub fn pop_back(&mut self, n: u32) { 892 | self.try_pop_back(n).unwrap() 893 | } 894 | 895 | /// View as another format, without validating. 896 | #[inline(always)] 897 | pub unsafe fn reinterpret_view_without_validating(&self) -> &Tendril 898 | where 899 | Other: fmt::Format, 900 | { 901 | mem::transmute(self) 902 | } 903 | 904 | /// Convert into another format, without validating. 905 | #[inline(always)] 906 | pub unsafe fn reinterpret_without_validating(self) -> Tendril 907 | where 908 | Other: fmt::Format, 909 | { 910 | mem::transmute(self) 911 | } 912 | 913 | /// Build a `Tendril` by copying a byte slice, without validating. 
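    ///
    /// The caller must guarantee that `x` is valid for the format `F`;
    /// only the maximum-length assertion is still performed.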
914 | #[inline] 915 | pub unsafe fn from_byte_slice_without_validating(x: &[u8]) -> Tendril<F, A> { 916 | assert!(x.len() <= buf32::MAX_LEN); 917 | if x.len() <= MAX_INLINE_LEN { 918 | Tendril::inline(x) 919 | } else { 920 | Tendril::owned_copy(x) 921 | } 922 | } 923 | 924 | /// Push some bytes onto the end of the `Tendril`, without validating. 925 | #[inline] 926 | pub unsafe fn push_bytes_without_validating(&mut self, buf: &[u8]) { 927 | assert!(buf.len() <= buf32::MAX_LEN); 928 | 929 | let Fixup { 930 | drop_left, 931 | drop_right, 932 | insert_len, 933 | insert_bytes, 934 | } = F::fixup(self.as_byte_slice(), buf); 935 | 936 | // FIXME: think more about overflow 937 | let adj_len = self.len32() + insert_len - drop_left; 938 | 939 | let new_len = adj_len.checked_add(buf.len() as u32).expect(OFLOW) - drop_right; 940 | 941 | let drop_left = drop_left as usize; 942 | let drop_right = drop_right as usize; 943 | 944 | if new_len <= MAX_INLINE_LEN as u32 { 945 | let mut tmp = [0_u8; MAX_INLINE_LEN]; 946 | { 947 | let old = self.as_byte_slice(); 948 | let mut dest = tmp.as_mut_ptr(); 949 | copy_and_advance(&mut dest, unsafe_slice(old, 0, old.len() - drop_left)); 950 | copy_and_advance( 951 | &mut dest, 952 | unsafe_slice(&insert_bytes, 0, insert_len as usize), 953 | ); 954 | copy_and_advance( 955 | &mut dest, 956 | unsafe_slice(buf, drop_right, buf.len() - drop_right), 957 | ); 958 | } 959 | *self = Tendril::inline(&tmp[..new_len as usize]); 960 | } else { 961 | self.make_owned_with_capacity(new_len); 962 | let (owned, _, _) = self.assume_buf(); 963 | let mut dest = owned 964 | .data_ptr() 965 | .offset((owned.len as usize - drop_left) as isize); 966 | copy_and_advance( 967 | &mut dest, 968 | unsafe_slice(&insert_bytes, 0, insert_len as usize), 969 | ); 970 | copy_and_advance( 971 | &mut dest, 972 | unsafe_slice(buf, drop_right, buf.len() - drop_right), 973 | ); 974 | self.set_len(new_len); 975 | } 976 | } 977 | 978 | /// Slice this `Tendril` as a new `Tendril`. 979 | /// 980 | /// Does not check validity or bounds! 981 | #[inline] 982 | pub unsafe fn unsafe_subtendril(&self, offset: u32, length: u32) -> Tendril<F, A> { 983 | if length <= MAX_INLINE_LEN as u32 { 984 | Tendril::inline(unsafe_slice( 985 | self.as_byte_slice(), 986 | offset as usize, 987 | length as usize, 988 | )) 989 | } else { 990 | self.make_buf_shared(); 991 | self.incref(); 992 | let (buf, _, _) = self.assume_buf(); 993 | Tendril::shared(buf, self.aux() + offset, length) 994 | } 995 | } 996 | 997 | /// Drop `n` bytes from the front. 998 | /// 999 | /// Does not check validity or bounds! 1000 | #[inline] 1001 | pub unsafe fn unsafe_pop_front(&mut self, n: u32) { 1002 | let new_len = self.len32() - n; 1003 | if new_len <= MAX_INLINE_LEN as u32 { 1004 | *self = Tendril::inline(unsafe_slice( 1005 | self.as_byte_slice(), 1006 | n as usize, 1007 | new_len as usize, 1008 | )); 1009 | } else { 1010 | self.make_buf_shared(); 1011 | self.set_aux(self.aux() + n); 1012 | let len = self.raw_len(); 1013 | self.set_len(len - n); 1014 | } 1015 | } 1016 | 1017 | /// Drop `n` bytes from the back. 1018 | /// 1019 | /// Does not check validity or bounds!
1020 | #[inline] 1021 | pub unsafe fn unsafe_pop_back(&mut self, n: u32) { 1022 | let new_len = self.len32() - n; 1023 | if new_len <= MAX_INLINE_LEN as u32 { 1024 | *self = Tendril::inline(unsafe_slice(self.as_byte_slice(), 0, new_len as usize)); 1025 | } else { 1026 | self.make_buf_shared(); 1027 | let len = self.raw_len(); 1028 | self.set_len(len - n); 1029 | } 1030 | } 1031 | 1032 | #[inline] 1033 | unsafe fn incref(&self) { 1034 | (*self.header()).refcount.increment(); 1035 | } 1036 | 1037 | #[inline] 1038 | unsafe fn make_buf_shared(&self) { 1039 | let p = self.ptr.get().get(); 1040 | if p & 1 == 0 { 1041 | let header = p as *mut Header<A>; 1042 | (*header).cap = self.aux(); 1043 | 1044 | self.ptr.set(NonZeroUsize::new_unchecked(p | 1)); 1045 | self.set_aux(0); 1046 | } 1047 | } 1048 | 1049 | // This is not public as it is of no practical value to users. 1050 | // By and large they shouldn't need to worry about the distinction at all, 1051 | // and going out of your way to make it owned is pointless. 1052 | #[inline] 1053 | fn make_owned(&mut self) { 1054 | unsafe { 1055 | let ptr = self.ptr.get().get(); 1056 | if ptr <= MAX_INLINE_TAG || (ptr & 1) == 1 { 1057 | *self = Tendril::owned_copy(self.as_byte_slice()); 1058 | } 1059 | } 1060 | } 1061 | 1062 | #[inline] 1063 | unsafe fn make_owned_with_capacity(&mut self, cap: u32) { 1064 | self.make_owned(); 1065 | let mut buf = self.assume_buf().0; 1066 | buf.grow(cap); 1067 | self.ptr.set(NonZeroUsize::new_unchecked(buf.ptr as usize)); 1068 | self.set_aux(buf.cap); 1069 | } 1070 | 1071 | #[inline(always)] 1072 | unsafe fn header(&self) -> *mut Header<A> { 1073 | (self.ptr.get().get() & !1) as *mut Header<A> 1074 | } 1075 | 1076 | #[inline] 1077 | unsafe fn assume_buf(&self) -> (Buf32<Header<A>>, bool, u32) { 1078 | let ptr = self.ptr.get().get(); 1079 | let header = self.header(); 1080 | let shared = (ptr & 1) == 1; 1081 | let (cap, offset) = match shared { 1082 | true => ((*header).cap, self.aux()), 1083 | false => (self.aux(), 0), 1084 | }; 1085 | 1086 | ( 1087 | Buf32 { 1088 | ptr: header, 1089 | len: offset + self.len32(), 1090 | cap: cap, 1091 | }, 1092 | shared, 1093 | offset, 1094 | ) 1095 | } 1096 | 1097 | #[inline] 1098 | unsafe fn inline(x: &[u8]) -> Tendril<F, A> { 1099 | let len = x.len(); 1100 | let t = Tendril { 1101 | ptr: Cell::new(inline_tag(len as u32)), 1102 | buf: UnsafeCell::new(Buffer { inline: [0; 8] }), 1103 | marker: PhantomData, 1104 | refcount_marker: PhantomData, 1105 | }; 1106 | ptr::copy_nonoverlapping(x.as_ptr(), (*t.buf.get()).inline.as_mut_ptr(), len); 1107 | t 1108 | } 1109 | 1110 | #[inline] 1111 | unsafe fn owned(x: Buf32<Header<A>>) -> Tendril<F, A> { 1112 | Tendril { 1113 | ptr: Cell::new(NonZeroUsize::new_unchecked(x.ptr as usize)), 1114 | buf: UnsafeCell::new(Buffer { 1115 | heap: Heap { 1116 | len: x.len, 1117 | aux: x.cap, 1118 | }, 1119 | }), 1120 | marker: PhantomData, 1121 | refcount_marker: PhantomData, 1122 | } 1123 | } 1124 | 1125 | #[inline] 1126 | unsafe fn owned_copy(x: &[u8]) -> Tendril<F, A> { 1127 | let len32 = x.len() as u32; 1128 | let mut b = Buf32::with_capacity(len32, Header::new()); 1129 | ptr::copy_nonoverlapping(x.as_ptr(), b.data_ptr(), x.len()); 1130 | b.len = len32; 1131 | Tendril::owned(b) 1132 | } 1133 | 1134 | #[inline] 1135 | unsafe fn shared(buf: Buf32<Header<A>>, off: u32, len: u32) -> Tendril<F, A> { 1136 | Tendril { 1137 | ptr: Cell::new(NonZeroUsize::new_unchecked((buf.ptr as usize) | 1)), 1138 | buf: UnsafeCell::new(Buffer { 1139 | heap: Heap { len, aux: off }, 1140 | }), 1141 | marker: PhantomData, 1142 | refcount_marker:
PhantomData, 1143 | } 1144 | } 1145 | 1146 | #[inline] 1147 | fn as_byte_slice<'a>(&'a self) -> &'a [u8] { 1148 | unsafe { 1149 | match self.ptr.get().get() { 1150 | EMPTY_TAG => &[], 1151 | n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked(..n), 1152 | _ => { 1153 | let (buf, _, offset) = self.assume_buf(); 1154 | copy_lifetime( 1155 | self, 1156 | unsafe_slice(buf.data(), offset as usize, self.len32() as usize), 1157 | ) 1158 | } 1159 | } 1160 | } 1161 | } 1162 | 1163 | // There's no need to worry about locking on an atomic Tendril, because it makes it unique as 1164 | // soon as you do that. 1165 | #[inline] 1166 | fn as_mut_byte_slice<'a>(&'a mut self) -> &'a mut [u8] { 1167 | unsafe { 1168 | match self.ptr.get().get() { 1169 | EMPTY_TAG => &mut [], 1170 | n if n <= MAX_INLINE_LEN => (*self.buf.get()).inline.get_unchecked_mut(..n), 1171 | _ => { 1172 | self.make_owned(); 1173 | let (mut buf, _, offset) = self.assume_buf(); 1174 | let len = self.len32() as usize; 1175 | copy_lifetime_mut(self, unsafe_slice_mut(buf.data_mut(), offset as usize, len)) 1176 | } 1177 | } 1178 | } 1179 | } 1180 | 1181 | unsafe fn raw_len(&self) -> u32 { 1182 | (*self.buf.get()).heap.len 1183 | } 1184 | 1185 | unsafe fn set_len(&mut self, len: u32) { 1186 | (*self.buf.get()).heap.len = len; 1187 | } 1188 | 1189 | unsafe fn aux(&self) -> u32 { 1190 | (*self.buf.get()).heap.aux 1191 | } 1192 | 1193 | unsafe fn set_aux(&self, aux: u32) { 1194 | (*self.buf.get()).heap.aux = aux; 1195 | } 1196 | } 1197 | 1198 | impl<F, A> Tendril<F, A> 1199 | where 1200 | F: fmt::SliceFormat, 1201 | A: Atomicity, 1202 | { 1203 | /// Build a `Tendril` by copying a slice. 1204 | #[inline] 1205 | pub fn from_slice(x: &F::Slice) -> Tendril<F, A> { 1206 | unsafe { Tendril::from_byte_slice_without_validating(x.as_bytes()) } 1207 | } 1208 | 1209 | /// Push a slice onto the end of the `Tendril`. 1210 | #[inline] 1211 | pub fn push_slice(&mut self, x: &F::Slice) { 1212 | unsafe { self.push_bytes_without_validating(x.as_bytes()) } 1213 | } 1214 | } 1215 | 1216 | /// A simple wrapper to make `Tendril` `Send`. 1217 | /// 1218 | /// Although there is a certain subset of the operations on a `Tendril` that a `SendTendril` could 1219 | /// reasonably implement, in order to clearly separate concerns this type is deliberately 1220 | /// minimalist, acting as a safe encapsulation around the invariants which permit `Send`ness and 1221 | /// behaving as an opaque object. 1222 | /// 1223 | /// A `SendTendril` may be produced by `Tendril.into_send()` or `SendTendril::from(tendril)`, 1224 | /// and may be returned to a `Tendril` by `Tendril::from(self)`. 1225 | #[derive(Clone)] 1226 | pub struct SendTendril<F> 1227 | where 1228 | F: fmt::Format, 1229 | { 1230 | tendril: Tendril<F>, 1231 | } 1232 | 1233 | unsafe impl<F> Send for SendTendril<F> where F: fmt::Format {} 1234 | 1235 | impl<F, A> From<Tendril<F, A>> for SendTendril<F> 1236 | where 1237 | F: fmt::Format, 1238 | A: Atomicity, 1239 | { 1240 | #[inline] 1241 | fn from(tendril: Tendril<F, A>) -> SendTendril<F> { 1242 | tendril.into_send() 1243 | } 1244 | } 1245 | 1246 | impl<F, A> From<SendTendril<F>> for Tendril<F, A> 1247 | where 1248 | F: fmt::Format, 1249 | A: Atomicity, 1250 | { 1251 | #[inline] 1252 | fn from(send: SendTendril<F>) -> Tendril<F, A> { 1253 | unsafe { mem::transmute(send.tendril) } 1254 | // header.refcount may have been initialised as an Atomic or a NonAtomic, but the value 1255 | // will be the same (1) regardless, because the layout is defined. 1256 | // Thus we don't need to fiddle about resetting it or anything like that.
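// Editorial sketch (not part of the original source) of the round trip this impl
// enables, using the public API shown above:
//     let send: SendTendril<fmt::UTF8> = "abc".to_tendril().into_send();
//     let back: StrTendril = send.into(); // the SendTendril may cross threads in between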
1257 | } 1258 | } 1259 | 1260 | /// `Tendril`-related methods for Rust slices. 1261 | pub trait SliceExt<F>: fmt::Slice 1262 | where 1263 | F: fmt::SliceFormat<Slice = Self>, 1264 | { 1265 | /// Make a `Tendril` from this slice. 1266 | #[inline] 1267 | fn to_tendril(&self) -> Tendril<F> { 1268 | // It should be done thusly, but at the time of writing the defaults don't help inference: 1269 | //fn to_tendril<A>(&self) -> Tendril<F, A> 1270 | // where A: Atomicity, 1271 | //{ 1272 | Tendril::from_slice(self) 1273 | } 1274 | } 1275 | 1276 | impl SliceExt<fmt::UTF8> for str {} 1277 | impl SliceExt<fmt::Bytes> for [u8] {} 1278 | 1279 | impl<F, A> Tendril<F, A> 1280 | where 1281 | F: for<'a> fmt::CharFormat<'a>, 1282 | A: Atomicity, 1283 | { 1284 | /// Remove and return the first character, if any. 1285 | #[inline] 1286 | pub fn pop_front_char<'a>(&'a mut self) -> Option<char> { 1287 | unsafe { 1288 | let next_char; // first char in iterator 1289 | let mut skip = 0; // number of bytes to skip, or 0 to clear 1290 | 1291 | { 1292 | // <--+ 1293 | // | Creating an iterator borrows self, so introduce a 1294 | // +- scope to contain the borrow (that way we can mutate 1295 | // self below, after this scope exits). 1296 | 1297 | let mut iter = F::char_indices(self.as_byte_slice()); 1298 | match iter.next() { 1299 | Some((_, c)) => { 1300 | next_char = Some(c); 1301 | if let Some((n, _)) = iter.next() { 1302 | skip = n as u32; 1303 | } 1304 | } 1305 | None => { 1306 | next_char = None; 1307 | } 1308 | } 1309 | } 1310 | 1311 | if skip != 0 { 1312 | self.unsafe_pop_front(skip); 1313 | } else { 1314 | self.clear(); 1315 | } 1316 | 1317 | next_char 1318 | } 1319 | } 1320 | 1321 | /// Remove and return a run of characters at the front of the `Tendril` 1322 | /// which are classified the same according to the function `classify`. 1323 | /// 1324 | /// Returns `None` on an empty string. 1325 | #[inline] 1326 | pub fn pop_front_char_run<'a, C, R>(&'a mut self, mut classify: C) -> Option<(Tendril<F, A>, R)> 1327 | where 1328 | C: FnMut(char) -> R, 1329 | R: PartialEq, 1330 | { 1331 | let (class, first_mismatch); 1332 | { 1333 | let mut chars = unsafe { F::char_indices(self.as_byte_slice()) }; 1334 | let (_, first) = unwrap_or_return!(chars.next(), None); 1335 | class = classify(first); 1336 | first_mismatch = chars.find(|&(_, ch)| &classify(ch) != &class); 1337 | } 1338 | 1339 | match first_mismatch { 1340 | Some((idx, _)) => unsafe { 1341 | let t = self.unsafe_subtendril(0, idx as u32); 1342 | self.unsafe_pop_front(idx as u32); 1343 | Some((t, class)) 1344 | }, 1345 | None => { 1346 | let t = self.clone(); 1347 | self.clear(); 1348 | Some((t, class)) 1349 | } 1350 | } 1351 | } 1352 | 1353 | /// Push a character, if it can be represented in this format. 1354 | #[inline] 1355 | pub fn try_push_char(&mut self, c: char) -> Result<(), ()> { 1356 | F::encode_char(c, |b| unsafe { 1357 | self.push_bytes_without_validating(b); 1358 | }) 1359 | } 1360 | } 1361 | 1362 | /// Extension trait for `io::Read`. 1363 | pub trait ReadExt: io::Read { 1364 | fn read_to_tendril<A>(&mut self, buf: &mut Tendril<fmt::Bytes, A>) -> io::Result<usize> 1365 | where 1366 | A: Atomicity; 1367 | } 1368 | 1369 | impl<T> ReadExt for T 1370 | where 1371 | T: io::Read, 1372 | { 1373 | /// Read all bytes until EOF. 1374 | fn read_to_tendril<A>(&mut self, buf: &mut Tendril<fmt::Bytes, A>) -> io::Result<usize> 1375 | where 1376 | A: Atomicity, 1377 | { 1378 | // Adapted from libstd/io/mod.rs.
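// Editorial note on the loop below (added commentary, not original source): the
// reservation size starts at 16 and doubles up to DEFAULT_BUF_SIZE; whenever the
// bytes read so far (`len`) catch up with `buf.len()`, another uninitialized chunk
// is pushed and `read` fills the tail `&mut buf[len..]`. On EOF (Ok(0)) or a
// non-Interrupted error, the unused reserved bytes are trimmed off with `pop_back`.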
1379 | const DEFAULT_BUF_SIZE: u32 = 64 * 1024; 1380 | 1381 | let start_len = buf.len(); 1382 | let mut len = start_len; 1383 | let mut new_write_size = 16; 1384 | let ret; 1385 | loop { 1386 | if len == buf.len() { 1387 | if new_write_size < DEFAULT_BUF_SIZE { 1388 | new_write_size *= 2; 1389 | } 1390 | // FIXME: this exposes uninitialized bytes to a generic R type 1391 | // this is fine for R=File which never reads these bytes, 1392 | // but user-defined types might. 1393 | // The standard library pushes zeros to `Vec` for that reason. 1394 | unsafe { 1395 | buf.push_uninitialized(new_write_size); 1396 | } 1397 | } 1398 | 1399 | match self.read(&mut buf[len..]) { 1400 | Ok(0) => { 1401 | ret = Ok(len - start_len); 1402 | break; 1403 | } 1404 | Ok(n) => len += n, 1405 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} 1406 | Err(e) => { 1407 | ret = Err(e); 1408 | break; 1409 | } 1410 | } 1411 | } 1412 | 1413 | let buf_len = buf.len32(); 1414 | buf.pop_back(buf_len - (len as u32)); 1415 | ret 1416 | } 1417 | } 1418 | 1419 | impl<A> io::Write for Tendril<fmt::Bytes, A> 1420 | where 1421 | A: Atomicity, 1422 | { 1423 | #[inline] 1424 | fn write(&mut self, buf: &[u8]) -> io::Result<usize> { 1425 | self.push_slice(buf); 1426 | Ok(buf.len()) 1427 | } 1428 | 1429 | #[inline] 1430 | fn write_all(&mut self, buf: &[u8]) -> io::Result<()> { 1431 | self.push_slice(buf); 1432 | Ok(()) 1433 | } 1434 | 1435 | #[inline(always)] 1436 | fn flush(&mut self) -> io::Result<()> { 1437 | Ok(()) 1438 | } 1439 | } 1440 | 1441 | #[cfg(feature = "encoding")] 1442 | impl<A> encoding::ByteWriter for Tendril<fmt::Bytes, A> 1443 | where 1444 | A: Atomicity, 1445 | { 1446 | #[inline] 1447 | fn write_byte(&mut self, b: u8) { 1448 | self.push_slice(&[b]); 1449 | } 1450 | 1451 | #[inline] 1452 | fn write_bytes(&mut self, v: &[u8]) { 1453 | self.push_slice(v); 1454 | } 1455 | 1456 | #[inline] 1457 | fn writer_hint(&mut self, additional: usize) { 1458 | self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); 1459 | } 1460 | } 1461 | 1462 | impl<F, A> Tendril<F, A> 1463 | where 1464 | A: Atomicity, 1465 | F: fmt::SliceFormat<Slice = [u8]>, 1466 | { 1467 | /// Decode from some character encoding into UTF-8. 1468 | /// 1469 | /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) 1470 | /// for more information. 1471 | #[inline] 1472 | #[cfg(feature = "encoding")] 1473 | pub fn decode( 1474 | &self, 1475 | encoding: EncodingRef, 1476 | trap: DecoderTrap, 1477 | ) -> Result<Tendril<fmt::UTF8, A>, ::std::borrow::Cow<'static, str>> { 1478 | let mut ret = Tendril::new(); 1479 | encoding.decode_to(&*self, trap, &mut ret).map(|_| ret) 1480 | } 1481 | 1482 | /// Push "uninitialized bytes" onto the end. 1483 | /// 1484 | /// Really, this grows the tendril without writing anything to the new area. 1485 | /// It's only defined for byte tendrils because it's only useful if you 1486 | /// plan to then mutate the buffer.
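/// (Editorial example, not from the original docs: a minimal sketch that grows
/// the tendril and then immediately overwrites the new bytes through `DerefMut`.)
///
/// ```
/// use tendril::ByteTendril;
///
/// let mut t = ByteTendril::new();
/// unsafe { t.push_uninitialized(4) };
/// t.copy_from_slice(b"abcd");
/// assert_eq!(&*t, b"abcd");
/// ```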
1487 | #[inline] 1488 | pub unsafe fn push_uninitialized(&mut self, n: u32) { 1489 | let new_len = self.len32().checked_add(n).expect(OFLOW); 1490 | if new_len <= MAX_INLINE_LEN as u32 && self.ptr.get().get() <= MAX_INLINE_TAG { 1491 | self.ptr.set(inline_tag(new_len)) 1492 | } else { 1493 | self.make_owned_with_capacity(new_len); 1494 | self.set_len(new_len); 1495 | } 1496 | } 1497 | } 1498 | 1499 | impl<A> strfmt::Display for Tendril<fmt::UTF8, A> 1500 | where 1501 | A: Atomicity, 1502 | { 1503 | #[inline] 1504 | fn fmt(&self, f: &mut strfmt::Formatter) -> strfmt::Result { 1505 | <str as strfmt::Display>::fmt(&**self, f) 1506 | } 1507 | } 1508 | 1509 | impl<A> str::FromStr for Tendril<fmt::UTF8, A> 1510 | where 1511 | A: Atomicity, 1512 | { 1513 | type Err = (); 1514 | 1515 | #[inline] 1516 | fn from_str(s: &str) -> Result<Self, ()> { 1517 | Ok(Tendril::from_slice(s)) 1518 | } 1519 | } 1520 | 1521 | impl<A> strfmt::Write for Tendril<fmt::UTF8, A> 1522 | where 1523 | A: Atomicity, 1524 | { 1525 | #[inline] 1526 | fn write_str(&mut self, s: &str) -> strfmt::Result { 1527 | self.push_slice(s); 1528 | Ok(()) 1529 | } 1530 | } 1531 | 1532 | #[cfg(feature = "encoding")] 1533 | impl<A> encoding::StringWriter for Tendril<fmt::UTF8, A> 1534 | where 1535 | A: Atomicity, 1536 | { 1537 | #[inline] 1538 | fn write_char(&mut self, c: char) { 1539 | self.push_char(c); 1540 | } 1541 | 1542 | #[inline] 1543 | fn write_str(&mut self, s: &str) { 1544 | self.push_slice(s); 1545 | } 1546 | 1547 | #[inline] 1548 | fn writer_hint(&mut self, additional: usize) { 1549 | self.reserve(::std::cmp::min(u32::MAX as usize, additional) as u32); 1550 | } 1551 | } 1552 | 1553 | impl<A> Tendril<fmt::UTF8, A> 1554 | where 1555 | A: Atomicity, 1556 | { 1557 | /// Encode from UTF-8 into some other character encoding. 1558 | /// 1559 | /// See the [rust-encoding docs](https://lifthrasiir.github.io/rust-encoding/encoding/) 1560 | /// for more information. 1561 | #[inline] 1562 | #[cfg(feature = "encoding")] 1563 | pub fn encode( 1564 | &self, 1565 | encoding: EncodingRef, 1566 | trap: EncoderTrap, 1567 | ) -> Result<Tendril<fmt::Bytes, A>, ::std::borrow::Cow<'static, str>> { 1568 | let mut ret = Tendril::new(); 1569 | encoding.encode_to(&*self, trap, &mut ret).map(|_| ret) 1570 | } 1571 | 1572 | /// Push a character onto the end. 1573 | #[inline] 1574 | pub fn push_char(&mut self, c: char) { 1575 | unsafe { 1576 | self.push_bytes_without_validating(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); 1577 | } 1578 | } 1579 | 1580 | /// Create a `Tendril` from a single character. 1581 | #[inline] 1582 | pub fn from_char(c: char) -> Tendril<fmt::UTF8, A> { 1583 | let mut t: Tendril<fmt::UTF8, A> = Tendril::new(); 1584 | t.push_char(c); 1585 | t 1586 | } 1587 | 1588 | /// Helper for the `format_tendril!` macro. 1589 | #[inline] 1590 | pub fn format(args: strfmt::Arguments) -> Tendril<fmt::UTF8, A> { 1591 | use std::fmt::Write; 1592 | let mut output: Tendril<fmt::UTF8, A> = Tendril::new(); 1593 | let _ = write!(&mut output, "{}", args); 1594 | output 1595 | } 1596 | } 1597 | 1598 | /// Create a `StrTendril` through string formatting. 1599 | /// 1600 | /// Works just like the standard `format!` macro. 1601 | #[macro_export] 1602 | macro_rules!
format_tendril { 1603 | ($($arg:tt)*) => ($crate::StrTendril::format(format_args!($($arg)*))) 1604 | } 1605 | 1606 | impl<'a, F, A> From<&'a F::Slice> for Tendril 1607 | where 1608 | F: fmt::SliceFormat, 1609 | A: Atomicity, 1610 | { 1611 | #[inline] 1612 | fn from(input: &F::Slice) -> Tendril { 1613 | Tendril::from_slice(input) 1614 | } 1615 | } 1616 | 1617 | impl From for Tendril 1618 | where 1619 | A: Atomicity, 1620 | { 1621 | #[inline] 1622 | fn from(input: String) -> Tendril { 1623 | Tendril::from_slice(&*input) 1624 | } 1625 | } 1626 | 1627 | impl AsRef for Tendril 1628 | where 1629 | F: fmt::SliceFormat, 1630 | A: Atomicity, 1631 | { 1632 | #[inline] 1633 | fn as_ref(&self) -> &F::Slice { 1634 | &**self 1635 | } 1636 | } 1637 | 1638 | impl From> for String 1639 | where 1640 | A: Atomicity, 1641 | { 1642 | #[inline] 1643 | fn from(input: Tendril) -> String { 1644 | String::from(&*input) 1645 | } 1646 | } 1647 | 1648 | impl<'a, A> From<&'a Tendril> for String 1649 | where 1650 | A: Atomicity, 1651 | { 1652 | #[inline] 1653 | fn from(input: &'a Tendril) -> String { 1654 | String::from(&**input) 1655 | } 1656 | } 1657 | 1658 | #[cfg(all(test, feature = "bench"))] 1659 | #[path = "bench.rs"] 1660 | mod bench; 1661 | 1662 | #[cfg(test)] 1663 | mod test { 1664 | use super::{ 1665 | Atomic, ByteTendril, Header, NonAtomic, ReadExt, SendTendril, SliceExt, StrTendril, Tendril, 1666 | }; 1667 | use fmt; 1668 | use std::iter; 1669 | use std::thread; 1670 | 1671 | fn assert_send() {} 1672 | 1673 | #[test] 1674 | fn smoke_test() { 1675 | assert_eq!("", &*"".to_tendril()); 1676 | assert_eq!("abc", &*"abc".to_tendril()); 1677 | assert_eq!("Hello, world!", &*"Hello, world!".to_tendril()); 1678 | 1679 | assert_eq!(b"", &*b"".to_tendril()); 1680 | assert_eq!(b"abc", &*b"abc".to_tendril()); 1681 | assert_eq!(b"Hello, world!", &*b"Hello, world!".to_tendril()); 1682 | } 1683 | 1684 | #[test] 1685 | fn assert_sizes() { 1686 | use std::mem; 1687 | struct EmptyWithDrop; 1688 | impl Drop for EmptyWithDrop { 1689 | fn drop(&mut self) {} 1690 | } 1691 | let compiler_uses_inline_drop_flags = mem::size_of::() > 0; 1692 | 1693 | let correct = mem::size_of::<*const ()>() 1694 | + 8 1695 | + if compiler_uses_inline_drop_flags { 1696 | 1 1697 | } else { 1698 | 0 1699 | }; 1700 | 1701 | assert_eq!(correct, mem::size_of::()); 1702 | assert_eq!(correct, mem::size_of::()); 1703 | 1704 | // This is no longer true. 
See https://github.com/servo/tendril/issues/66 1705 | // assert_eq!(correct, mem::size_of::<Option<ByteTendril>>()); 1706 | // assert_eq!(correct, mem::size_of::<Option<StrTendril>>()); 1707 | 1708 | assert_eq!( 1709 | mem::size_of::<*const ()>() * 2, 1710 | mem::size_of::<Header<Atomic>>(), 1711 | ); 1712 | assert_eq!( 1713 | mem::size_of::<Header<Atomic>>(), 1714 | mem::size_of::<Header<NonAtomic>>(), 1715 | ); 1716 | } 1717 | 1718 | #[test] 1719 | fn validate_utf8() { 1720 | assert!(ByteTendril::try_from_byte_slice(b"\xFF").is_ok()); 1721 | assert!(StrTendril::try_from_byte_slice(b"\xFF").is_err()); 1722 | assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xFF").is_err()); 1723 | assert!(StrTendril::try_from_byte_slice(b"\xEA\x99").is_err()); 1724 | assert!(StrTendril::try_from_byte_slice(b"\xEA\x99\xAE\xEA").is_err()); 1725 | assert_eq!( 1726 | "\u{a66e}", 1727 | &*StrTendril::try_from_byte_slice(b"\xEA\x99\xAE").unwrap() 1728 | ); 1729 | 1730 | let mut t = StrTendril::new(); 1731 | assert!(t.try_push_bytes(b"\xEA\x99").is_err()); 1732 | assert!(t.try_push_bytes(b"\xAE").is_err()); 1733 | assert!(t.try_push_bytes(b"\xEA\x99\xAE").is_ok()); 1734 | assert_eq!("\u{a66e}", &*t); 1735 | } 1736 | 1737 | #[test] 1738 | fn share_and_unshare() { 1739 | let s = b"foobarbaz".to_tendril(); 1740 | assert_eq!(b"foobarbaz", &*s); 1741 | assert!(!s.is_shared()); 1742 | 1743 | let mut t = s.clone(); 1744 | assert_eq!(s.as_ptr(), t.as_ptr()); 1745 | assert!(s.is_shared()); 1746 | assert!(t.is_shared()); 1747 | 1748 | t.push_slice(b"quux"); 1749 | assert_eq!(b"foobarbaz", &*s); 1750 | assert_eq!(b"foobarbazquux", &*t); 1751 | assert!(s.as_ptr() != t.as_ptr()); 1752 | assert!(!t.is_shared()); 1753 | } 1754 | 1755 | #[test] 1756 | fn format_display() { 1757 | assert_eq!("foobar", &*format!("{}", "foobar".to_tendril())); 1758 | 1759 | let mut s = "foo".to_tendril(); 1760 | assert_eq!("foo", &*format!("{}", s)); 1761 | 1762 | let t = s.clone(); 1763 | assert_eq!("foo", &*format!("{}", s)); 1764 | assert_eq!("foo", &*format!("{}", t)); 1765 | 1766 | s.push_slice("barbaz!"); 1767 | assert_eq!("foobarbaz!", &*format!("{}", s)); 1768 | assert_eq!("foo", &*format!("{}", t)); 1769 | } 1770 | 1771 | #[test] 1772 | fn format_debug() { 1773 | assert_eq!( 1774 | r#"Tendril(inline: "foobar")"#, 1775 | &*format!("{:?}", "foobar".to_tendril()) 1776 | ); 1777 | assert_eq!( 1778 | r#"Tendril(inline: [102, 111, 111, 98, 97, 114])"#, 1779 | &*format!("{:?}", b"foobar".to_tendril()) 1780 | ); 1781 | 1782 | let t = "anextralongstring".to_tendril(); 1783 | assert_eq!( 1784 | r#"Tendril(owned: "anextralongstring")"#, 1785 | &*format!("{:?}", t) 1786 | ); 1787 | let _ = t.clone(); 1788 | assert_eq!( 1789 | r#"Tendril(shared: "anextralongstring")"#, 1790 | &*format!("{:?}", t) 1791 | ); 1792 | } 1793 | 1794 | #[test] 1795 | fn subtendril() { 1796 | assert_eq!("foo".to_tendril(), "foo-bar".to_tendril().subtendril(0, 3)); 1797 | assert_eq!("bar".to_tendril(), "foo-bar".to_tendril().subtendril(4, 3)); 1798 | 1799 | let mut t = "foo-bar".to_tendril(); 1800 | t.pop_front(2); 1801 | assert_eq!("o-bar".to_tendril(), t); 1802 | t.pop_back(1); 1803 | assert_eq!("o-ba".to_tendril(), t); 1804 | 1805 | assert_eq!( 1806 | "foo".to_tendril(), 1807 | "foo-a-longer-string-bar-baz".to_tendril().subtendril(0, 3) 1808 | ); 1809 | assert_eq!( 1810 | "oo-a-".to_tendril(), 1811 | "foo-a-longer-string-bar-baz".to_tendril().subtendril(1, 5) 1812 | ); 1813 | assert_eq!( 1814 | "bar".to_tendril(), 1815 | "foo-a-longer-string-bar-baz".to_tendril().subtendril(20, 3) 1816 | ); 1817 | 1818 | let mut t = "another rather long string".to_tendril();
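// Editorial note (added commentary): this string exceeds the inline limit, so the
// pops below adjust the heap buffer in place and mark it shared, which is what
// the final `is_shared()` assertion relies on.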
1819 | t.pop_front(2); 1820 | assert!(t.starts_with("other rather")); 1821 | t.pop_back(1); 1822 | assert_eq!("other rather long strin".to_tendril(), t); 1823 | assert!(t.is_shared()); 1824 | } 1825 | 1826 | #[test] 1827 | fn subtendril_invalid() { 1828 | assert!("\u{a66e}".to_tendril().try_subtendril(0, 2).is_err()); 1829 | assert!("\u{a66e}".to_tendril().try_subtendril(1, 2).is_err()); 1830 | 1831 | assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 3).is_err()); 1832 | assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 2).is_err()); 1833 | assert!("\u{1f4a9}".to_tendril().try_subtendril(0, 1).is_err()); 1834 | assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 3).is_err()); 1835 | assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 2).is_err()); 1836 | assert!("\u{1f4a9}".to_tendril().try_subtendril(1, 1).is_err()); 1837 | assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 2).is_err()); 1838 | assert!("\u{1f4a9}".to_tendril().try_subtendril(2, 1).is_err()); 1839 | assert!("\u{1f4a9}".to_tendril().try_subtendril(3, 1).is_err()); 1840 | 1841 | let mut t = "\u{1f4a9}zzzzzz".to_tendril(); 1842 | assert!(t.try_pop_front(1).is_err()); 1843 | assert!(t.try_pop_front(2).is_err()); 1844 | assert!(t.try_pop_front(3).is_err()); 1845 | assert!(t.try_pop_front(4).is_ok()); 1846 | assert_eq!("zzzzzz", &*t); 1847 | 1848 | let mut t = "zzzzzz\u{1f4a9}".to_tendril(); 1849 | assert!(t.try_pop_back(1).is_err()); 1850 | assert!(t.try_pop_back(2).is_err()); 1851 | assert!(t.try_pop_back(3).is_err()); 1852 | assert!(t.try_pop_back(4).is_ok()); 1853 | assert_eq!("zzzzzz", &*t); 1854 | } 1855 | 1856 | #[test] 1857 | fn conversion() { 1858 | assert_eq!( 1859 | &[0x66, 0x6F, 0x6F].to_tendril(), 1860 | "foo".to_tendril().as_bytes() 1861 | ); 1862 | assert_eq!( 1863 | [0x66, 0x6F, 0x6F].to_tendril(), 1864 | "foo".to_tendril().into_bytes() 1865 | ); 1866 | 1867 | let ascii: Tendril<fmt::ASCII> = b"hello".to_tendril().try_reinterpret().unwrap(); 1868 | assert_eq!(&"hello".to_tendril(), ascii.as_superset()); 1869 | assert_eq!("hello".to_tendril(), ascii.clone().into_superset()); 1870 | 1871 | assert!(b"\xFF" 1872 | .to_tendril() 1873 | .try_reinterpret::<fmt::ASCII>() 1874 | .is_err()); 1875 | 1876 | let t = "hello".to_tendril(); 1877 | let ascii: &Tendril<fmt::ASCII> = t.try_as_subset().unwrap(); 1878 | assert_eq!(b"hello", &**ascii.as_bytes()); 1879 | 1880 | assert!("ő" 1881 | .to_tendril() 1882 | .try_reinterpret_view::<fmt::ASCII>() 1883 | .is_err()); 1884 | assert!("ő".to_tendril().try_as_subset::<fmt::ASCII>().is_err()); 1885 | 1886 | let ascii: Tendril<fmt::ASCII> = "hello".to_tendril().try_into_subset().unwrap(); 1887 | assert_eq!(b"hello", &**ascii.as_bytes()); 1888 | 1889 | assert!("ő".to_tendril().try_reinterpret::<fmt::ASCII>().is_err()); 1890 | assert!("ő".to_tendril().try_into_subset::<fmt::ASCII>().is_err()); 1891 | } 1892 | 1893 | #[test] 1894 | fn clear() { 1895 | let mut t = "foo-".to_tendril(); 1896 | t.clear(); 1897 | assert_eq!(t.len(), 0); 1898 | assert_eq!(t.len32(), 0); 1899 | assert_eq!(&*t, ""); 1900 | 1901 | let mut t = "much longer".to_tendril(); 1902 | let s = t.clone(); 1903 | t.clear(); 1904 | assert_eq!(t.len(), 0); 1905 | assert_eq!(t.len32(), 0); 1906 | assert_eq!(&*t, ""); 1907 | assert_eq!(&*s, "much longer"); 1908 | } 1909 | 1910 | #[test] 1911 | fn push_tendril() { 1912 | let mut t = "abc".to_tendril(); 1913 | t.push_tendril(&"xyz".to_tendril()); 1914 | assert_eq!("abcxyz", &*t); 1915 | } 1916 | 1917 | #[test] 1918 | fn wtf8() { 1919 | assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xA0\xBD").is_ok()); 1920 | assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xB2\xA9").is_ok());
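// Editorial note (added commentary): \xED\xA0\xBD and \xED\xB2\xA9 are the WTF-8
// encodings of the lone surrogates U+D83D and U+DCA9. Each is valid on its own,
// but WTF-8 forbids a lead surrogate immediately followed by a trail surrogate,
// so their raw concatenation below must be rejected; the pair has to be encoded
// as the single code point U+1F4A9 (\xF0\x9F\x92\xA9), as the later asserts show.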
1921 | assert!(Tendril::<fmt::WTF8>::try_from_byte_slice(b"\xED\xA0\xBD\xED\xB2\xA9").is_err()); 1922 | 1923 | let t: Tendril<fmt::WTF8> = 1924 | Tendril::try_from_byte_slice(b"\xED\xA0\xBD\xEA\x99\xAE").unwrap(); 1925 | assert!(b"\xED\xA0\xBD".to_tendril().try_reinterpret().unwrap() == t.subtendril(0, 3)); 1926 | assert!(b"\xEA\x99\xAE".to_tendril().try_reinterpret().unwrap() == t.subtendril(3, 3)); 1927 | assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); 1928 | 1929 | assert!(t.try_subtendril(0, 1).is_err()); 1930 | assert!(t.try_subtendril(0, 2).is_err()); 1931 | assert!(t.try_subtendril(1, 1).is_err()); 1932 | 1933 | assert!(t.try_subtendril(3, 1).is_err()); 1934 | assert!(t.try_subtendril(3, 2).is_err()); 1935 | assert!(t.try_subtendril(4, 1).is_err()); 1936 | 1937 | // paired surrogates 1938 | let mut t: Tendril<fmt::WTF8> = Tendril::try_from_byte_slice(b"\xED\xA0\xBD").unwrap(); 1939 | assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); 1940 | assert_eq!(b"\xF0\x9F\x92\xA9", t.as_byte_slice()); 1941 | assert!(t.try_reinterpret_view::<fmt::UTF8>().is_ok()); 1942 | 1943 | // unpaired surrogates 1944 | let mut t: Tendril<fmt::WTF8> = Tendril::try_from_byte_slice(b"\xED\xA0\xBB").unwrap(); 1945 | assert!(t.try_push_bytes(b"\xED\xA0").is_err()); 1946 | assert!(t.try_push_bytes(b"\xED").is_err()); 1947 | assert!(t.try_push_bytes(b"\xA0").is_err()); 1948 | assert!(t.try_push_bytes(b"\xED\xA0\xBD").is_ok()); 1949 | assert_eq!(b"\xED\xA0\xBB\xED\xA0\xBD", t.as_byte_slice()); 1950 | assert!(t.try_push_bytes(b"\xED\xB2\xA9").is_ok()); 1951 | assert_eq!(b"\xED\xA0\xBB\xF0\x9F\x92\xA9", t.as_byte_slice()); 1952 | assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); 1953 | } 1954 | 1955 | #[test] 1956 | fn front_char() { 1957 | let mut t = "".to_tendril(); 1958 | assert_eq!(None, t.pop_front_char()); 1959 | assert_eq!(None, t.pop_front_char()); 1960 | 1961 | let mut t = "abc".to_tendril(); 1962 | assert_eq!(Some('a'), t.pop_front_char()); 1963 | assert_eq!(Some('b'), t.pop_front_char()); 1964 | assert_eq!(Some('c'), t.pop_front_char()); 1965 | assert_eq!(None, t.pop_front_char()); 1966 | assert_eq!(None, t.pop_front_char()); 1967 | 1968 | let mut t = "főo-a-longer-string-bar-baz".to_tendril(); 1969 | assert_eq!(28, t.len()); 1970 | assert_eq!(Some('f'), t.pop_front_char()); 1971 | assert_eq!(Some('ő'), t.pop_front_char()); 1972 | assert_eq!(Some('o'), t.pop_front_char()); 1973 | assert_eq!(Some('-'), t.pop_front_char()); 1974 | assert_eq!(23, t.len()); 1975 | } 1976 | 1977 | #[test] 1978 | fn char_run() { 1979 | for &(s, exp) in &[ 1980 | ("", None), 1981 | (" ", Some((" ", true))), 1982 | ("x", Some(("x", false))), 1983 | (" \t \n", Some((" \t \n", true))), 1984 | ("xyzzy", Some(("xyzzy", false))), 1985 | (" xyzzy", Some((" ", true))), 1986 | ("xyzzy ", Some(("xyzzy", false))), 1987 | (" xyzzy ", Some((" ", true))), 1988 | ("xyzzy hi", Some(("xyzzy", false))), 1989 | ("中 ", Some(("中", false))), 1990 | (" 中 ", Some((" ", true))), 1991 | (" 中 ", Some((" ", true))), 1992 | (" 中 ", Some((" ", true))), 1993 | ] { 1994 | let mut t = s.to_tendril(); 1995 | let res = t.pop_front_char_run(char::is_whitespace); 1996 | match exp { 1997 | None => assert!(res.is_none()), 1998 | Some((es, ec)) => { 1999 | let (rt, rc) = res.unwrap(); 2000 | assert_eq!(es, &*rt); 2001 | assert_eq!(ec, rc); 2002 | } 2003 | } 2004 | } 2005 | } 2006 | 2007 | #[test] 2008 | fn deref_mut_inline() { 2009 | let mut t = "xyő".to_tendril().into_bytes(); 2010 | t[3] = 0xff; 2011 | assert_eq!(b"xy\xC5\xFF", &*t); 2012 | assert!(t.try_reinterpret_view::<fmt::UTF8>().is_err()); 2013 | t[3] = 0x8b; 2014 |
assert_eq!("xyŋ", &**t.try_reinterpret_view::().unwrap()); 2015 | 2016 | unsafe { 2017 | t.push_uninitialized(3); 2018 | t[4] = 0xEA; 2019 | t[5] = 0x99; 2020 | t[6] = 0xAE; 2021 | assert_eq!( 2022 | "xyŋ\u{a66e}", 2023 | &**t.try_reinterpret_view::().unwrap() 2024 | ); 2025 | t.push_uninitialized(20); 2026 | t.pop_back(20); 2027 | assert_eq!( 2028 | "xyŋ\u{a66e}", 2029 | &**t.try_reinterpret_view::().unwrap() 2030 | ); 2031 | } 2032 | } 2033 | 2034 | #[test] 2035 | fn deref_mut() { 2036 | let mut t = b"0123456789".to_tendril(); 2037 | let u = t.clone(); 2038 | assert!(t.is_shared()); 2039 | t[9] = 0xff; 2040 | assert!(!t.is_shared()); 2041 | assert_eq!(b"0123456789", &*u); 2042 | assert_eq!(b"012345678\xff", &*t); 2043 | } 2044 | 2045 | #[test] 2046 | fn push_char() { 2047 | let mut t = "xyz".to_tendril(); 2048 | t.push_char('o'); 2049 | assert_eq!("xyzo", &*t); 2050 | t.push_char('ő'); 2051 | assert_eq!("xyzoő", &*t); 2052 | t.push_char('\u{a66e}'); 2053 | assert_eq!("xyzoő\u{a66e}", &*t); 2054 | t.push_char('\u{1f4a9}'); 2055 | assert_eq!("xyzoő\u{a66e}\u{1f4a9}", &*t); 2056 | assert_eq!(t.len(), 13); 2057 | } 2058 | 2059 | #[test] 2060 | #[cfg(feature = "encoding")] 2061 | fn encode() { 2062 | use encoding::{all, EncoderTrap}; 2063 | 2064 | let t = "안녕하세요 러스트".to_tendril(); 2065 | assert_eq!( 2066 | b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae", 2067 | &*t.encode(all::WINDOWS_949, EncoderTrap::Strict).unwrap() 2068 | ); 2069 | 2070 | let t = "Энергия пробуждения ия-я-я! \u{a66e}".to_tendril(); 2071 | assert_eq!( 2072 | b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ 2073 | \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21 ?", 2074 | &*t.encode(all::KOI8_U, EncoderTrap::Replace).unwrap() 2075 | ); 2076 | 2077 | let t = "\u{1f4a9}".to_tendril(); 2078 | assert!(t.encode(all::WINDOWS_1252, EncoderTrap::Strict).is_err()); 2079 | } 2080 | 2081 | #[test] 2082 | #[cfg(feature = "encoding")] 2083 | fn decode() { 2084 | use encoding::{all, DecoderTrap}; 2085 | 2086 | let t = b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\ 2087 | \xbf\xe4\x20\xb7\xaf\xbd\xba\xc6\xae" 2088 | .to_tendril(); 2089 | assert_eq!( 2090 | "안녕하세요 러스트", 2091 | &*t.decode(all::WINDOWS_949, DecoderTrap::Strict).unwrap() 2092 | ); 2093 | 2094 | let t = b"\xfc\xce\xc5\xd2\xc7\xc9\xd1 \xd0\xd2\xcf\xc2\xd5\xd6\xc4\xc5\xce\ 2095 | \xc9\xd1 \xc9\xd1\x2d\xd1\x2d\xd1\x21" 2096 | .to_tendril(); 2097 | assert_eq!( 2098 | "Энергия пробуждения ия-я-я!", 2099 | &*t.decode(all::KOI8_U, DecoderTrap::Replace).unwrap() 2100 | ); 2101 | 2102 | let t = b"x \xff y".to_tendril(); 2103 | assert!(t.decode(all::UTF_8, DecoderTrap::Strict).is_err()); 2104 | 2105 | let t = b"x \xff y".to_tendril(); 2106 | assert_eq!( 2107 | "x \u{fffd} y", 2108 | &*t.decode(all::UTF_8, DecoderTrap::Replace).unwrap() 2109 | ); 2110 | } 2111 | 2112 | #[test] 2113 | fn ascii() { 2114 | fn mk(x: &[u8]) -> Tendril { 2115 | x.to_tendril().try_reinterpret().unwrap() 2116 | } 2117 | 2118 | let mut t = mk(b"xyz"); 2119 | assert_eq!(Some('x'), t.pop_front_char()); 2120 | assert_eq!(Some('y'), t.pop_front_char()); 2121 | assert_eq!(Some('z'), t.pop_front_char()); 2122 | assert_eq!(None, t.pop_front_char()); 2123 | 2124 | let mut t = mk(b" \t xyz"); 2125 | assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); 2126 | assert!(Some((mk(b"xyz"), false)) == t.pop_front_char_run(char::is_whitespace)); 2127 | assert!(t.pop_front_char_run(char::is_whitespace).is_none()); 2128 | 2129 | let mut t = Tendril::::new(); 2130 | 
assert!(t.try_push_char('x').is_ok()); 2131 | assert!(t.try_push_char('\0').is_ok()); 2132 | assert!(t.try_push_char('\u{a0}').is_err()); 2133 | assert_eq!(b"x\0", t.as_byte_slice()); 2134 | } 2135 | 2136 | #[test] 2137 | fn latin1() { 2138 | fn mk(x: &[u8]) -> Tendril<fmt::Latin1> { 2139 | x.to_tendril().try_reinterpret().unwrap() 2140 | } 2141 | 2142 | let mut t = mk(b"\xd8_\xd8"); 2143 | assert_eq!(Some('Ø'), t.pop_front_char()); 2144 | assert_eq!(Some('_'), t.pop_front_char()); 2145 | assert_eq!(Some('Ø'), t.pop_front_char()); 2146 | assert_eq!(None, t.pop_front_char()); 2147 | 2148 | let mut t = mk(b" \t \xfe\xa7z"); 2149 | assert!(Some((mk(b" \t "), true)) == t.pop_front_char_run(char::is_whitespace)); 2150 | assert!(Some((mk(b"\xfe\xa7z"), false)) == t.pop_front_char_run(char::is_whitespace)); 2151 | assert!(t.pop_front_char_run(char::is_whitespace).is_none()); 2152 | 2153 | let mut t = Tendril::<fmt::Latin1>::new(); 2154 | assert!(t.try_push_char('x').is_ok()); 2155 | assert!(t.try_push_char('\0').is_ok()); 2156 | assert!(t.try_push_char('\u{a0}').is_ok()); 2157 | assert!(t.try_push_char('ő').is_err()); 2158 | assert!(t.try_push_char('я').is_err()); 2159 | assert!(t.try_push_char('\u{a66e}').is_err()); 2160 | assert!(t.try_push_char('\u{1f4a9}').is_err()); 2161 | assert_eq!(b"x\0\xa0", t.as_byte_slice()); 2162 | } 2163 | 2164 | #[test] 2165 | fn format() { 2166 | assert_eq!("", &*format_tendril!("")); 2167 | assert_eq!( 2168 | "two and two make 4", 2169 | &*format_tendril!("two and two make {}", 2 + 2) 2170 | ); 2171 | } 2172 | 2173 | #[test] 2174 | fn merge_shared() { 2175 | let t = "012345678901234567890123456789".to_tendril(); 2176 | let a = t.subtendril(10, 20); 2177 | assert!(a.is_shared()); 2178 | assert_eq!("01234567890123456789", &*a); 2179 | let mut b = t.subtendril(0, 10); 2180 | assert!(b.is_shared()); 2181 | assert_eq!("0123456789", &*b); 2182 | 2183 | b.push_tendril(&a); 2184 | assert!(b.is_shared()); 2185 | assert!(a.is_shared()); 2186 | assert!(a.is_shared_with(&b)); 2187 | assert!(b.is_shared_with(&a)); 2188 | assert_eq!("012345678901234567890123456789", &*b); 2189 | 2190 | assert!(t.is_shared()); 2191 | assert!(t.is_shared_with(&a)); 2192 | assert!(t.is_shared_with(&b)); 2193 | } 2194 | 2195 | #[test] 2196 | fn merge_cant_share() { 2197 | let t = "012345678901234567890123456789".to_tendril(); 2198 | let mut b = t.subtendril(0, 10); 2199 | assert!(b.is_shared()); 2200 | assert_eq!("0123456789", &*b); 2201 | 2202 | b.push_tendril(&"abcd".to_tendril()); 2203 | assert!(!b.is_shared()); 2204 | assert_eq!("0123456789abcd", &*b); 2205 | } 2206 | 2207 | #[test] 2208 | fn shared_doesnt_reserve() { 2209 | let mut t = "012345678901234567890123456789".to_tendril(); 2210 | let a = t.subtendril(1, 10); 2211 | 2212 | assert!(t.is_shared()); 2213 | t.reserve(10); 2214 | assert!(t.is_shared()); 2215 | 2216 | let _ = a; 2217 | } 2218 | 2219 | #[test] 2220 | fn out_of_bounds() { 2221 | assert!("".to_tendril().try_subtendril(0, 1).is_err()); 2222 | assert!("abc".to_tendril().try_subtendril(0, 4).is_err()); 2223 | assert!("abc".to_tendril().try_subtendril(3, 1).is_err()); 2224 | assert!("abc".to_tendril().try_subtendril(7, 1).is_err()); 2225 | 2226 | let mut t = "".to_tendril(); 2227 | assert!(t.try_pop_front(1).is_err()); 2228 | assert!(t.try_pop_front(5).is_err()); 2229 | assert!(t.try_pop_front(500).is_err()); 2230 | assert!(t.try_pop_back(1).is_err()); 2231 | assert!(t.try_pop_back(5).is_err()); 2232 | assert!(t.try_pop_back(500).is_err()); 2233 | 2234 | let mut t = "abcd".to_tendril(); 2235 |
assert!(t.try_pop_front(1).is_ok()); 2236 | assert!(t.try_pop_front(4).is_err()); 2237 | assert!(t.try_pop_front(500).is_err()); 2238 | assert!(t.try_pop_back(1).is_ok()); 2239 | assert!(t.try_pop_back(3).is_err()); 2240 | assert!(t.try_pop_back(500).is_err()); 2241 | } 2242 | 2243 | #[test] 2244 | fn compare() { 2245 | for &a in &[ 2246 | "indiscretions", 2247 | "validity", 2248 | "hallucinogenics", 2249 | "timelessness", 2250 | "original", 2251 | "microcosms", 2252 | "boilers", 2253 | "mammoth", 2254 | ] { 2255 | for &b in &[ 2256 | "intrepidly", 2257 | "frigid", 2258 | "spa", 2259 | "cardigans", 2260 | "guileful", 2261 | "evaporated", 2262 | "unenthusiastic", 2263 | "legitimate", 2264 | ] { 2265 | let ta = a.to_tendril(); 2266 | let tb = b.to_tendril(); 2267 | 2268 | assert_eq!(a.eq(b), ta.eq(&tb)); 2269 | assert_eq!(a.ne(b), ta.ne(&tb)); 2270 | assert_eq!(a.lt(b), ta.lt(&tb)); 2271 | assert_eq!(a.le(b), ta.le(&tb)); 2272 | assert_eq!(a.gt(b), ta.gt(&tb)); 2273 | assert_eq!(a.ge(b), ta.ge(&tb)); 2274 | assert_eq!(a.partial_cmp(b), ta.partial_cmp(&tb)); 2275 | assert_eq!(a.cmp(b), ta.cmp(&tb)); 2276 | } 2277 | } 2278 | } 2279 | 2280 | #[test] 2281 | fn extend_and_from_iterator() { 2282 | // Testing Extend and FromIterator for the various Ts. 2283 | 2284 | // Tendril 2285 | let mut t = "Hello".to_tendril(); 2286 | t.extend(None::<&Tendril<_>>.into_iter()); 2287 | assert_eq!("Hello", &*t); 2288 | t.extend(&[", ".to_tendril(), "world".to_tendril(), "!".to_tendril()]); 2289 | assert_eq!("Hello, world!", &*t); 2290 | assert_eq!( 2291 | "Hello, world!", 2292 | &*[ 2293 | "Hello".to_tendril(), 2294 | ", ".to_tendril(), 2295 | "world".to_tendril(), 2296 | "!".to_tendril() 2297 | ] 2298 | .iter() 2299 | .collect::<StrTendril>() 2300 | ); 2301 | 2302 | // &str 2303 | let mut t = "Hello".to_tendril(); 2304 | t.extend(None::<&str>.into_iter()); 2305 | assert_eq!("Hello", &*t); 2306 | t.extend([", ", "world", "!"].iter().map(|&s| s)); 2307 | assert_eq!("Hello, world!", &*t); 2308 | assert_eq!( 2309 | "Hello, world!", 2310 | &*["Hello", ", ", "world", "!"] 2311 | .iter() 2312 | .map(|&s| s) 2313 | .collect::<StrTendril>() 2314 | ); 2315 | 2316 | // &[u8] 2317 | let mut t = b"Hello".to_tendril(); 2318 | t.extend(None::<&[u8]>.into_iter()); 2319 | assert_eq!(b"Hello", &*t); 2320 | t.extend( 2321 | [b", ".as_ref(), b"world".as_ref(), b"!".as_ref()] 2322 | .iter() 2323 | .map(|&s| s), 2324 | ); 2325 | assert_eq!(b"Hello, world!", &*t); 2326 | assert_eq!( 2327 | b"Hello, world!", 2328 | &*[ 2329 | b"Hello".as_ref(), 2330 | b", ".as_ref(), 2331 | b"world".as_ref(), 2332 | b"!".as_ref() 2333 | ] 2334 | .iter() 2335 | .map(|&s| s) 2336 | .collect::<ByteTendril>() 2337 | ); 2338 | 2339 | let string = "the quick brown fox jumps over the lazy dog"; 2340 | let string_expected = string.to_tendril(); 2341 | let bytes = string.as_bytes(); 2342 | let bytes_expected = bytes.to_tendril(); 2343 | 2344 | // char 2345 | assert_eq!(string_expected, string.chars().collect()); 2346 | let mut tendril = StrTendril::new(); 2347 | tendril.extend(string.chars()); 2348 | assert_eq!(string_expected, tendril); 2349 | 2350 | // &u8 2351 | assert_eq!(bytes_expected, bytes.iter().collect()); 2352 | let mut tendril = ByteTendril::new(); 2353 | tendril.extend(bytes); 2354 | assert_eq!(bytes_expected, tendril); 2355 | 2356 | // u8 2357 | assert_eq!(bytes_expected, bytes.iter().map(|&b| b).collect()); 2358 | let mut tendril = ByteTendril::new(); 2359 | tendril.extend(bytes.iter().map(|&b| b)); 2360 | assert_eq!(bytes_expected, tendril); 2361 | } 2362 | 2363 | #[test] 2364 |
fn from_str() { 2365 | use std::str::FromStr; 2366 | let t: Tendril<_> = FromStr::from_str("foo bar baz").unwrap(); 2367 | assert_eq!("foo bar baz", &*t); 2368 | } 2369 | 2370 | #[test] 2371 | fn from_char() { 2372 | assert_eq!("o", &*StrTendril::from_char('o')); 2373 | assert_eq!("ő", &*StrTendril::from_char('ő')); 2374 | assert_eq!("\u{a66e}", &*StrTendril::from_char('\u{a66e}')); 2375 | assert_eq!("\u{1f4a9}", &*StrTendril::from_char('\u{1f4a9}')); 2376 | } 2377 | 2378 | #[test] 2379 | #[cfg_attr(miri, ignore)] // slow 2380 | fn read() { 2381 | fn check(x: &[u8]) { 2382 | use std::io::Cursor; 2383 | let mut t = ByteTendril::new(); 2384 | assert_eq!(x.len(), Cursor::new(x).read_to_tendril(&mut t).unwrap()); 2385 | assert_eq!(x, &*t); 2386 | } 2387 | 2388 | check(b""); 2389 | check(b"abcd"); 2390 | 2391 | let long: Vec<u8> = iter::repeat(b'x').take(1_000_000).collect(); 2392 | check(&long); 2393 | } 2394 | 2395 | #[test] 2396 | fn hash_map_key() { 2397 | use std::collections::HashMap; 2398 | 2399 | // As noted with Borrow, indexing on HashMap is byte-based because of 2400 | // https://github.com/rust-lang/rust/issues/27108. 2401 | let mut map = HashMap::new(); 2402 | map.insert("foo".to_tendril(), 1); 2403 | assert_eq!(map.get(b"foo".as_ref()), Some(&1)); 2404 | assert_eq!(map.get(b"bar".as_ref()), None); 2405 | 2406 | let mut map = HashMap::new(); 2407 | map.insert(b"foo".to_tendril(), 1); 2408 | assert_eq!(map.get(b"foo".as_ref()), Some(&1)); 2409 | assert_eq!(map.get(b"bar".as_ref()), None); 2410 | } 2411 | 2412 | #[test] 2413 | fn atomic() { 2414 | assert_send::<Tendril<fmt::UTF8, Atomic>>(); 2415 | let s: Tendril<fmt::UTF8, Atomic> = Tendril::from_slice("this is a string"); 2416 | assert!(!s.is_shared()); 2417 | let mut t = s.clone(); 2418 | assert!(s.is_shared()); 2419 | let sp = s.as_ptr() as usize; 2420 | thread::spawn(move || { 2421 | assert!(t.is_shared()); 2422 | t.push_slice(" extended"); 2423 | assert_eq!("this is a string extended", &*t); 2424 | assert!(t.as_ptr() as usize != sp); 2425 | assert!(!t.is_shared()); 2426 | }) 2427 | .join() 2428 | .unwrap(); 2429 | assert!(s.is_shared()); 2430 | assert_eq!("this is a string", &*s); 2431 | } 2432 | 2433 | #[test] 2434 | fn send() { 2435 | assert_send::<SendTendril<fmt::UTF8>>(); 2436 | let s = "this is a string".to_tendril(); 2437 | let t = s.clone(); 2438 | let s2 = s.into_send(); 2439 | thread::spawn(move || { 2440 | let s = StrTendril::from(s2); 2441 | assert!(!s.is_shared()); 2442 | assert_eq!("this is a string", &*s); 2443 | }) 2444 | .join() 2445 | .unwrap(); 2446 | assert_eq!("this is a string", &*t); 2447 | } 2448 | 2449 | /// https://github.com/servo/tendril/issues/58 2450 | #[test] 2451 | fn issue_58() { 2452 | let data = "

Hello!

, World!"; 2453 | let s: Tendril = data.into(); 2454 | assert_eq!(&*s, data); 2455 | let s: Tendril = s.into_send().into(); 2456 | assert_eq!(&*s, data); 2457 | } 2458 | 2459 | #[test] 2460 | fn inline_send() { 2461 | let s = "x".to_tendril(); 2462 | let t = s.clone(); 2463 | let s2 = s.into_send(); 2464 | thread::spawn(move || { 2465 | let s = StrTendril::from(s2); 2466 | assert!(!s.is_shared()); 2467 | assert_eq!("x", &*s); 2468 | }) 2469 | .join() 2470 | .unwrap(); 2471 | assert_eq!("x", &*t); 2472 | } 2473 | } 2474 | --------------------------------------------------------------------------------