├── .gitignore ├── README.md ├── .travis.yml ├── Cargo.toml ├── LICENSE └── src ├── not_quite_std.rs └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /Cargo.lock 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | rust-wtf8 2 | ========= 3 | 4 | [![No Maintenance Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/) 5 | 6 | Historical implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: nightly 3 | 4 | env: 5 | - secure: fwItLzDPQp/KKxgooGrmcjZ8GKFmYx0kbRBG6gWg++blXNBotfv6weYV1IyIN1oIMa2gIf1+k4XH+Sf7OGkqIJ4EH31Wl9caxGBqa6/Rc5aCPaJuNcA20fyBp4BoiZ1R78yudbzvc63fP9wPE7idX0ZaV3k7TybPSTkFNau0Z/g= 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | 3 | name = "wtf8" 4 | version = "0.1.0" 5 | authors = ["Simon Sapin"] 6 | 7 | description = "Implementation of the WTF-8 encoding. https://simonsapin.github.io/wtf-8/" 8 | repository = "https://github.com/SimonSapin/rust-wtf8" 9 | readme = "README.md" 10 | keywords = ["unicode", "encoding", "surrogate"] 11 | license = "MIT" 12 | 13 | [lib] 14 | name = "wtf8" 15 | doctest = false 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Simon Sapin 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /src/not_quite_std.rs: -------------------------------------------------------------------------------- 1 | //! The code in this module is copied from Rust standard library 2 | //! (the `std` crate and crates it is a facade for) 3 | //! at commit 16d80de231abb2b1756f3951ffd4776d681035eb, 4 | //! with the signature changed to use `Wtf8Buf`, `Wtf8`, and `CodePoint` 5 | //! instead of `String`, `&str`, and `char`. 6 | //! 7 | //! FIXME: if and when this is moved into the standard library, 8 | //! try to avoid the code duplication. 9 | //! Maybe by having private generic code that is monomorphized to UTF-8 and WTF-8? 10 | 11 | use core::char; 12 | use core::mem; 13 | use core::slice; 14 | 15 | use super::{Wtf8Buf, Wtf8, CodePoint, IllFormedUtf16CodeUnits}; 16 | 17 | // UTF-8 ranges and tags for encoding characters 18 | // Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs 19 | const TAG_CONT: u8 = 0b1000_0000; 20 | const TAG_TWO_B: u8 = 0b1100_0000; 21 | const TAG_THREE_B: u8 = 0b1110_0000; 22 | const TAG_FOUR_B: u8 = 0b1111_0000; 23 | const MAX_ONE_B: u32 = 0x80; 24 | const MAX_TWO_B: u32 = 0x800; 25 | const MAX_THREE_B: u32 = 0x10000; 26 | 27 | /// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs 28 | #[inline] 29 | fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option { 30 | // Marked #[inline] to allow llvm optimizing it away 31 | if code < MAX_ONE_B && !dst.is_empty() { 32 | dst[0] = code as u8; 33 | Some(1) 34 | } else if code < MAX_TWO_B && dst.len() >= 2 { 35 | dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; 36 | dst[1] = (code & 0x3F) as u8 | TAG_CONT; 37 | Some(2) 38 | } else if code < MAX_THREE_B && dst.len() >= 3 { 39 | dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; 40 | dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; 41 | dst[2] = (code & 0x3F) as u8 | TAG_CONT; 42 | Some(3) 43 | } else if dst.len() >= 4 { 44 | dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; 45 | dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; 46 | dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; 47 | dst[3] = (code & 0x3F) as u8 | TAG_CONT; 48 | Some(4) 49 | } else { 50 | None 51 | } 52 | } 53 | 54 | /// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs 55 | #[inline] 56 | fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option { 57 | // Marked #[inline] to allow llvm optimizing it away 58 | if (ch & 0xFFFF) == ch && !dst.is_empty() { 59 | // The BMP falls through (assuming non-surrogate, as it should) 60 | dst[0] = ch as u16; 61 | Some(1) 62 | } else if dst.len() >= 2 { 63 | // Supplementary planes break into surrogates. 64 | ch -= 0x1_0000; 65 | dst[0] = 0xD800 | ((ch >> 10) as u16); 66 | dst[1] = 0xDC00 | ((ch as u16) & 0x3FF); 67 | Some(2) 68 | } else { 69 | None 70 | } 71 | } 72 | 73 | /// Copied from core::str::next_code_point 74 | #[inline] 75 | pub fn next_code_point(bytes: &mut slice::Iter) -> Option { 76 | // Decode UTF-8 77 | let x = match bytes.next() { 78 | None => return None, 79 | Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32), 80 | Some(&next_byte) => next_byte, 81 | }; 82 | 83 | // Multibyte case follows 84 | // Decode from a byte combination out of: [[[x y] z] w] 85 | // NOTE: Performance is sensitive to the exact formulation here 86 | let init = utf8_first_byte(x, 2); 87 | let y = unwrap_or_0(bytes.next()); 88 | let mut ch = utf8_acc_cont_byte(init, y); 89 | if x >= 0xE0 { 90 | // [[x y z] w] case 91 | // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid 92 | let z = unwrap_or_0(bytes.next()); 93 | let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z); 94 | ch = init << 12 | y_z; 95 | if x >= 0xF0 { 96 | // [x y z w] case 97 | // use only the lower 3 bits of `init` 98 | let w = unwrap_or_0(bytes.next()); 99 | ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w); 100 | } 101 | } 102 | 103 | Some(ch) 104 | } 105 | 106 | #[inline] 107 | fn utf8_first_byte(byte: u8, width: u32) -> u32 { (byte & (0x7F >> width)) as u32 } 108 | 109 | /// Return the value of `ch` updated with continuation byte `byte`. 110 | #[inline] 111 | fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { (ch << 6) | (byte & CONT_MASK) as u32 } 112 | 113 | #[inline] 114 | fn unwrap_or_0(opt: Option<&u8>) -> u8 { 115 | match opt { 116 | Some(&byte) => byte, 117 | None => 0, 118 | } 119 | } 120 | 121 | /// Mask of the value bits of a continuation byte 122 | const CONT_MASK: u8 = 0b0011_1111; 123 | 124 | /// Copied from String::push 125 | /// This does **not** include the WTF-8 concatenation check. 126 | #[inline] 127 | pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) { 128 | let cur_len = string.len(); 129 | // This may use up to 4 bytes. 130 | string.reserve(4); 131 | 132 | unsafe { 133 | // Attempt to not use an intermediate buffer by just pushing bytes 134 | // directly onto this string. 135 | let slice = slice::from_raw_parts_mut( 136 | string.bytes.as_mut_ptr().offset(cur_len as isize), 137 | 4, 138 | ); 139 | let used = encode_utf8_raw(code_point.to_u32(), slice).unwrap_or(0); 140 | string.bytes.set_len(cur_len + used); 141 | } 142 | } 143 | 144 | 145 | /// Copied from core::str::StrPrelude::is_char_boundary 146 | #[inline] 147 | pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool { 148 | if index == slice.len() { return true; } 149 | match slice.bytes.get(index) { 150 | None => false, 151 | Some(&b) => b < 128u8 || b >= 192u8, 152 | } 153 | } 154 | 155 | /// Copied from core::str::raw::slice_unchecked 156 | #[inline] 157 | pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 { 158 | mem::transmute(slice::from_raw_parts( 159 | s.bytes.as_ptr().offset(begin as isize), 160 | end - begin, 161 | )) 162 | } 163 | 164 | /// Copied from core::str::raw::slice_error_fail 165 | #[inline(never)] 166 | pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! { 167 | assert!(begin <= end); 168 | panic!("index {} and/or {} in {:?} do not lie on character boundary", 169 | begin, end, s); 170 | } 171 | 172 | 173 | /// Copied from core::str::Utf16CodeUnits::next 174 | pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option { 175 | if iter.extra != 0 { 176 | let tmp = iter.extra; 177 | iter.extra = 0; 178 | return Some(tmp); 179 | } 180 | 181 | let mut buf = [0u16; 2]; 182 | iter.code_points.next().map(|code_point| { 183 | let n = encode_utf16_raw(code_point.to_u32(), &mut buf).unwrap_or(0); 184 | if n == 2 { iter.extra = buf[1]; } 185 | buf[0] 186 | }) 187 | } 188 | 189 | /// Copied from src/librustc_unicode/char.rs 190 | pub struct DecodeUtf16 191 | where I: Iterator 192 | { 193 | iter: I, 194 | buf: Option, 195 | } 196 | 197 | 198 | /// Copied from src/librustc_unicode/char.rs 199 | #[inline] 200 | pub fn decode_utf16>(iterable: I) -> DecodeUtf16 { 201 | DecodeUtf16 { 202 | iter: iterable.into_iter(), 203 | buf: None, 204 | } 205 | } 206 | 207 | /// Copied from src/librustc_unicode/char.rs 208 | impl> Iterator for DecodeUtf16 { 209 | type Item = Result; 210 | 211 | fn next(&mut self) -> Option> { 212 | let u = match self.buf.take() { 213 | Some(buf) => buf, 214 | None => match self.iter.next() { 215 | Some(u) => u, 216 | None => return None, 217 | }, 218 | }; 219 | 220 | if u < 0xD800 || 0xDFFF < u { 221 | // not a surrogate 222 | Some(Ok(unsafe { char::from_u32_unchecked(u as u32) })) 223 | } else if u >= 0xDC00 { 224 | // a trailing surrogate 225 | Some(Err(u)) 226 | } else { 227 | let u2 = match self.iter.next() { 228 | Some(u2) => u2, 229 | // eof 230 | None => return Some(Err(u)), 231 | }; 232 | if u2 < 0xDC00 || u2 > 0xDFFF { 233 | // not a trailing surrogate so we're not a valid 234 | // surrogate pair, so rewind to redecode u2 next time. 235 | self.buf = Some(u2); 236 | return Some(Err(u)); 237 | } 238 | 239 | // all ok, so lets decode it. 240 | let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; 241 | Some(Ok(unsafe { char::from_u32_unchecked(c) })) 242 | } 243 | } 244 | 245 | #[inline] 246 | fn size_hint(&self) -> (usize, Option) { 247 | let (low, high) = self.iter.size_hint(); 248 | // we could be entirely valid surrogates (2 elements per 249 | // char), or entirely non-surrogates (1 element per char) 250 | (low / 2, high) 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | 3 | Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). 4 | 5 | This library uses Rust’s type system to maintain 6 | [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed), 7 | like the `String` and `&str` types do for UTF-8. 8 | 9 | Since [WTF-8 must not be used 10 | for interchange](https://simonsapin.github.io/wtf-8/#intended-audience), 11 | this library deliberately does not provide access to the underlying bytes 12 | of WTF-8 strings, 13 | nor can it decode WTF-8 from arbitrary bytes. 14 | WTF-8 strings can be obtained from UTF-8, UTF-16, or code points. 15 | 16 | */ 17 | 18 | #![no_std] 19 | 20 | extern crate alloc; 21 | 22 | use alloc::borrow::Cow; 23 | use alloc::string::String; 24 | use alloc::vec::Vec; 25 | use core::str; 26 | use core::cmp::Ordering; 27 | use core::fmt; 28 | use core::hash; 29 | use core::iter::{FromIterator, IntoIterator}; 30 | use core::mem::transmute; 31 | use core::ops::Deref; 32 | use core::slice; 33 | 34 | mod not_quite_std; 35 | 36 | 37 | static UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD"; 38 | 39 | /// A Unicode code point: from U+0000 to U+10FFFF. 40 | /// 41 | /// Compare with the `char` type, 42 | /// which represents a Unicode scalar value: 43 | /// a code point that is not a surrogate (U+D800 to U+DFFF). 44 | #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] 45 | pub struct CodePoint { 46 | value: u32 47 | } 48 | 49 | impl Copy for CodePoint {} 50 | 51 | 52 | /// Format the code point as `U+` followed by four to six hexadecimal digits. 53 | /// Example: `U+1F4A9` 54 | impl fmt::Debug for CodePoint { 55 | #[inline] 56 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { 57 | write!(formatter, "U+{:04X}", self.value) 58 | } 59 | } 60 | 61 | 62 | impl CodePoint { 63 | /// Unsafely create a new `CodePoint` without checking the value. 64 | /// 65 | /// Only use when `value` is known to be less than or equal to 0x10FFFF. 66 | #[inline] 67 | pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint { 68 | CodePoint { value: value } 69 | } 70 | 71 | /// Create a new `CodePoint` if the value is a valid code point. 72 | /// 73 | /// Return `None` if `value` is above 0x10FFFF. 74 | #[inline] 75 | pub fn from_u32(value: u32) -> Option { 76 | match value { 77 | 0 ..= 0x10FFFF => Some(CodePoint { value: value }), 78 | _ => None 79 | } 80 | } 81 | 82 | /// Create a new `CodePoint` from a `char`. 83 | /// 84 | /// Since all Unicode scalar values are code points, this always succeds. 85 | #[inline] 86 | pub fn from_char(value: char) -> CodePoint { 87 | CodePoint { value: value as u32 } 88 | } 89 | 90 | /// Return the numeric value of the code point. 91 | #[inline] 92 | pub fn to_u32(&self) -> u32 { 93 | self.value 94 | } 95 | 96 | /// Optionally return a Unicode scalar value for the code point. 97 | /// 98 | /// Return `None` if the code point is a surrogate (from U+D800 to U+DFFF). 99 | #[inline] 100 | pub fn to_char(&self) -> Option { 101 | match self.value { 102 | 0xD800 ..= 0xDFFF => None, 103 | _ => Some(unsafe { transmute(self.value) }) 104 | } 105 | } 106 | 107 | /// Return a Unicode scalar value for the code point. 108 | /// 109 | /// Return `'\u{FFFD}'` (the replacement character “�”) 110 | /// if the code point is a surrogate (from U+D800 to U+DFFF). 111 | #[inline] 112 | pub fn to_char_lossy(&self) -> char { 113 | self.to_char().unwrap_or('\u{FFFD}') 114 | } 115 | } 116 | 117 | 118 | /// An owned, growable string of well-formed WTF-8 data. 119 | /// 120 | /// Similar to `String`, but can additionally contain surrogate code points 121 | /// if they’re not in a surrogate pair. 122 | #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)] 123 | pub struct Wtf8Buf { 124 | bytes: Vec 125 | } 126 | 127 | impl Deref for Wtf8Buf { 128 | type Target = Wtf8; 129 | 130 | fn deref(&self) -> &Wtf8 { 131 | unsafe { transmute(&*self.bytes) } 132 | } 133 | } 134 | 135 | /// Format the string with double quotes, 136 | /// and surrogates as `\u` followed by four hexadecimal digits. 137 | /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800] 138 | impl fmt::Debug for Wtf8Buf { 139 | #[inline] 140 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { 141 | Wtf8::fmt(self, formatter) 142 | } 143 | } 144 | 145 | 146 | impl Wtf8Buf { 147 | /// Create an new, empty WTF-8 string. 148 | #[inline] 149 | pub fn new() -> Wtf8Buf { 150 | Wtf8Buf { bytes: Vec::new() } 151 | } 152 | 153 | /// Create an new, empty WTF-8 string with pre-allocated capacity for `n` bytes. 154 | #[inline] 155 | pub fn with_capacity(n: usize) -> Wtf8Buf { 156 | Wtf8Buf { bytes: Vec::with_capacity(n) } 157 | } 158 | 159 | /// Create a WTF-8 string from an UTF-8 `String`. 160 | /// 161 | /// This takes ownership of the `String` and does not copy. 162 | /// 163 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. 164 | #[inline] 165 | pub fn from_string(string: String) -> Wtf8Buf { 166 | Wtf8Buf { bytes: string.into_bytes() } 167 | } 168 | 169 | /// Create a WTF-8 string from an UTF-8 `&str` slice. 170 | /// 171 | /// This copies the content of the slice. 172 | /// 173 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. 174 | #[inline] 175 | pub fn from_str(str: &str) -> Wtf8Buf { 176 | Wtf8Buf { bytes: str.as_bytes().to_vec() } 177 | } 178 | 179 | /// Create a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units. 180 | /// 181 | /// This is lossless: calling `.to_ill_formed_utf16()` on the resulting string 182 | /// will always return the original code units. 183 | pub fn from_ill_formed_utf16(v: &[u16]) -> Wtf8Buf { 184 | let mut string = Wtf8Buf::with_capacity(v.len()); 185 | for item in not_quite_std::decode_utf16(v.iter().cloned()) { 186 | match item { 187 | Ok(c) => string.push_char(c), 188 | Err(s) => { 189 | // Surrogates are known to be in the code point range. 190 | let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) }; 191 | // Skip the WTF-8 concatenation check, 192 | // surrogate pairs are already decoded by utf16_items 193 | not_quite_std::push_code_point(&mut string, code_point) 194 | } 195 | } 196 | } 197 | string 198 | } 199 | 200 | /// Reserves capacity for at least `additional` more bytes to be inserted 201 | /// in the given `Wtf8Buf`. 202 | /// The collection may reserve more space to avoid frequent reallocations. 203 | /// 204 | /// # Panics 205 | /// 206 | /// Panics if the new capacity overflows `usize`. 207 | /// 208 | /// # Example 209 | /// 210 | /// ``` 211 | /// let mut s = Wtf8Buf::new(); 212 | /// s.reserve(10); 213 | /// assert!(s.capacity() >= 10); 214 | /// ``` 215 | #[inline] 216 | pub fn reserve(&mut self, additional: usize) { 217 | self.bytes.reserve(additional) 218 | } 219 | 220 | /// Returns the number of bytes that this string buffer can hold without reallocating. 221 | /// 222 | /// # Example 223 | /// 224 | /// ``` 225 | /// let s = Wtf8Buf::with_capacity(10); 226 | /// assert!(s.capacity() >= 10); 227 | /// ``` 228 | #[inline] 229 | pub fn capacity(&self) -> usize { 230 | self.bytes.capacity() 231 | } 232 | 233 | /// Append an UTF-8 slice at the end of the string. 234 | #[inline] 235 | pub fn push_str(&mut self, other: &str) { 236 | self.bytes.extend_from_slice(other.as_bytes()) 237 | } 238 | 239 | /// Append a WTF-8 slice at the end of the string. 240 | /// 241 | /// This replaces newly paired surrogates at the boundary 242 | /// with a supplementary code point, 243 | /// like concatenating ill-formed UTF-16 strings effectively would. 244 | #[inline] 245 | pub fn push_wtf8(&mut self, other: &Wtf8) { 246 | match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) { 247 | // Replace newly paired surrogates by a supplementary code point. 248 | (Some(lead), Some(trail)) => { 249 | let len_without_lead_surrogate = self.len() - 3; 250 | self.bytes.truncate(len_without_lead_surrogate); 251 | let other_without_trail_surrogate = &other.bytes[3..]; 252 | // 4 bytes for the supplementary code point 253 | self.bytes.reserve(4 + other_without_trail_surrogate.len()); 254 | self.push_char(decode_surrogate_pair(lead, trail)); 255 | self.bytes.extend_from_slice(other_without_trail_surrogate); 256 | } 257 | _ => self.bytes.extend_from_slice(&other.bytes) 258 | } 259 | } 260 | 261 | /// Append a Unicode scalar value at the end of the string. 262 | #[inline] 263 | pub fn push_char(&mut self, c: char) { 264 | not_quite_std::push_code_point(self, CodePoint::from_char(c)) 265 | } 266 | 267 | /// Append a code point at the end of the string. 268 | /// 269 | /// This replaces newly paired surrogates at the boundary 270 | /// with a supplementary code point, 271 | /// like concatenating ill-formed UTF-16 strings effectively would. 272 | #[inline] 273 | pub fn push(&mut self, code_point: CodePoint) { 274 | match code_point.to_u32() { 275 | trail @ 0xDC00..=0xDFFF => { 276 | match (&*self).final_lead_surrogate() { 277 | Some(lead) => { 278 | let len_without_lead_surrogate = self.len() - 3; 279 | self.bytes.truncate(len_without_lead_surrogate); 280 | self.push_char(decode_surrogate_pair(lead, trail as u16)); 281 | return 282 | } 283 | _ => {} 284 | } 285 | } 286 | _ => {} 287 | } 288 | 289 | // No newly paired surrogates at the boundary. 290 | not_quite_std::push_code_point(self, code_point) 291 | } 292 | 293 | /// Shortens a string to the specified length. 294 | /// 295 | /// # Failure 296 | /// 297 | /// Fails if `new_len` > current length, 298 | /// or if `new_len` is not a code point boundary. 299 | #[inline] 300 | pub fn truncate(&mut self, new_len: usize) { 301 | assert!(not_quite_std::is_code_point_boundary(self, new_len)); 302 | self.bytes.truncate(new_len) 303 | } 304 | 305 | /// Consume the WTF-8 string and try to convert it to UTF-8. 306 | /// 307 | /// This does not copy the data. 308 | /// 309 | /// If the contents are not well-formed UTF-8 310 | /// (that is, if the string contains surrogates), 311 | /// the original WTF-8 string is returned instead. 312 | pub fn into_string(self) -> Result { 313 | match self.next_surrogate(0) { 314 | None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }), 315 | Some(_) => Err(self), 316 | } 317 | } 318 | 319 | /// Consume the WTF-8 string and convert it lossily to UTF-8. 320 | /// 321 | /// This does not copy the data (but may overwrite parts of it in place). 322 | /// 323 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”) 324 | pub fn into_string_lossy(mut self) -> String { 325 | let mut pos = 0; 326 | loop { 327 | match self.next_surrogate(pos) { 328 | Some((surrogate_pos, _)) => { 329 | pos = surrogate_pos + 3; 330 | self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER); 331 | }, 332 | None => return unsafe { String::from_utf8_unchecked(self.bytes) } 333 | } 334 | } 335 | } 336 | } 337 | 338 | 339 | /// Create a new WTF-8 string from an iterator of code points. 340 | /// 341 | /// This replaces surrogate code point pairs with supplementary code points, 342 | /// like concatenating ill-formed UTF-16 strings effectively would. 343 | impl FromIterator for Wtf8Buf { 344 | fn from_iter>(iterable: T) -> Wtf8Buf { 345 | let mut string = Wtf8Buf::new(); 346 | string.extend(iterable); 347 | string 348 | } 349 | } 350 | 351 | 352 | /// Append code points from an iterator to the string. 353 | /// 354 | /// This replaces surrogate code point pairs with supplementary code points, 355 | /// like concatenating ill-formed UTF-16 strings effectively would. 356 | impl Extend for Wtf8Buf { 357 | fn extend>(&mut self, iterable: T) { 358 | let iterator = iterable.into_iter(); 359 | let (low, _high) = iterator.size_hint(); 360 | // Lower bound of one byte per code point (ASCII only) 361 | self.bytes.reserve(low); 362 | for code_point in iterator { 363 | self.push(code_point); 364 | } 365 | } 366 | } 367 | 368 | /// A borrowed slice of well-formed WTF-8 data. 369 | /// 370 | /// Similar to `&str`, but can additionally contain surrogate code points 371 | /// if they’re not in a surrogate pair. 372 | pub struct Wtf8 { 373 | bytes: [u8] 374 | } 375 | 376 | // FIXME: https://github.com/rust-lang/rust/issues/18805 377 | impl PartialEq for Wtf8 { 378 | fn eq(&self, other: &Wtf8) -> bool { self.bytes.eq(&other.bytes) } 379 | } 380 | 381 | // FIXME: https://github.com/rust-lang/rust/issues/18805 382 | impl Eq for Wtf8 {} 383 | 384 | // FIXME: https://github.com/rust-lang/rust/issues/18738 385 | impl PartialOrd for Wtf8 { 386 | #[inline] 387 | fn partial_cmp(&self, other: &Wtf8) -> Option { 388 | self.bytes.partial_cmp(&other.bytes) 389 | } 390 | #[inline] 391 | fn lt(&self, other: &Wtf8) -> bool { self.bytes.lt(&other.bytes) } 392 | #[inline] 393 | fn le(&self, other: &Wtf8) -> bool { self.bytes.le(&other.bytes) } 394 | #[inline] 395 | fn gt(&self, other: &Wtf8) -> bool { self.bytes.gt(&other.bytes) } 396 | #[inline] 397 | fn ge(&self, other: &Wtf8) -> bool { self.bytes.ge(&other.bytes) } 398 | } 399 | 400 | // FIXME: https://github.com/rust-lang/rust/issues/18738 401 | impl Ord for Wtf8 { 402 | #[inline] 403 | fn cmp(&self, other: &Wtf8) -> Ordering { self.bytes.cmp(&other.bytes) } 404 | } 405 | 406 | 407 | /// Format the slice with double quotes, 408 | /// and surrogates as `\u` followed by four hexadecimal digits. 409 | /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800] 410 | impl fmt::Debug for Wtf8 { 411 | fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { 412 | formatter.write_str("\"")?; 413 | let mut pos = 0; 414 | loop { 415 | match self.next_surrogate(pos) { 416 | None => break, 417 | Some((surrogate_pos, surrogate)) => { 418 | formatter.write_str(unsafe { 419 | str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos]) 420 | })?; 421 | write!(formatter, "\\u{{{:X}}}", surrogate)?; 422 | pos = surrogate_pos + 3; 423 | } 424 | } 425 | } 426 | formatter.write_str(unsafe { 427 | str::from_utf8_unchecked(&self.bytes[pos..]) 428 | })?; 429 | formatter.write_str("\"") 430 | } 431 | } 432 | 433 | 434 | impl Wtf8 { 435 | /// Create a WTF-8 slice from a UTF-8 `&str` slice. 436 | /// 437 | /// Since WTF-8 is a superset of UTF-8, this always succeeds. 438 | #[inline] 439 | pub fn from_str(value: &str) -> &Wtf8 { 440 | unsafe { transmute(value.as_bytes()) } 441 | } 442 | 443 | /// Return the length, in WTF-8 bytes. 444 | #[inline] 445 | pub fn len(&self) -> usize { 446 | self.bytes.len() 447 | } 448 | 449 | /// Return a slice of the given string for the byte range [`begin`..`end`). 450 | /// 451 | /// # Failure 452 | /// 453 | /// Fails when `begin` and `end` do not point to code point boundaries, 454 | /// or point beyond the end of the string. 455 | #[inline] 456 | pub fn slice(&self, begin: usize, end: usize) -> &Wtf8 { 457 | // is_code_point_boundary checks that the index is in [0, .len()] 458 | if begin <= end && 459 | not_quite_std::is_code_point_boundary(self, begin) && 460 | not_quite_std::is_code_point_boundary(self, end) { 461 | unsafe { not_quite_std::slice_unchecked(self, begin, end) } 462 | } else { 463 | not_quite_std::slice_error_fail(self, begin, end) 464 | } 465 | } 466 | 467 | /// Return a slice of the given string from byte `begin` to its end. 468 | /// 469 | /// # Failure 470 | /// 471 | /// Fails when `begin` is not at a code point boundary, 472 | /// or is beyond the end of the string. 473 | #[inline] 474 | pub fn slice_from(&self, begin: usize) -> &Wtf8 { 475 | // is_code_point_boundary checks that the index is in [0, .len()] 476 | if not_quite_std::is_code_point_boundary(self, begin) { 477 | unsafe { not_quite_std::slice_unchecked(self, begin, self.len()) } 478 | } else { 479 | not_quite_std::slice_error_fail(self, begin, self.len()) 480 | } 481 | } 482 | 483 | /// Return a slice of the given string from its beginning to byte `end`. 484 | /// 485 | /// # Failure 486 | /// 487 | /// Fails when `end` is not at a code point boundary, 488 | /// or is beyond the end of the string. 489 | #[inline] 490 | pub fn slice_to(&self, end: usize) -> &Wtf8 { 491 | // is_code_point_boundary checks that the index is in [0, .len()] 492 | if not_quite_std::is_code_point_boundary(self, end) { 493 | unsafe { not_quite_std::slice_unchecked(self, 0, end) } 494 | } else { 495 | not_quite_std::slice_error_fail(self, 0, end) 496 | } 497 | } 498 | 499 | /// Return the code point at `position` if it is in the ASCII range, 500 | /// or `b'\xFF' otherwise. 501 | /// 502 | /// # Failure 503 | /// 504 | /// Fails if `position` is beyond the end of the string. 505 | #[inline] 506 | pub fn ascii_byte_at(&self, position: usize) -> u8 { 507 | match self.bytes[position] { 508 | ascii_byte @ 0x00 ..= 0x7F => ascii_byte, 509 | _ => 0xFF 510 | } 511 | } 512 | 513 | /// Return an iterator for the string’s code points. 514 | #[inline] 515 | pub fn code_points(&self) -> Wtf8CodePoints { 516 | Wtf8CodePoints { bytes: self.bytes.iter() } 517 | } 518 | 519 | /// Try to convert the string to UTF-8 and return a `&str` slice. 520 | /// 521 | /// Return `None` if the string contains surrogates. 522 | /// 523 | /// This does not copy the data. 524 | #[inline] 525 | pub fn as_str(&self) -> Option<&str> { 526 | // Well-formed WTF-8 is also well-formed UTF-8 527 | // if and only if it contains no surrogate. 528 | match self.next_surrogate(0) { 529 | None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }), 530 | Some(_) => None, 531 | } 532 | } 533 | 534 | /// Lossily convert the string to UTF-8. 535 | /// Return an UTF-8 `&str` slice if the contents are well-formed in UTF-8. 536 | /// 537 | /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”). 538 | /// 539 | /// This only copies the data if necessary (if it contains any surrogate). 540 | pub fn to_string_lossy(&self) -> Cow { 541 | let surrogate_pos = match self.next_surrogate(0) { 542 | None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }), 543 | Some((pos, _)) => pos, 544 | }; 545 | let wtf8_bytes = &self.bytes; 546 | let mut utf8_bytes = Vec::with_capacity(self.len()); 547 | utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]); 548 | utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); 549 | let mut pos = surrogate_pos + 3; 550 | loop { 551 | match self.next_surrogate(pos) { 552 | Some((surrogate_pos, _)) => { 553 | utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]); 554 | utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER); 555 | pos = surrogate_pos + 3; 556 | }, 557 | None => { 558 | utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]); 559 | return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) }) 560 | } 561 | } 562 | } 563 | } 564 | 565 | /// Convert the WTF-8 string to potentially ill-formed UTF-16 566 | /// and return an iterator of 16-bit code units. 567 | /// 568 | /// This is lossless: 569 | /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units 570 | /// would always return the original WTF-8 string. 571 | #[inline] 572 | pub fn to_ill_formed_utf16(&self) -> IllFormedUtf16CodeUnits { 573 | IllFormedUtf16CodeUnits { code_points: self.code_points(), extra: 0 } 574 | } 575 | 576 | #[inline] 577 | fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> { 578 | let mut iter = self.bytes[pos..].iter(); 579 | loop { 580 | let b = match iter.next() { 581 | None => return None, 582 | Some(&b) => b, 583 | }; 584 | if b < 0x80 { 585 | pos += 1; 586 | } else if b < 0xE0 { 587 | iter.next(); 588 | pos += 2; 589 | } else if b == 0xED { 590 | match (iter.next(), iter.next()) { 591 | (Some(&b2), Some(&b3)) if b2 >= 0xA0 => { 592 | return Some((pos, decode_surrogate(b2, b3))) 593 | } 594 | _ => pos += 3 595 | } 596 | } else if b < 0xF0 { 597 | iter.next(); 598 | iter.next(); 599 | pos += 3; 600 | } else { 601 | iter.next(); 602 | iter.next(); 603 | iter.next(); 604 | pos += 4; 605 | } 606 | } 607 | } 608 | 609 | #[inline] 610 | fn final_lead_surrogate(&self) -> Option { 611 | let len = self.len(); 612 | if len < 3 { 613 | return None 614 | } 615 | let seq = &self.bytes[len - 3..]; 616 | if seq[0] == 0xED && 0xA0 <= seq[1] && seq[1] <= 0xAF { 617 | Some(decode_surrogate(seq[1], seq[2])) 618 | } else { 619 | None 620 | } 621 | } 622 | 623 | #[inline] 624 | fn initial_trail_surrogate(&self) -> Option { 625 | let len = self.len(); 626 | if len < 3 { 627 | return None 628 | } 629 | let seq = &self.bytes[..3]; 630 | if seq[0] == 0xED && 0xB0 <= seq[1] && seq[1] <= 0xBF { 631 | Some(decode_surrogate(seq[1], seq[2])) 632 | } else { 633 | None 634 | } 635 | } 636 | } 637 | 638 | 639 | #[inline] 640 | fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 { 641 | // The first byte is assumed to be 0xED 642 | 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F 643 | } 644 | 645 | #[inline] 646 | fn decode_surrogate_pair(lead: u16, trail: u16) -> char { 647 | let code_point = 0x10000 + (((lead as u32 - 0xD800) << 10) | (trail as u32 - 0xDC00)); 648 | unsafe { transmute(code_point) } 649 | } 650 | 651 | 652 | /// Iterator for the code points of a WTF-8 string. 653 | /// 654 | /// Created with the method `.code_points()`. 655 | #[derive(Clone)] 656 | pub struct Wtf8CodePoints<'a> { 657 | bytes: slice::Iter<'a, u8> 658 | } 659 | 660 | impl<'a> Iterator for Wtf8CodePoints<'a> { 661 | type Item = CodePoint; 662 | 663 | #[inline] 664 | fn next(&mut self) -> Option { 665 | match not_quite_std::next_code_point(&mut self.bytes) { 666 | None => None, 667 | Some(value) => { 668 | // Wtf8 invariant says `value` is a valid code point 669 | unsafe { 670 | Some(CodePoint::from_u32_unchecked(value)) 671 | } 672 | } 673 | } 674 | 675 | } 676 | 677 | #[inline] 678 | fn size_hint(&self) -> (usize, Option) { 679 | let (len, _) = self.bytes.size_hint(); 680 | (len.saturating_add(3) / 4, Some(len)) 681 | } 682 | } 683 | 684 | #[derive(Clone)] 685 | pub struct IllFormedUtf16CodeUnits<'a> { 686 | code_points: Wtf8CodePoints<'a>, 687 | extra: u16 688 | } 689 | 690 | impl<'a> Iterator for IllFormedUtf16CodeUnits<'a> { 691 | type Item = u16; 692 | 693 | #[inline] 694 | fn next(&mut self) -> Option { 695 | not_quite_std::next_utf16_code_unit(self) 696 | } 697 | 698 | #[inline] 699 | fn size_hint(&self) -> (usize, Option) { 700 | let (low, high) = self.code_points.size_hint(); 701 | // every code point gets either one u16 or two u16, 702 | // so this iterator is between 1 or 2 times as 703 | // long as the underlying iterator. 704 | (low, high.and_then(|n| n.checked_mul(2))) 705 | } 706 | } 707 | 708 | 709 | impl<'a> PartialEq<&'a Wtf8> for Wtf8Buf { 710 | fn eq(&self, other: &&Wtf8) -> bool { 711 | **self == **other 712 | } 713 | } 714 | 715 | impl<'a> PartialEq for &'a Wtf8 { 716 | fn eq(&self, other: &Wtf8Buf) -> bool { 717 | **self == **other 718 | } 719 | } 720 | 721 | 722 | impl hash::Hash for CodePoint { 723 | #[inline] 724 | fn hash(&self, state: &mut H) { 725 | self.value.hash(state) 726 | } 727 | } 728 | 729 | impl hash::Hash for Wtf8Buf { 730 | #[inline] 731 | fn hash(&self, state: &mut H) { 732 | Wtf8::hash(self, state) 733 | } 734 | } 735 | 736 | impl hash::Hash for Wtf8 { 737 | #[inline] 738 | fn hash(&self, state: &mut H) { 739 | state.write(&self.bytes); 740 | 0xfeu8.hash(state) 741 | } 742 | } 743 | 744 | 745 | #[cfg(test)] 746 | mod tests { 747 | use alloc::format; 748 | use alloc::vec; 749 | use core::mem::transmute; 750 | use super::*; 751 | 752 | #[test] 753 | fn code_point_from_u32() { 754 | assert!(CodePoint::from_u32(0).is_some()); 755 | assert!(CodePoint::from_u32(0xD800).is_some()); 756 | assert!(CodePoint::from_u32(0x10FFFF).is_some()); 757 | assert!(CodePoint::from_u32(0x110000).is_none()); 758 | } 759 | 760 | #[test] 761 | fn code_point_to_u32() { 762 | fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } 763 | assert_eq!(c(0).to_u32(), 0); 764 | assert_eq!(c(0xD800).to_u32(), 0xD800); 765 | assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF); 766 | } 767 | 768 | #[test] 769 | fn code_point_from_char() { 770 | assert_eq!(CodePoint::from_char('a').to_u32(), 0x61); 771 | assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9); 772 | } 773 | 774 | #[test] 775 | fn code_point_to_string() { 776 | assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061"); 777 | assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9"); 778 | } 779 | 780 | #[test] 781 | fn code_point_to_char() { 782 | fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } 783 | assert_eq!(c(0x61).to_char(), Some('a')); 784 | assert_eq!(c(0x1F4A9).to_char(), Some('💩')); 785 | assert_eq!(c(0xD800).to_char(), None); 786 | } 787 | 788 | #[test] 789 | fn code_point_to_char_lossy() { 790 | fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } 791 | assert_eq!(c(0x61).to_char_lossy(), 'a'); 792 | assert_eq!(c(0x1F4A9).to_char_lossy(), '💩'); 793 | assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}'); 794 | } 795 | 796 | #[test] 797 | fn wtf8buf_new() { 798 | assert_eq!(Wtf8Buf::new().bytes, b""); 799 | } 800 | 801 | #[test] 802 | fn wtf8buf_from_str() { 803 | assert_eq!(Wtf8Buf::from_str("").bytes, b""); 804 | assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 805 | } 806 | 807 | #[test] 808 | fn wtf8buf_from_string() { 809 | assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b""); 810 | assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes, 811 | b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 812 | } 813 | 814 | #[test] 815 | fn wtf8buf_from_ill_formed_utf16() { 816 | assert_eq!(Wtf8Buf::from_ill_formed_utf16(&[]).bytes, b""); 817 | assert_eq!(Wtf8Buf::from_ill_formed_utf16( 818 | &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes, 819 | b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9"); 820 | } 821 | 822 | #[test] 823 | fn wtf8buf_push_str() { 824 | let mut string = Wtf8Buf::new(); 825 | assert_eq!(string.bytes, b""); 826 | string.push_str("aé 💩"); 827 | assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 828 | } 829 | 830 | #[test] 831 | fn wtf8buf_push_char() { 832 | let mut string = Wtf8Buf::from_str("aé "); 833 | assert_eq!(string.bytes, b"a\xC3\xA9 "); 834 | string.push_char('💩'); 835 | assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 836 | } 837 | 838 | #[test] 839 | fn wtf8buf_push() { 840 | let mut string = Wtf8Buf::from_str("aé "); 841 | assert_eq!(string.bytes, b"a\xC3\xA9 "); 842 | string.push(CodePoint::from_char('💩')); 843 | assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 844 | 845 | fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } 846 | 847 | let mut string = Wtf8Buf::new(); 848 | string.push(c(0xD83D)); // lead 849 | string.push(c(0xDCA9)); // trail 850 | assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! 851 | 852 | let mut string = Wtf8Buf::new(); 853 | string.push(c(0xD83D)); // lead 854 | string.push(c(0x20)); // not surrogate 855 | string.push(c(0xDCA9)); // trail 856 | assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); 857 | 858 | let mut string = Wtf8Buf::new(); 859 | string.push(c(0xD800)); // lead 860 | string.push(c(0xDBFF)); // lead 861 | assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); 862 | 863 | let mut string = Wtf8Buf::new(); 864 | string.push(c(0xD800)); // lead 865 | string.push(c(0xE000)); // not surrogate 866 | assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); 867 | 868 | let mut string = Wtf8Buf::new(); 869 | string.push(c(0xD7FF)); // not surrogate 870 | string.push(c(0xDC00)); // trail 871 | assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); 872 | 873 | let mut string = Wtf8Buf::new(); 874 | string.push(c(0x61)); // not surrogate, < 3 bytes 875 | string.push(c(0xDC00)); // trail 876 | assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); 877 | 878 | let mut string = Wtf8Buf::new(); 879 | string.push(c(0xDC00)); // trail 880 | assert_eq!(string.bytes, b"\xED\xB0\x80"); 881 | } 882 | 883 | #[test] 884 | fn wtf8buf_push_wtf8() { 885 | let mut string = Wtf8Buf::from_str("aé"); 886 | assert_eq!(string.bytes, b"a\xC3\xA9"); 887 | string.push_wtf8(Wtf8::from_str(" 💩")); 888 | assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 889 | 890 | fn w(value: &[u8]) -> &Wtf8 { unsafe { transmute(value) } } 891 | 892 | let mut string = Wtf8Buf::new(); 893 | string.push_wtf8(w(b"\xED\xA0\xBD")); // lead 894 | string.push_wtf8(w(b"\xED\xB2\xA9")); // trail 895 | assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic! 896 | 897 | let mut string = Wtf8Buf::new(); 898 | string.push_wtf8(w(b"\xED\xA0\xBD")); // lead 899 | string.push_wtf8(w(b" ")); // not surrogate 900 | string.push_wtf8(w(b"\xED\xB2\xA9")); // trail 901 | assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); 902 | 903 | let mut string = Wtf8Buf::new(); 904 | string.push_wtf8(w(b"\xED\xA0\x80")); // lead 905 | string.push_wtf8(w(b"\xED\xAF\xBF")); // lead 906 | assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF"); 907 | 908 | let mut string = Wtf8Buf::new(); 909 | string.push_wtf8(w(b"\xED\xA0\x80")); // lead 910 | string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate 911 | assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80"); 912 | 913 | let mut string = Wtf8Buf::new(); 914 | string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate 915 | string.push_wtf8(w(b"\xED\xB0\x80")); // trail 916 | assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80"); 917 | 918 | let mut string = Wtf8Buf::new(); 919 | string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes 920 | string.push_wtf8(w(b"\xED\xB0\x80")); // trail 921 | assert_eq!(string.bytes, b"\x61\xED\xB0\x80"); 922 | 923 | let mut string = Wtf8Buf::new(); 924 | string.push_wtf8(w(b"\xED\xB0\x80")); // trail 925 | assert_eq!(string.bytes, b"\xED\xB0\x80"); 926 | } 927 | 928 | #[test] 929 | fn wtf8buf_truncate() { 930 | let mut string = Wtf8Buf::from_str("aé"); 931 | string.truncate(1); 932 | assert_eq!(string.bytes, b"a"); 933 | } 934 | 935 | #[test] 936 | #[should_panic] 937 | fn wtf8buf_truncate_fail_code_point_boundary() { 938 | let mut string = Wtf8Buf::from_str("aé"); 939 | string.truncate(2); 940 | } 941 | 942 | #[test] 943 | #[should_panic] 944 | fn wtf8buf_truncate_fail_longer() { 945 | let mut string = Wtf8Buf::from_str("aé"); 946 | string.truncate(4); 947 | } 948 | 949 | #[test] 950 | fn wtf8buf_into_string() { 951 | let mut string = Wtf8Buf::from_str("aé 💩"); 952 | assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩"))); 953 | string.push(CodePoint::from_u32(0xD800).unwrap()); 954 | assert_eq!(string.clone().into_string(), Err(string)); 955 | } 956 | 957 | #[test] 958 | fn wtf8buf_into_string_lossy() { 959 | let mut string = Wtf8Buf::from_str("aé 💩"); 960 | assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩")); 961 | string.push(CodePoint::from_u32(0xD800).unwrap()); 962 | assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�")); 963 | } 964 | 965 | #[test] 966 | fn wtf8buf_from_iterator() { 967 | fn f(values: &[u32]) -> Wtf8Buf { 968 | values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::() 969 | } 970 | assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 971 | 972 | assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! 973 | assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); 974 | assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); 975 | assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); 976 | assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); 977 | assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80"); 978 | assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80"); 979 | } 980 | 981 | #[test] 982 | fn wtf8buf_extend() { 983 | fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf { 984 | fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() } 985 | let mut string = initial.iter().map(c).collect::(); 986 | string.extend(extended.iter().map(c)); 987 | string 988 | } 989 | 990 | assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 991 | 992 | assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic! 993 | assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9"); 994 | assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF"); 995 | assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80"); 996 | assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80"); 997 | assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80"); 998 | assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80"); 999 | } 1000 | 1001 | #[test] 1002 | fn wtf8buf_debug() { 1003 | let mut string = Wtf8Buf::from_str("aé 💩"); 1004 | string.push(CodePoint::from_u32(0xD800).unwrap()); 1005 | assert_eq!(format!("{:?}", string), r#""aé 💩\u{D800}""#); 1006 | } 1007 | 1008 | #[test] 1009 | fn wtf8buf_as_slice() { 1010 | assert_eq!(Wtf8Buf::from_str("aé"), Wtf8::from_str("aé")); 1011 | } 1012 | 1013 | #[test] 1014 | fn wtf8_debug() { 1015 | let mut string = Wtf8Buf::from_str("aé 💩"); 1016 | string.push(CodePoint::from_u32(0xD800).unwrap()); 1017 | assert_eq!(format!("{:?}", &*string), r#""aé 💩\u{D800}""#); 1018 | } 1019 | 1020 | #[test] 1021 | fn wtf8_from_str() { 1022 | assert_eq!(&Wtf8::from_str("").bytes, b""); 1023 | assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9"); 1024 | } 1025 | 1026 | #[test] 1027 | fn wtf8_len() { 1028 | assert_eq!(Wtf8::from_str("").len(), 0); 1029 | assert_eq!(Wtf8::from_str("aé 💩").len(), 8); 1030 | } 1031 | 1032 | #[test] 1033 | fn wtf8_slice() { 1034 | assert_eq!(&Wtf8::from_str("aé 💩").slice(1, 4).bytes, b"\xC3\xA9 "); 1035 | } 1036 | 1037 | #[test] 1038 | #[should_panic] 1039 | fn wtf8_slice_not_code_point_boundary() { 1040 | Wtf8::from_str("aé 💩").slice(2, 4); 1041 | } 1042 | 1043 | #[test] 1044 | fn wtf8_slice_from() { 1045 | assert_eq!(&Wtf8::from_str("aé 💩").slice_from(1).bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9"); 1046 | } 1047 | 1048 | #[test] 1049 | #[should_panic] 1050 | fn wtf8_slice_from_not_code_point_boundary() { 1051 | Wtf8::from_str("aé 💩").slice_from(2); 1052 | } 1053 | 1054 | #[test] 1055 | fn wtf8_slice_to() { 1056 | assert_eq!(&Wtf8::from_str("aé 💩").slice_to(4).bytes, b"a\xC3\xA9 "); 1057 | } 1058 | 1059 | #[test] 1060 | #[should_panic] 1061 | fn wtf8_slice_to_not_code_point_boundary() { 1062 | Wtf8::from_str("aé 💩").slice_from(5); 1063 | } 1064 | 1065 | #[test] 1066 | fn wtf8_ascii_byte_at() { 1067 | let slice = Wtf8::from_str("aé 💩"); 1068 | assert_eq!(slice.ascii_byte_at(0), b'a'); 1069 | assert_eq!(slice.ascii_byte_at(1), b'\xFF'); 1070 | assert_eq!(slice.ascii_byte_at(2), b'\xFF'); 1071 | assert_eq!(slice.ascii_byte_at(3), b' '); 1072 | assert_eq!(slice.ascii_byte_at(4), b'\xFF'); 1073 | } 1074 | 1075 | #[test] 1076 | fn wtf8_code_points() { 1077 | fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() } 1078 | fn cp(string: &Wtf8Buf) -> Vec> { 1079 | string.code_points().map(|c| c.to_char()).collect::>() 1080 | } 1081 | let mut string = Wtf8Buf::from_str("é "); 1082 | assert_eq!(cp(&string), vec![Some('é'), Some(' ')]); 1083 | string.push(c(0xD83D)); 1084 | assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]); 1085 | string.push(c(0xDCA9)); 1086 | assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]); 1087 | } 1088 | 1089 | #[test] 1090 | fn wtf8_as_str() { 1091 | assert_eq!(Wtf8::from_str("").as_str(), Some("")); 1092 | assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩")); 1093 | let mut string = Wtf8Buf::new(); 1094 | string.push(CodePoint::from_u32(0xD800).unwrap()); 1095 | assert_eq!(string.as_str(), None); 1096 | } 1097 | 1098 | #[test] 1099 | fn wtf8_to_string_lossy() { 1100 | assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed("")); 1101 | assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩")); 1102 | let mut string = Wtf8Buf::from_str("aé 💩"); 1103 | string.push(CodePoint::from_u32(0xD800).unwrap()); 1104 | assert_eq!(string.to_string_lossy(), { 1105 | let o: Cow = Cow::Owned(String::from("aé 💩�")); 1106 | o 1107 | }); 1108 | } 1109 | 1110 | #[test] 1111 | fn wtf8_to_ill_formed_utf16() { 1112 | let mut string = Wtf8Buf::from_str("aé "); 1113 | string.push(CodePoint::from_u32(0xD83D).unwrap()); 1114 | string.push_char('💩'); 1115 | assert_eq!(string.to_ill_formed_utf16().collect::>(), 1116 | vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]); 1117 | } 1118 | } 1119 | --------------------------------------------------------------------------------