├── .gitignore
├── README.md
├── .travis.yml
├── Cargo.toml
├── LICENSE
└── src
    ├── not_quite_std.rs
    └── lib.rs


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /Cargo.lock
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | rust-wtf8
2 | =========
3 | 
4 | [![No Maintenance Intended](http://unmaintained.tech/badge.svg)](http://unmaintained.tech/)
5 | 
6 | Historical implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: rust
2 | rust: nightly
3 | 
4 | env:
5 |   - secure: fwItLzDPQp/KKxgooGrmcjZ8GKFmYx0kbRBG6gWg++blXNBotfv6weYV1IyIN1oIMa2gIf1+k4XH+Sf7OGkqIJ4EH31Wl9caxGBqa6/Rc5aCPaJuNcA20fyBp4BoiZ1R78yudbzvc63fP9wPE7idX0ZaV3k7TybPSTkFNau0Z/g=
6 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | 
 3 | name = "wtf8"
 4 | version = "0.1.0"
 5 | authors = ["Simon Sapin"]
 6 | 
 7 | description = "Implementation of the WTF-8 encoding. https://simonsapin.github.io/wtf-8/"
 8 | repository = "https://github.com/SimonSapin/rust-wtf8"
 9 | readme = "README.md"
10 | keywords = ["unicode", "encoding", "surrogate"]
11 | license = "MIT"
12 | 
13 | [lib]
14 | name = "wtf8"
15 | doctest = false
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014 Simon Sapin
 2 | 
 3 | Permission is hereby granted, free of charge, to any
 4 | person obtaining a copy of this software and associated
 5 | documentation files (the "Software"), to deal in the
 6 | Software without restriction, including without
 7 | limitation the rights to use, copy, modify, merge,
 8 | publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software
10 | is furnished to do so, subject to the following
11 | conditions:
12 | 
13 | The above copyright notice and this permission notice
14 | shall be included in all copies or substantial portions
15 | of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | DEALINGS IN THE SOFTWARE.
26 | 


--------------------------------------------------------------------------------
/src/not_quite_std.rs:
--------------------------------------------------------------------------------
  1 | //! The code in this module is copied from Rust standard library
  2 | //! (the `std` crate and crates it is a facade for)
  3 | //! at commit 16d80de231abb2b1756f3951ffd4776d681035eb,
  4 | //! with the signature changed to use `Wtf8Buf`, `Wtf8`, and `CodePoint`
  5 | //! instead of `String`, `&str`, and `char`.
  6 | //!
  7 | //! FIXME: if and when this is moved into the standard library,
  8 | //! try to avoid the code duplication.
  9 | //! Maybe by having private generic code that is monomorphized to UTF-8 and WTF-8?
 10 | 
 11 | use core::char;
 12 | use core::mem;
 13 | use core::slice;
 14 | 
 15 | use super::{Wtf8Buf, Wtf8, CodePoint, IllFormedUtf16CodeUnits};
 16 | 
 17 | // UTF-8 ranges and tags for encoding characters
 18 | // Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
 19 | const TAG_CONT: u8    = 0b1000_0000;
 20 | const TAG_TWO_B: u8   = 0b1100_0000;
 21 | const TAG_THREE_B: u8 = 0b1110_0000;
 22 | const TAG_FOUR_B: u8  = 0b1111_0000;
 23 | const MAX_ONE_B: u32   =     0x80;
 24 | const MAX_TWO_B: u32   =    0x800;
 25 | const MAX_THREE_B: u32 =  0x10000;
 26 | 
 27 | /// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
 28 | #[inline]
 29 | fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
 30 |     // Marked #[inline] to allow llvm optimizing it away
 31 |     if code < MAX_ONE_B && !dst.is_empty() {
 32 |         dst[0] = code as u8;
 33 |         Some(1)
 34 |     } else if code < MAX_TWO_B && dst.len() >= 2 {
 35 |         dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
 36 |         dst[1] = (code & 0x3F) as u8 | TAG_CONT;
 37 |         Some(2)
 38 |     } else if code < MAX_THREE_B && dst.len() >= 3  {
 39 |         dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
 40 |         dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
 41 |         dst[2] = (code & 0x3F) as u8 | TAG_CONT;
 42 |         Some(3)
 43 |     } else if dst.len() >= 4 {
 44 |         dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
 45 |         dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
 46 |         dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
 47 |         dst[3] = (code & 0x3F) as u8 | TAG_CONT;
 48 |         Some(4)
 49 |     } else {
 50 |         None
 51 |     }
 52 | }
 53 | 
 54 | /// Copied from 48d5fe9ec560b53b1f5069219b0d62015e1de5ba^:src/libcore/char.rs
 55 | #[inline]
 56 | fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
 57 |     // Marked #[inline] to allow llvm optimizing it away
 58 |     if (ch & 0xFFFF) == ch && !dst.is_empty() {
 59 |         // The BMP falls through (assuming non-surrogate, as it should)
 60 |         dst[0] = ch as u16;
 61 |         Some(1)
 62 |     } else if dst.len() >= 2 {
 63 |         // Supplementary planes break into surrogates.
 64 |         ch -= 0x1_0000;
 65 |         dst[0] = 0xD800 | ((ch >> 10) as u16);
 66 |         dst[1] = 0xDC00 | ((ch as u16) & 0x3FF);
 67 |         Some(2)
 68 |     } else {
 69 |         None
 70 |     }
 71 | }
 72 | 
 73 | /// Copied from core::str::next_code_point
 74 | #[inline]
 75 | pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
 76 |     // Decode UTF-8
 77 |     let x = match bytes.next() {
 78 |         None => return None,
 79 |         Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
 80 |         Some(&next_byte) => next_byte,
 81 |     };
 82 | 
 83 |     // Multibyte case follows
 84 |     // Decode from a byte combination out of: [[[x y] z] w]
 85 |     // NOTE: Performance is sensitive to the exact formulation here
 86 |     let init = utf8_first_byte(x, 2);
 87 |     let y = unwrap_or_0(bytes.next());
 88 |     let mut ch = utf8_acc_cont_byte(init, y);
 89 |     if x >= 0xE0 {
 90 |         // [[x y z] w] case
 91 |         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
 92 |         let z = unwrap_or_0(bytes.next());
 93 |         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
 94 |         ch = init << 12 | y_z;
 95 |         if x >= 0xF0 {
 96 |             // [x y z w] case
 97 |             // use only the lower 3 bits of `init`
 98 |             let w = unwrap_or_0(bytes.next());
 99 |             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
100 |         }
101 |     }
102 | 
103 |     Some(ch)
104 | }
105 | 
106 | #[inline]
107 | fn utf8_first_byte(byte: u8, width: u32) -> u32 { (byte & (0x7F >> width)) as u32 }
108 | 
109 | /// Return the value of `ch` updated with continuation byte `byte`.
110 | #[inline]
111 | fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { (ch << 6) | (byte & CONT_MASK) as u32 }
112 | 
113 | #[inline]
114 | fn unwrap_or_0(opt: Option<&u8>) -> u8 {
115 |     match opt {
116 |         Some(&byte) => byte,
117 |         None => 0,
118 |     }
119 | }
120 | 
121 | /// Mask of the value bits of a continuation byte
122 | const CONT_MASK: u8 = 0b0011_1111;
123 | 
124 | /// Copied from String::push
125 | /// This does **not** include the WTF-8 concatenation check.
126 | #[inline]
127 | pub fn push_code_point(string: &mut Wtf8Buf, code_point: CodePoint) {
128 |     let cur_len = string.len();
129 |     // This may use up to 4 bytes.
130 |     string.reserve(4);
131 | 
132 |     unsafe {
133 |         // Attempt to not use an intermediate buffer by just pushing bytes
134 |         // directly onto this string.
135 |         let slice = slice::from_raw_parts_mut(
136 |             string.bytes.as_mut_ptr().offset(cur_len as isize),
137 |             4,
138 |         );
139 |         let used = encode_utf8_raw(code_point.to_u32(), slice).unwrap_or(0);
140 |         string.bytes.set_len(cur_len + used);
141 |     }
142 | }
143 | 
144 | 
145 | /// Copied from core::str::StrPrelude::is_char_boundary
146 | #[inline]
147 | pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
148 |     if index == slice.len() { return true; }
149 |     match slice.bytes.get(index) {
150 |         None => false,
151 |         Some(&b) => b < 128u8 || b >= 192u8,
152 |     }
153 | }
154 | 
155 | /// Copied from core::str::raw::slice_unchecked
156 | #[inline]
157 | pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
158 |     mem::transmute(slice::from_raw_parts(
159 |         s.bytes.as_ptr().offset(begin as isize),
160 |         end - begin,
161 |     ))
162 | }
163 | 
164 | /// Copied from core::str::raw::slice_error_fail
165 | #[inline(never)]
166 | pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
167 |     assert!(begin <= end);
168 |     panic!("index {} and/or {} in {:?} do not lie on character boundary",
169 |           begin, end, s);
170 | }
171 | 
172 | 
173 | /// Copied from core::str::Utf16CodeUnits::next
174 | pub fn next_utf16_code_unit(iter: &mut IllFormedUtf16CodeUnits) -> Option<u16> {
175 |     if iter.extra != 0 {
176 |         let tmp = iter.extra;
177 |         iter.extra = 0;
178 |         return Some(tmp);
179 |     }
180 | 
181 |     let mut buf = [0u16; 2];
182 |     iter.code_points.next().map(|code_point| {
183 |         let n = encode_utf16_raw(code_point.to_u32(), &mut buf).unwrap_or(0);
184 |         if n == 2 { iter.extra = buf[1]; }
185 |         buf[0]
186 |     })
187 | }
188 | 
/// Iterator that decodes a stream of UTF-16 code units into `char`s,
/// reporting each unpaired surrogate as `Err(code_unit)`.
///
/// Copied from src/librustc_unicode/char.rs
pub struct DecodeUtf16<I>
    where I: Iterator<Item = u16>
{
    /// Source of code units.
    iter: I,
    /// A unit that was read while looking for a trail surrogate but turned out
    /// not to be one; it is decoded by the next call to `next`.
    buf: Option<u16>,
}


/// Create a `DecodeUtf16` iterator over the code units of `iterable`.
///
/// Copied from src/librustc_unicode/char.rs
#[inline]
pub fn decode_utf16<I: IntoIterator<Item = u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
    DecodeUtf16 {
        iter: iterable.into_iter(),
        buf: None,
    }
}

/// Copied from src/librustc_unicode/char.rs
impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
    type Item = Result<char, u16>;

    /// Decode the next item: `Ok(char)` for a scalar value (one unit, or a
    /// well-formed surrogate pair), `Err(unit)` for an unpaired surrogate.
    fn next(&mut self) -> Option<Result<char, u16>> {
        // Prefer the unit left over from the previous call.
        let u = match self.buf.take() {
            Some(buf) => buf,
            None => match self.iter.next() {
                Some(u) => u,
                None => return None,
            },
        };

        if u < 0xD800 || 0xDFFF < u {
            // not a surrogate
            // SAFETY: any 16-bit value outside 0xD800..=0xDFFF is a Unicode scalar value.
            Some(Ok(unsafe { char::from_u32_unchecked(u as u32) }))
        } else if u >= 0xDC00 {
            // a trailing surrogate
            Some(Err(u))
        } else {
            let u2 = match self.iter.next() {
                Some(u2) => u2,
                // eof
                None => return Some(Err(u)),
            };
            if u2 < 0xDC00 || u2 > 0xDFFF {
                // not a trailing surrogate so we're not a valid
                // surrogate pair, so rewind to redecode u2 next time.
                self.buf = Some(u2);
                return Some(Err(u));
            }

            // all ok, so lets decode it.
            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
            // SAFETY: a lead/trail pair always decodes into 0x1_0000..=0x10_FFFF.
            Some(Ok(unsafe { char::from_u32_unchecked(c) }))
        }
    }

    /// Bounds on the number of remaining items.
    ///
    /// Every item consumes one or two code units, and the unit parked in `buf`
    /// (if any) is still to be decoded, so it must be counted on top of what
    /// the inner iterator reports.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();
        let pending = if self.buf.is_some() { 1 } else { 0 };
        // we could be entirely valid surrogates (2 elements per
        // char), or entirely non-surrogates (1 element per char)
        let low = low / 2 + pending;
        let high = high.and_then(|h| h.checked_add(pending));
        (low, high)
    }
}
253 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
   1 | /*!
   2 | 
   3 | Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
   4 | 
   5 | This library uses Rust’s type system to maintain
   6 | [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
   7 | like the `String` and `&str` types do for UTF-8.
   8 | 
   9 | Since [WTF-8 must not be used
  10 | for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
  11 | this library deliberately does not provide access to the underlying bytes
  12 | of WTF-8 strings,
  13 | nor can it decode WTF-8 from arbitrary bytes.
  14 | WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
  15 | 
  16 | */
  17 | 
  18 | #![no_std]
  19 | 
  20 | extern crate alloc;
  21 | 
  22 | use alloc::borrow::Cow;
  23 | use alloc::string::String;
  24 | use alloc::vec::Vec;
  25 | use core::str;
  26 | use core::cmp::Ordering;
  27 | use core::fmt;
  28 | use core::hash;
  29 | use core::iter::{FromIterator, IntoIterator};
  30 | use core::mem::transmute;
  31 | use core::ops::Deref;
  32 | use core::slice;
  33 | 
  34 | mod not_quite_std;
  35 | 
  36 | 
  37 | static UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
  38 | 
  39 | /// A Unicode code point: from U+0000 to U+10FFFF.
  40 | ///
  41 | /// Compare with the `char` type,
  42 | /// which represents a Unicode scalar value:
  43 | /// a code point that is not a surrogate (U+D800 to U+DFFF).
  44 | #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
  45 | pub struct CodePoint {
  46 |     value: u32
  47 | }
  48 | 
  49 | impl Copy for CodePoint {}
  50 | 
  51 | 
  52 | /// Format the code point as `U+` followed by four to six hexadecimal digits.
  53 | /// Example: `U+1F4A9`
  54 | impl fmt::Debug for CodePoint {
  55 |     #[inline]
  56 |     fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
  57 |         write!(formatter, "U+{:04X}", self.value)
  58 |     }
  59 | }
  60 | 
  61 | 
  62 | impl CodePoint {
  63 |     /// Unsafely create a new `CodePoint` without checking the value.
  64 |     ///
  65 |     /// Only use when `value` is known to be less than or equal to 0x10FFFF.
  66 |     #[inline]
  67 |     pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
  68 |         CodePoint { value: value }
  69 |     }
  70 | 
  71 |     /// Create a new `CodePoint` if the value is a valid code point.
  72 |     ///
  73 |     /// Return `None` if `value` is above 0x10FFFF.
  74 |     #[inline]
  75 |     pub fn from_u32(value: u32) -> Option<CodePoint> {
  76 |         match value {
  77 |             0 ..= 0x10FFFF => Some(CodePoint { value: value }),
  78 |             _ => None
  79 |         }
  80 |     }
  81 | 
  82 |     /// Create a new `CodePoint` from a `char`.
  83 |     ///
  84 |     /// Since all Unicode scalar values are code points, this always succeds.
  85 |     #[inline]
  86 |     pub fn from_char(value: char) -> CodePoint {
  87 |         CodePoint { value: value as u32 }
  88 |     }
  89 | 
  90 |     /// Return the numeric value of the code point.
  91 |     #[inline]
  92 |     pub fn to_u32(&self) -> u32 {
  93 |         self.value
  94 |     }
  95 | 
  96 |     /// Optionally return a Unicode scalar value for the code point.
  97 |     ///
  98 |     /// Return `None` if the code point is a surrogate (from U+D800 to U+DFFF).
  99 |     #[inline]
 100 |     pub fn to_char(&self) -> Option<char> {
 101 |         match self.value {
 102 |             0xD800 ..= 0xDFFF => None,
 103 |             _ => Some(unsafe { transmute(self.value) })
 104 |         }
 105 |     }
 106 | 
 107 |     /// Return a Unicode scalar value for the code point.
 108 |     ///
 109 |     /// Return `'\u{FFFD}'` (the replacement character “�”)
 110 |     /// if the code point is a surrogate (from U+D800 to U+DFFF).
 111 |     #[inline]
 112 |     pub fn to_char_lossy(&self) -> char {
 113 |         self.to_char().unwrap_or('\u{FFFD}')
 114 |     }
 115 | }
 116 | 
 117 | 
 118 | /// An owned, growable string of well-formed WTF-8 data.
 119 | ///
 120 | /// Similar to `String`, but can additionally contain surrogate code points
 121 | /// if they’re not in a surrogate pair.
 122 | #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
 123 | pub struct Wtf8Buf {
 124 |     bytes: Vec<u8>
 125 | }
 126 | 
 127 | impl Deref for Wtf8Buf {
 128 |     type Target = Wtf8;
 129 | 
 130 |     fn deref(&self) -> &Wtf8 {
 131 |         unsafe { transmute(&*self.bytes) }
 132 |     }
 133 | }
 134 | 
 135 | /// Format the string with double quotes,
 136 | /// and surrogates as `\u` followed by four hexadecimal digits.
 137 | /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
 138 | impl fmt::Debug for Wtf8Buf {
 139 |     #[inline]
 140 |     fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
 141 |         Wtf8::fmt(self, formatter)
 142 |     }
 143 | }
 144 | 
 145 | 
 146 | impl Wtf8Buf {
 147 |     /// Create an new, empty WTF-8 string.
 148 |     #[inline]
 149 |     pub fn new() -> Wtf8Buf {
 150 |         Wtf8Buf { bytes: Vec::new() }
 151 |     }
 152 | 
 153 |     /// Create an new, empty WTF-8 string with pre-allocated capacity for `n` bytes.
 154 |     #[inline]
 155 |     pub fn with_capacity(n: usize) -> Wtf8Buf {
 156 |         Wtf8Buf { bytes: Vec::with_capacity(n) }
 157 |     }
 158 | 
 159 |     /// Create a WTF-8 string from an UTF-8 `String`.
 160 |     ///
 161 |     /// This takes ownership of the `String` and does not copy.
 162 |     ///
 163 |     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 164 |     #[inline]
 165 |     pub fn from_string(string: String) -> Wtf8Buf {
 166 |         Wtf8Buf { bytes: string.into_bytes() }
 167 |     }
 168 | 
 169 |     /// Create a WTF-8 string from an UTF-8 `&str` slice.
 170 |     ///
 171 |     /// This copies the content of the slice.
 172 |     ///
 173 |     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 174 |     #[inline]
 175 |     pub fn from_str(str: &str) -> Wtf8Buf {
 176 |         Wtf8Buf { bytes: str.as_bytes().to_vec() }
 177 |     }
 178 | 
 179 |     /// Create a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
 180 |     ///
 181 |     /// This is lossless: calling `.to_ill_formed_utf16()` on the resulting string
 182 |     /// will always return the original code units.
 183 |     pub fn from_ill_formed_utf16(v: &[u16]) -> Wtf8Buf {
 184 |         let mut string = Wtf8Buf::with_capacity(v.len());
 185 |         for item in not_quite_std::decode_utf16(v.iter().cloned()) {
 186 |             match item {
 187 |                 Ok(c) => string.push_char(c),
 188 |                 Err(s) => {
 189 |                     // Surrogates are known to be in the code point range.
 190 |                     let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
 191 |                     // Skip the WTF-8 concatenation check,
 192 |                     // surrogate pairs are already decoded by utf16_items
 193 |                     not_quite_std::push_code_point(&mut string, code_point)
 194 |                 }
 195 |             }
 196 |         }
 197 |         string
 198 |     }
 199 | 
 200 |     /// Reserves capacity for at least `additional` more bytes to be inserted
 201 |     /// in the given `Wtf8Buf`.
 202 |     /// The collection may reserve more space to avoid frequent reallocations.
 203 |     ///
 204 |     /// # Panics
 205 |     ///
 206 |     /// Panics if the new capacity overflows `usize`.
 207 |     ///
 208 |     /// # Example
 209 |     ///
 210 |     /// ```
 211 |     /// let mut s = Wtf8Buf::new();
 212 |     /// s.reserve(10);
 213 |     /// assert!(s.capacity() >= 10);
 214 |     /// ```
 215 |     #[inline]
 216 |     pub fn reserve(&mut self, additional: usize) {
 217 |         self.bytes.reserve(additional)
 218 |     }
 219 | 
 220 |     /// Returns the number of bytes that this string buffer can hold without reallocating.
 221 |     ///
 222 |     /// # Example
 223 |     ///
 224 |     /// ```
 225 |     /// let s = Wtf8Buf::with_capacity(10);
 226 |     /// assert!(s.capacity() >= 10);
 227 |     /// ```
 228 |     #[inline]
 229 |     pub fn capacity(&self) -> usize {
 230 |         self.bytes.capacity()
 231 |     }
 232 | 
 233 |     /// Append an UTF-8 slice at the end of the string.
 234 |     #[inline]
 235 |     pub fn push_str(&mut self, other: &str) {
 236 |         self.bytes.extend_from_slice(other.as_bytes())
 237 |     }
 238 | 
 239 |     /// Append a WTF-8 slice at the end of the string.
 240 |     ///
 241 |     /// This replaces newly paired surrogates at the boundary
 242 |     /// with a supplementary code point,
 243 |     /// like concatenating ill-formed UTF-16 strings effectively would.
 244 |     #[inline]
 245 |     pub fn push_wtf8(&mut self, other: &Wtf8) {
 246 |         match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
 247 |             // Replace newly paired surrogates by a supplementary code point.
 248 |             (Some(lead), Some(trail)) => {
 249 |                 let len_without_lead_surrogate = self.len() - 3;
 250 |                 self.bytes.truncate(len_without_lead_surrogate);
 251 |                 let other_without_trail_surrogate = &other.bytes[3..];
 252 |                 // 4 bytes for the supplementary code point
 253 |                 self.bytes.reserve(4 + other_without_trail_surrogate.len());
 254 |                 self.push_char(decode_surrogate_pair(lead, trail));
 255 |                 self.bytes.extend_from_slice(other_without_trail_surrogate);
 256 |             }
 257 |             _ => self.bytes.extend_from_slice(&other.bytes)
 258 |         }
 259 |     }
 260 | 
 261 |     /// Append a Unicode scalar value at the end of the string.
 262 |     #[inline]
 263 |     pub fn push_char(&mut self, c: char) {
 264 |         not_quite_std::push_code_point(self, CodePoint::from_char(c))
 265 |     }
 266 | 
 267 |     /// Append a code point at the end of the string.
 268 |     ///
 269 |     /// This replaces newly paired surrogates at the boundary
 270 |     /// with a supplementary code point,
 271 |     /// like concatenating ill-formed UTF-16 strings effectively would.
 272 |     #[inline]
 273 |     pub fn push(&mut self, code_point: CodePoint) {
 274 |         match code_point.to_u32() {
 275 |             trail @ 0xDC00..=0xDFFF => {
 276 |                 match (&*self).final_lead_surrogate() {
 277 |                     Some(lead) => {
 278 |                         let len_without_lead_surrogate = self.len() - 3;
 279 |                         self.bytes.truncate(len_without_lead_surrogate);
 280 |                         self.push_char(decode_surrogate_pair(lead, trail as u16));
 281 |                         return
 282 |                     }
 283 |                     _ => {}
 284 |                 }
 285 |             }
 286 |             _ => {}
 287 |         }
 288 | 
 289 |         // No newly paired surrogates at the boundary.
 290 |         not_quite_std::push_code_point(self, code_point)
 291 |     }
 292 | 
 293 |     /// Shortens a string to the specified length.
 294 |     ///
 295 |     /// # Failure
 296 |     ///
 297 |     /// Fails if `new_len` > current length,
 298 |     /// or if `new_len` is not a code point boundary.
 299 |     #[inline]
 300 |     pub fn truncate(&mut self, new_len: usize) {
 301 |         assert!(not_quite_std::is_code_point_boundary(self, new_len));
 302 |         self.bytes.truncate(new_len)
 303 |     }
 304 | 
 305 |     /// Consume the WTF-8 string and try to convert it to UTF-8.
 306 |     ///
 307 |     /// This does not copy the data.
 308 |     ///
 309 |     /// If the contents are not well-formed UTF-8
 310 |     /// (that is, if the string contains surrogates),
 311 |     /// the original WTF-8 string is returned instead.
 312 |     pub fn into_string(self) -> Result<String, Wtf8Buf> {
 313 |         match self.next_surrogate(0) {
 314 |             None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
 315 |             Some(_) => Err(self),
 316 |         }
 317 |     }
 318 | 
 319 |     /// Consume the WTF-8 string and convert it lossily to UTF-8.
 320 |     ///
 321 |     /// This does not copy the data (but may overwrite parts of it in place).
 322 |     ///
 323 |     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
 324 |     pub fn into_string_lossy(mut self) -> String {
 325 |         let mut pos = 0;
 326 |         loop {
 327 |             match self.next_surrogate(pos) {
 328 |                 Some((surrogate_pos, _)) => {
 329 |                     pos = surrogate_pos + 3;
 330 |                     self.bytes[surrogate_pos..pos].copy_from_slice(UTF8_REPLACEMENT_CHARACTER);
 331 |                 },
 332 |                 None => return unsafe { String::from_utf8_unchecked(self.bytes) }
 333 |             }
 334 |         }
 335 |     }
 336 | }
 337 | 
 338 | 
 339 | /// Create a new WTF-8 string from an iterator of code points.
 340 | ///
 341 | /// This replaces surrogate code point pairs with supplementary code points,
 342 | /// like concatenating ill-formed UTF-16 strings effectively would.
 343 | impl FromIterator<CodePoint> for Wtf8Buf {
 344 |     fn from_iter<T: IntoIterator<Item = CodePoint>>(iterable: T) -> Wtf8Buf {
 345 |         let mut string = Wtf8Buf::new();
 346 |         string.extend(iterable);
 347 |         string
 348 |     }
 349 | }
 350 | 
 351 | 
 352 | /// Append code points from an iterator to the string.
 353 | ///
 354 | /// This replaces surrogate code point pairs with supplementary code points,
 355 | /// like concatenating ill-formed UTF-16 strings effectively would.
 356 | impl Extend<CodePoint> for Wtf8Buf {
 357 |     fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iterable: T) {
 358 |         let iterator = iterable.into_iter();
 359 |         let (low, _high) = iterator.size_hint();
 360 |         // Lower bound of one byte per code point (ASCII only)
 361 |         self.bytes.reserve(low);
 362 |         for code_point in iterator {
 363 |             self.push(code_point);
 364 |         }
 365 |     }
 366 | }
 367 | 
 368 | /// A borrowed slice of well-formed WTF-8 data.
 369 | ///
 370 | /// Similar to `&str`, but can additionally contain surrogate code points
 371 | /// if they’re not in a surrogate pair.
 372 | pub struct Wtf8 {
 373 |     bytes: [u8]
 374 | }
 375 | 
 376 | // FIXME: https://github.com/rust-lang/rust/issues/18805
 377 | impl PartialEq for Wtf8 {
 378 |     fn eq(&self, other: &Wtf8) -> bool { self.bytes.eq(&other.bytes) }
 379 | }
 380 | 
 381 | // FIXME: https://github.com/rust-lang/rust/issues/18805
 382 | impl Eq for Wtf8 {}
 383 | 
 384 | // FIXME: https://github.com/rust-lang/rust/issues/18738
 385 | impl PartialOrd for Wtf8 {
 386 |     #[inline]
 387 |     fn partial_cmp(&self, other: &Wtf8) -> Option<Ordering> {
 388 |         self.bytes.partial_cmp(&other.bytes)
 389 |     }
 390 |     #[inline]
 391 |     fn lt(&self, other: &Wtf8) -> bool { self.bytes.lt(&other.bytes) }
 392 |     #[inline]
 393 |     fn le(&self, other: &Wtf8) -> bool { self.bytes.le(&other.bytes) }
 394 |     #[inline]
 395 |     fn gt(&self, other: &Wtf8) -> bool { self.bytes.gt(&other.bytes) }
 396 |     #[inline]
 397 |     fn ge(&self, other: &Wtf8) -> bool { self.bytes.ge(&other.bytes) }
 398 | }
 399 | 
 400 | // FIXME: https://github.com/rust-lang/rust/issues/18738
 401 | impl Ord for Wtf8 {
 402 |     #[inline]
 403 |     fn cmp(&self, other: &Wtf8) -> Ordering { self.bytes.cmp(&other.bytes) }
 404 | }
 405 | 
 406 | 
 407 | /// Format the slice with double quotes,
 408 | /// and surrogates as `\u` followed by four hexadecimal digits.
 409 | /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
 410 | impl fmt::Debug for Wtf8 {
 411 |     fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
 412 |         formatter.write_str("\"")?;
 413 |         let mut pos = 0;
 414 |         loop {
 415 |             match self.next_surrogate(pos) {
 416 |                 None => break,
 417 |                 Some((surrogate_pos, surrogate)) => {
 418 |                     formatter.write_str(unsafe {
 419 |                         str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
 420 |                     })?;
 421 |                     write!(formatter, "\\u{{{:X}}}", surrogate)?;
 422 |                     pos = surrogate_pos + 3;
 423 |                 }
 424 |             }
 425 |         }
 426 |         formatter.write_str(unsafe {
 427 |             str::from_utf8_unchecked(&self.bytes[pos..])
 428 |         })?;
 429 |         formatter.write_str("\"")
 430 |     }
 431 | }
 432 | 
 433 | 
 434 | impl Wtf8 {
 435 |     /// Create a WTF-8 slice from a UTF-8 `&str` slice.
 436 |     ///
 437 |     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 438 |     #[inline]
 439 |     pub fn from_str(value: &str) -> &Wtf8 {
 440 |         unsafe { transmute(value.as_bytes()) }
 441 |     }
 442 | 
 443 |     /// Return the length of the slice, counted in WTF-8 bytes (not in code points).
 444 |     #[inline]
 445 |     pub fn len(&self) -> usize {
 446 |         self.bytes.len()
 447 |     }
 448 | 
 449 |     /// Return a slice of the given string for the byte range [`begin`..`end`).
 450 |     ///
 451 |     /// # Failure
 452 |     ///
 453 |     /// Fails when `begin` and `end` do not point to code point boundaries,
 454 |     /// or point beyond the end of the string.
 455 |     #[inline]
 456 |     pub fn slice(&self, begin: usize, end: usize) -> &Wtf8 {
 457 |         // is_code_point_boundary checks that the index is in [0, .len()]
 458 |         if begin <= end &&
 459 |            not_quite_std::is_code_point_boundary(self, begin) &&
 460 |            not_quite_std::is_code_point_boundary(self, end) {
 461 |             unsafe { not_quite_std::slice_unchecked(self, begin, end) }
 462 |         } else {
 463 |             not_quite_std::slice_error_fail(self, begin, end)
 464 |         }
 465 |     }
 466 | 
 467 |     /// Return a slice of the given string from byte `begin` to its end.
 468 |     ///
 469 |     /// # Failure
 470 |     ///
 471 |     /// Fails when `begin` is not at a code point boundary,
 472 |     /// or is beyond the end of the string.
 473 |     #[inline]
 474 |     pub fn slice_from(&self, begin: usize) -> &Wtf8 {
 475 |         // is_code_point_boundary checks that the index is in [0, .len()]
 476 |         if not_quite_std::is_code_point_boundary(self, begin) {
 477 |             unsafe { not_quite_std::slice_unchecked(self, begin, self.len()) }
 478 |         } else {
 479 |             not_quite_std::slice_error_fail(self, begin, self.len())
 480 |         }
 481 |     }
 482 | 
 483 |     /// Return a slice of the given string from its beginning to byte `end`.
 484 |     ///
 485 |     /// # Failure
 486 |     ///
 487 |     /// Fails when `end` is not at a code point boundary,
 488 |     /// or is beyond the end of the string.
 489 |     #[inline]
 490 |     pub fn slice_to(&self, end: usize) -> &Wtf8 {
 491 |         // is_code_point_boundary checks that the index is in [0, .len()]
 492 |         if not_quite_std::is_code_point_boundary(self, end) {
 493 |             unsafe { not_quite_std::slice_unchecked(self, 0, end) }
 494 |         } else {
 495 |             not_quite_std::slice_error_fail(self, 0, end)
 496 |         }
 497 |     }
 498 | 
 499 |     /// Return the code point at `position` if it is in the ASCII range,
 500 |     /// or `b'\xFF' otherwise.
 501 |     ///
 502 |     /// # Failure
 503 |     ///
 504 |     /// Fails if `position` is beyond the end of the string.
 505 |     #[inline]
 506 |     pub fn ascii_byte_at(&self, position: usize) -> u8 {
 507 |         match self.bytes[position] {
 508 |             ascii_byte @ 0x00 ..= 0x7F => ascii_byte,
 509 |             _ => 0xFF
 510 |         }
 511 |     }
 512 | 
 513 |     /// Return an iterator for the string’s code points.
 514 |     #[inline]
 515 |     pub fn code_points(&self) -> Wtf8CodePoints {
 516 |         Wtf8CodePoints { bytes: self.bytes.iter() }
 517 |     }
 518 | 
 519 |     /// Try to convert the string to UTF-8 and return a `&str` slice.
 520 |     ///
 521 |     /// Return `None` if the string contains surrogates.
 522 |     ///
 523 |     /// This does not copy the data.
 524 |     #[inline]
 525 |     pub fn as_str(&self) -> Option<&str> {
 526 |         // Well-formed WTF-8 is also well-formed UTF-8
 527 |         // if and only if it contains no surrogate.
 528 |         match self.next_surrogate(0) {
 529 |             None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 530 |             Some(_) => None,
 531 |         }
 532 |     }
 533 | 
 534 |     /// Lossily convert the string to UTF-8.
 535 |     /// Return an UTF-8 `&str` slice if the contents are well-formed in UTF-8.
 536 |     ///
 537 |     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
 538 |     ///
 539 |     /// This only copies the data if necessary (if it contains any surrogate).
 540 |     pub fn to_string_lossy(&self) -> Cow<str> {
 541 |         let surrogate_pos = match self.next_surrogate(0) {
 542 |             None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 543 |             Some((pos, _)) => pos,
 544 |         };
 545 |         let wtf8_bytes = &self.bytes;
 546 |         let mut utf8_bytes = Vec::with_capacity(self.len());
 547 |         utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
 548 |         utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER);
 549 |         let mut pos = surrogate_pos + 3;
 550 |         loop {
 551 |             match self.next_surrogate(pos) {
 552 |                 Some((surrogate_pos, _)) => {
 553 |                     utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
 554 |                     utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER);
 555 |                     pos = surrogate_pos + 3;
 556 |                 },
 557 |                 None => {
 558 |                     utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
 559 |                     return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) })
 560 |                 }
 561 |             }
 562 |         }
 563 |     }
 564 | 
 565 |     /// Convert the WTF-8 string to potentially ill-formed UTF-16
 566 |     /// and return an iterator of 16-bit code units.
 567 |     ///
 568 |     /// This is lossless:
 569 |     /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
 570 |     /// would always return the original WTF-8 string.
 571 |     #[inline]
 572 |     pub fn to_ill_formed_utf16(&self) -> IllFormedUtf16CodeUnits {
 573 |         IllFormedUtf16CodeUnits { code_points: self.code_points(), extra: 0 }
 574 |     }
 575 | 
 576 |     #[inline]
 577 |     fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
 578 |         let mut iter = self.bytes[pos..].iter();
 579 |         loop {
 580 |             let b = match iter.next() {
 581 |                 None => return None,
 582 |                 Some(&b) => b,
 583 |             };
 584 |             if b < 0x80 {
 585 |                 pos += 1;
 586 |             } else if b < 0xE0 {
 587 |                 iter.next();
 588 |                 pos += 2;
 589 |             } else if b == 0xED {
 590 |                 match (iter.next(), iter.next()) {
 591 |                     (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
 592 |                         return Some((pos, decode_surrogate(b2, b3)))
 593 |                     }
 594 |                     _ => pos += 3
 595 |                 }
 596 |             } else if b < 0xF0 {
 597 |                 iter.next();
 598 |                 iter.next();
 599 |                 pos += 3;
 600 |             } else {
 601 |                 iter.next();
 602 |                 iter.next();
 603 |                 iter.next();
 604 |                 pos += 4;
 605 |             }
 606 |         }
 607 |     }
 608 | 
 609 |     #[inline]
 610 |     fn final_lead_surrogate(&self) -> Option<u16> {
 611 |         let len = self.len();
 612 |         if len < 3 {
 613 |             return None
 614 |         }
 615 |         let seq = &self.bytes[len - 3..];
 616 |         if seq[0] == 0xED && 0xA0 <= seq[1] && seq[1] <= 0xAF {
 617 |             Some(decode_surrogate(seq[1], seq[2]))
 618 |         } else {
 619 |             None
 620 |         }
 621 |     }
 622 | 
 623 |     #[inline]
 624 |     fn initial_trail_surrogate(&self) -> Option<u16> {
 625 |         let len = self.len();
 626 |         if len < 3 {
 627 |             return None
 628 |         }
 629 |         let seq = &self.bytes[..3];
 630 |         if seq[0] == 0xED && 0xB0 <= seq[1] && seq[1] <= 0xBF {
 631 |             Some(decode_surrogate(seq[1], seq[2]))
 632 |         } else {
 633 |             None
 634 |         }
 635 |     }
 636 | }
 637 | 
 638 | 
 639 | #[inline]
 640 | fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
 641 |     // The first byte is assumed to be 0xED
 642 |     0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
 643 | }
 644 | 
 645 | #[inline]
 646 | fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
 647 |     let code_point = 0x10000 + (((lead as u32 - 0xD800) << 10) | (trail as u32 - 0xDC00));
 648 |     unsafe { transmute(code_point) }
 649 | }
 650 | 
 651 | 
/// Iterator for the code points of a WTF-8 string.
///
/// Created with the method `.code_points()`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    // Iterator over the bytes of the string that have not been decoded yet.
    bytes: slice::Iter<'a, u8>
}
 659 | 
 660 | impl<'a> Iterator for Wtf8CodePoints<'a> {
 661 |     type Item = CodePoint;
 662 | 
 663 |     #[inline]
 664 |     fn next(&mut self) -> Option<CodePoint> {
 665 |         match not_quite_std::next_code_point(&mut self.bytes) {
 666 |             None => None,
 667 |             Some(value) => {
 668 |                 // Wtf8 invariant says `value` is a valid code point
 669 |                 unsafe {
 670 |                     Some(CodePoint::from_u32_unchecked(value))
 671 |                 }
 672 |             }
 673 |         }
 674 | 
 675 |     }
 676 | 
 677 |     #[inline]
 678 |     fn size_hint(&self) -> (usize, Option<usize>) {
 679 |         let (len, _) = self.bytes.size_hint();
 680 |         (len.saturating_add(3) / 4, Some(len))
 681 |     }
 682 | }
 683 | 
/// Iterator for the 16-bit code units of a WTF-8 string converted to
/// potentially ill-formed UTF-16.
///
/// Created with the method `.to_ill_formed_utf16()`.
#[derive(Clone)]
pub struct IllFormedUtf16CodeUnits<'a> {
    // Source of the code points still to be converted.
    code_points: Wtf8CodePoints<'a>,
    // Initialised to 0 by `to_ill_formed_utf16`.
    // NOTE(review): presumably holds the buffered second code unit of a
    // surrogate pair between calls to `next`, with 0 meaning "nothing
    // buffered" — confirm against `not_quite_std::next_utf16_code_unit`.
    extra: u16
}
 689 | 
 690 | impl<'a> Iterator for IllFormedUtf16CodeUnits<'a> {
 691 |     type Item = u16;
 692 | 
 693 |     #[inline]
 694 |     fn next(&mut self) -> Option<u16> {
 695 |         not_quite_std::next_utf16_code_unit(self)
 696 |     }
 697 | 
 698 |     #[inline]
 699 |     fn size_hint(&self) -> (usize, Option<usize>) {
 700 |         let (low, high) = self.code_points.size_hint();
 701 |         // every code point gets either one u16 or two u16,
 702 |         // so this iterator is between 1 or 2 times as
 703 |         // long as the underlying iterator.
 704 |         (low, high.and_then(|n| n.checked_mul(2)))
 705 |     }
 706 | }
 707 | 
 708 | 
/// Allow comparing an owned `Wtf8Buf` directly with a borrowed `&Wtf8`.
impl<'a> PartialEq<&'a Wtf8> for Wtf8Buf {
    fn eq(&self, other: &&Wtf8) -> bool {
        // Dereference both sides down to `Wtf8` and compare those.
        **self == **other
    }
}
 714 | 
/// Allow comparing a borrowed `&Wtf8` directly with an owned `Wtf8Buf`.
impl<'a> PartialEq<Wtf8Buf> for &'a Wtf8 {
    fn eq(&self, other: &Wtf8Buf) -> bool {
        // Dereference both sides down to `Wtf8` and compare those.
        **self == **other
    }
}
 720 | 
 721 | 
/// A `CodePoint` hashes exactly like its `value` field.
impl hash::Hash for CodePoint {
    #[inline]
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        self.value.hash(state)
    }
}
 728 | 
/// A `Wtf8Buf` hashes through the `Wtf8` implementation.
impl hash::Hash for Wtf8Buf {
    #[inline]
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        // Delegate to `Wtf8::hash` so both types share one hashing routine.
        Wtf8::hash(self, state)
    }
}
 735 | 
/// A `Wtf8` hashes its raw bytes followed by a single `0xfe` byte.
impl hash::Hash for Wtf8 {
    #[inline]
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        state.write(&self.bytes);
        // NOTE(review): presumably a terminator marking where this string's
        // bytes end, in the manner of the `Hash` impl for `str` — confirm.
        0xfeu8.hash(state)
    }
}
 743 | 
 744 | 
 745 | #[cfg(test)]
 746 | mod tests {
 747 |     use alloc::format;
 748 |     use alloc::vec;
 749 |     use core::mem::transmute;
 750 |     use super::*;
 751 | 
 752 |     #[test]
 753 |     fn code_point_from_u32() {
 754 |         assert!(CodePoint::from_u32(0).is_some());
 755 |         assert!(CodePoint::from_u32(0xD800).is_some());
 756 |         assert!(CodePoint::from_u32(0x10FFFF).is_some());
 757 |         assert!(CodePoint::from_u32(0x110000).is_none());
 758 |     }
 759 | 
 760 |     #[test]
 761 |     fn code_point_to_u32() {
 762 |         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 763 |         assert_eq!(c(0).to_u32(), 0);
 764 |         assert_eq!(c(0xD800).to_u32(), 0xD800);
 765 |         assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
 766 |     }
 767 | 
 768 |     #[test]
 769 |     fn code_point_from_char() {
 770 |         assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
 771 |         assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9);
 772 |     }
 773 | 
 774 |     #[test]
 775 |     fn code_point_to_string() {
 776 |         assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061");
 777 |         assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9");
 778 |     }
 779 | 
 780 |     #[test]
 781 |     fn code_point_to_char() {
 782 |         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 783 |         assert_eq!(c(0x61).to_char(), Some('a'));
 784 |         assert_eq!(c(0x1F4A9).to_char(), Some('💩'));
 785 |         assert_eq!(c(0xD800).to_char(), None);
 786 |     }
 787 | 
 788 |     #[test]
 789 |     fn code_point_to_char_lossy() {
 790 |         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 791 |         assert_eq!(c(0x61).to_char_lossy(), 'a');
 792 |         assert_eq!(c(0x1F4A9).to_char_lossy(), '💩');
 793 |         assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}');
 794 |     }
 795 | 
 796 |     #[test]
 797 |     fn wtf8buf_new() {
 798 |         assert_eq!(Wtf8Buf::new().bytes, b"");
 799 |     }
 800 | 
 801 |     #[test]
 802 |     fn wtf8buf_from_str() {
 803 |         assert_eq!(Wtf8Buf::from_str("").bytes, b"");
 804 |         assert_eq!(Wtf8Buf::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 805 |     }
 806 | 
 807 |     #[test]
 808 |     fn wtf8buf_from_string() {
 809 |         assert_eq!(Wtf8Buf::from_string(String::from("")).bytes, b"");
 810 |         assert_eq!(Wtf8Buf::from_string(String::from("aé 💩")).bytes,
 811 |                    b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 812 |     }
 813 | 
 814 |     #[test]
 815 |     fn wtf8buf_from_ill_formed_utf16() {
 816 |         assert_eq!(Wtf8Buf::from_ill_formed_utf16(&[]).bytes, b"");
 817 |         assert_eq!(Wtf8Buf::from_ill_formed_utf16(
 818 |                       &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes,
 819 |                    b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
 820 |     }
 821 | 
 822 |     #[test]
 823 |     fn wtf8buf_push_str() {
 824 |         let mut string = Wtf8Buf::new();
 825 |         assert_eq!(string.bytes, b"");
 826 |         string.push_str("aé 💩");
 827 |         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 828 |     }
 829 | 
 830 |     #[test]
 831 |     fn wtf8buf_push_char() {
 832 |         let mut string = Wtf8Buf::from_str("aé ");
 833 |         assert_eq!(string.bytes, b"a\xC3\xA9 ");
 834 |         string.push_char('💩');
 835 |         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 836 |     }
 837 | 
 838 |     #[test]
 839 |     fn wtf8buf_push() {
 840 |         let mut string = Wtf8Buf::from_str("aé ");
 841 |         assert_eq!(string.bytes, b"a\xC3\xA9 ");
 842 |         string.push(CodePoint::from_char('💩'));
 843 |         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 844 | 
 845 |         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
 846 | 
 847 |         let mut string = Wtf8Buf::new();
 848 |         string.push(c(0xD83D));  // lead
 849 |         string.push(c(0xDCA9));  // trail
 850 |         assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");  // Magic!
 851 | 
 852 |         let mut string = Wtf8Buf::new();
 853 |         string.push(c(0xD83D));  // lead
 854 |         string.push(c(0x20));  // not surrogate
 855 |         string.push(c(0xDCA9));  // trail
 856 |         assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
 857 | 
 858 |         let mut string = Wtf8Buf::new();
 859 |         string.push(c(0xD800));  // lead
 860 |         string.push(c(0xDBFF));  // lead
 861 |         assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
 862 | 
 863 |         let mut string = Wtf8Buf::new();
 864 |         string.push(c(0xD800));  // lead
 865 |         string.push(c(0xE000));  // not surrogate
 866 |         assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
 867 | 
 868 |         let mut string = Wtf8Buf::new();
 869 |         string.push(c(0xD7FF));  // not surrogate
 870 |         string.push(c(0xDC00));  // trail
 871 |         assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
 872 | 
 873 |         let mut string = Wtf8Buf::new();
 874 |         string.push(c(0x61));  // not surrogate, < 3 bytes
 875 |         string.push(c(0xDC00));  // trail
 876 |         assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
 877 | 
 878 |         let mut string = Wtf8Buf::new();
 879 |         string.push(c(0xDC00));  // trail
 880 |         assert_eq!(string.bytes, b"\xED\xB0\x80");
 881 |     }
 882 | 
 883 |     #[test]
 884 |     fn wtf8buf_push_wtf8() {
 885 |         let mut string = Wtf8Buf::from_str("aé");
 886 |         assert_eq!(string.bytes, b"a\xC3\xA9");
 887 |         string.push_wtf8(Wtf8::from_str(" 💩"));
 888 |         assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 889 | 
 890 |         fn w(value: &[u8]) -> &Wtf8 { unsafe { transmute(value) } }
 891 | 
 892 |         let mut string = Wtf8Buf::new();
 893 |         string.push_wtf8(w(b"\xED\xA0\xBD"));  // lead
 894 |         string.push_wtf8(w(b"\xED\xB2\xA9"));  // trail
 895 |         assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9");  // Magic!
 896 | 
 897 |         let mut string = Wtf8Buf::new();
 898 |         string.push_wtf8(w(b"\xED\xA0\xBD"));  // lead
 899 |         string.push_wtf8(w(b" "));  // not surrogate
 900 |         string.push_wtf8(w(b"\xED\xB2\xA9"));  // trail
 901 |         assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
 902 | 
 903 |         let mut string = Wtf8Buf::new();
 904 |         string.push_wtf8(w(b"\xED\xA0\x80"));  // lead
 905 |         string.push_wtf8(w(b"\xED\xAF\xBF"));  // lead
 906 |         assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
 907 | 
 908 |         let mut string = Wtf8Buf::new();
 909 |         string.push_wtf8(w(b"\xED\xA0\x80"));  // lead
 910 |         string.push_wtf8(w(b"\xEE\x80\x80"));  // not surrogate
 911 |         assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
 912 | 
 913 |         let mut string = Wtf8Buf::new();
 914 |         string.push_wtf8(w(b"\xED\x9F\xBF"));  // not surrogate
 915 |         string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
 916 |         assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
 917 | 
 918 |         let mut string = Wtf8Buf::new();
 919 |         string.push_wtf8(w(b"a"));  // not surrogate, < 3 bytes
 920 |         string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
 921 |         assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
 922 | 
 923 |         let mut string = Wtf8Buf::new();
 924 |         string.push_wtf8(w(b"\xED\xB0\x80"));  // trail
 925 |         assert_eq!(string.bytes, b"\xED\xB0\x80");
 926 |     }
 927 | 
 928 |     #[test]
 929 |     fn wtf8buf_truncate() {
 930 |         let mut string = Wtf8Buf::from_str("aé");
 931 |         string.truncate(1);
 932 |         assert_eq!(string.bytes, b"a");
 933 |     }
 934 | 
 935 |     #[test]
 936 |     #[should_panic]
 937 |     fn wtf8buf_truncate_fail_code_point_boundary() {
 938 |         let mut string = Wtf8Buf::from_str("aé");
 939 |         string.truncate(2);
 940 |     }
 941 | 
 942 |     #[test]
 943 |     #[should_panic]
 944 |     fn wtf8buf_truncate_fail_longer() {
 945 |         let mut string = Wtf8Buf::from_str("aé");
 946 |         string.truncate(4);
 947 |     }
 948 | 
 949 |     #[test]
 950 |     fn wtf8buf_into_string() {
 951 |         let mut string = Wtf8Buf::from_str("aé 💩");
 952 |         assert_eq!(string.clone().into_string(), Ok(String::from("aé 💩")));
 953 |         string.push(CodePoint::from_u32(0xD800).unwrap());
 954 |         assert_eq!(string.clone().into_string(), Err(string));
 955 |     }
 956 | 
 957 |     #[test]
 958 |     fn wtf8buf_into_string_lossy() {
 959 |         let mut string = Wtf8Buf::from_str("aé 💩");
 960 |         assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩"));
 961 |         string.push(CodePoint::from_u32(0xD800).unwrap());
 962 |         assert_eq!(string.clone().into_string_lossy(), String::from("aé 💩�"));
 963 |     }
 964 | 
 965 |     #[test]
 966 |     fn wtf8buf_from_iterator() {
 967 |         fn f(values: &[u32]) -> Wtf8Buf {
 968 |             values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
 969 |         }
 970 |         assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 971 | 
 972 |         assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9");  // Magic!
 973 |         assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
 974 |         assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
 975 |         assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
 976 |         assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
 977 |         assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
 978 |         assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80");
 979 |     }
 980 | 
 981 |     #[test]
 982 |     fn wtf8buf_extend() {
 983 |         fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
 984 |             fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() }
 985 |             let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
 986 |             string.extend(extended.iter().map(c));
 987 |             string
 988 |         }
 989 | 
 990 |         assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
 991 | 
 992 |         assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9");  // Magic!
 993 |         assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
 994 |         assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
 995 |         assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
 996 |         assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
 997 |         assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
 998 |         assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
 999 |     }
1000 | 
1001 |     #[test]
1002 |     fn wtf8buf_debug() {
1003 |         let mut string = Wtf8Buf::from_str("aé 💩");
1004 |         string.push(CodePoint::from_u32(0xD800).unwrap());
1005 |         assert_eq!(format!("{:?}", string), r#""aé 💩\u{D800}""#);
1006 |     }
1007 | 
1008 |     #[test]
1009 |     fn wtf8buf_as_slice() {
1010 |         assert_eq!(Wtf8Buf::from_str("aé"), Wtf8::from_str("aé"));
1011 |     }
1012 | 
1013 |     #[test]
1014 |     fn wtf8_debug() {
1015 |         let mut string = Wtf8Buf::from_str("aé 💩");
1016 |         string.push(CodePoint::from_u32(0xD800).unwrap());
1017 |         assert_eq!(format!("{:?}", &*string), r#""aé 💩\u{D800}""#);
1018 |     }
1019 | 
1020 |     #[test]
1021 |     fn wtf8_from_str() {
1022 |         assert_eq!(&Wtf8::from_str("").bytes, b"");
1023 |         assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1024 |     }
1025 | 
1026 |     #[test]
1027 |     fn wtf8_len() {
1028 |         assert_eq!(Wtf8::from_str("").len(), 0);
1029 |         assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
1030 |     }
1031 | 
1032 |     #[test]
1033 |     fn wtf8_slice() {
1034 |         assert_eq!(&Wtf8::from_str("aé 💩").slice(1, 4).bytes, b"\xC3\xA9 ");
1035 |     }
1036 | 
1037 |     #[test]
1038 |     #[should_panic]
1039 |     fn wtf8_slice_not_code_point_boundary() {
1040 |         Wtf8::from_str("aé 💩").slice(2, 4);
1041 |     }
1042 | 
1043 |     #[test]
1044 |     fn wtf8_slice_from() {
1045 |         assert_eq!(&Wtf8::from_str("aé 💩").slice_from(1).bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
1046 |     }
1047 | 
1048 |     #[test]
1049 |     #[should_panic]
1050 |     fn wtf8_slice_from_not_code_point_boundary() {
1051 |         Wtf8::from_str("aé 💩").slice_from(2);
1052 |     }
1053 | 
1054 |     #[test]
1055 |     fn wtf8_slice_to() {
1056 |         assert_eq!(&Wtf8::from_str("aé 💩").slice_to(4).bytes, b"a\xC3\xA9 ");
1057 |     }
1058 | 
1059 |     #[test]
1060 |     #[should_panic]
1061 |     fn wtf8_slice_to_not_code_point_boundary() {
1062 |         Wtf8::from_str("aé 💩").slice_from(5);
1063 |     }
1064 | 
1065 |     #[test]
1066 |     fn wtf8_ascii_byte_at() {
1067 |         let slice = Wtf8::from_str("aé 💩");
1068 |         assert_eq!(slice.ascii_byte_at(0), b'a');
1069 |         assert_eq!(slice.ascii_byte_at(1), b'\xFF');
1070 |         assert_eq!(slice.ascii_byte_at(2), b'\xFF');
1071 |         assert_eq!(slice.ascii_byte_at(3), b' ');
1072 |         assert_eq!(slice.ascii_byte_at(4), b'\xFF');
1073 |     }
1074 | 
1075 |     #[test]
1076 |     fn wtf8_code_points() {
1077 |         fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
1078 |         fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
1079 |             string.code_points().map(|c| c.to_char()).collect::<Vec<_>>()
1080 |         }
1081 |         let mut string = Wtf8Buf::from_str("é ");
1082 |         assert_eq!(cp(&string), vec![Some('é'), Some(' ')]);
1083 |         string.push(c(0xD83D));
1084 |         assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]);
1085 |         string.push(c(0xDCA9));
1086 |         assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]);
1087 |     }
1088 | 
1089 |     #[test]
1090 |     fn wtf8_as_str() {
1091 |         assert_eq!(Wtf8::from_str("").as_str(), Some(""));
1092 |         assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
1093 |         let mut string = Wtf8Buf::new();
1094 |         string.push(CodePoint::from_u32(0xD800).unwrap());
1095 |         assert_eq!(string.as_str(), None);
1096 |     }
1097 | 
1098 |     #[test]
1099 |     fn wtf8_to_string_lossy() {
1100 |         assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
1101 |         assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
1102 |         let mut string = Wtf8Buf::from_str("aé 💩");
1103 |         string.push(CodePoint::from_u32(0xD800).unwrap());
1104 |         assert_eq!(string.to_string_lossy(), {
1105 |             let o: Cow<str> = Cow::Owned(String::from("aé 💩�"));
1106 |             o
1107 |         });
1108 |     }
1109 | 
1110 |     #[test]
1111 |     fn wtf8_to_ill_formed_utf16() {
1112 |         let mut string = Wtf8Buf::from_str("aé ");
1113 |         string.push(CodePoint::from_u32(0xD83D).unwrap());
1114 |         string.push_char('💩');
1115 |         assert_eq!(string.to_ill_formed_utf16().collect::<Vec<_>>(),
1116 |                    vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);
1117 |     }
1118 | }
1119 | 


--------------------------------------------------------------------------------