├── .gitignore ├── fuzz ├── .gitignore ├── fuzz_targets │ ├── utf8char_from_slice_start.rs │ ├── utf16char_decoding_iterators.rs │ └── utf8char_decoding_iterators.rs └── Cargo.toml ├── AUTHORS.md ├── LICENSE-MIT ├── Cargo.toml ├── tests ├── exhaustive.rs ├── iterators.rs ├── errs.rs └── oks.rs ├── .cirrus.yml ├── examples └── length_distribution.rs ├── do.sh ├── src ├── lib.rs ├── utf16_iterators.rs ├── utf8_iterators.rs ├── errors.rs ├── decoding_iterators.rs ├── utf8_char.rs └── utf16_char.rs ├── README.md ├── benches ├── multiiterators.rs └── length.rs ├── RELEASES.md └── LICENSE-APACHE /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | benches/texts/ 4 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target 3 | corpus 4 | artifacts 5 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # The encode_unicode Developers 2 | 3 | * Torbjørn Birch Moltu 4 | * Aljoscha Meyer 5 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/utf8char_from_slice_start.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | #[macro_use] 3 | extern crate libfuzzer_sys; 4 | extern crate encode_unicode; 5 | 6 | use encode_unicode::Utf8Char; 7 | 8 | fuzz_target!(|data: &[u8]| { 9 | if data.len() > 0 { 10 | // validate the result of encode_unicode against the std library 11 | match Utf8Char::from_slice_start(data) { 12 | Err(_) => assert!(std::str::from_utf8(data).is_err()), 13 | Ok((c, len)) => assert_eq!(c.as_str(), std::str::from_utf8(&data[..len]).unwrap()), 14 | } 15 | } 16 | }); 17 | 
-------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | 2 | [package] 3 | name = "encode_unicode-fuzz" 4 | version = "0.0.1" 5 | authors = ["Automatically generated"] 6 | publish = false 7 | 8 | [package.metadata] 9 | cargo-fuzz = true 10 | 11 | [dependencies.encode_unicode] 12 | path = ".." 13 | [dependencies.libfuzzer-sys] 14 | git = "https://github.com/rust-fuzz/libfuzzer-sys.git" 15 | 16 | # Prevent this from interfering with workspaces 17 | [workspace] 18 | members = ["."] 19 | 20 | [[bin]] 21 | name = "utf8char_from_slice_start" 22 | path = "fuzz_targets/utf8char_from_slice_start.rs" 23 | 24 | [[bin]] 25 | name = "utf8char_decoding_iterators" 26 | path = "fuzz_targets/utf8char_decoding_iterators.rs" 27 | 28 | [[bin]] 29 | name = "utf16char_decoding_iterators" 30 | path = "fuzz_targets/utf16char_decoding_iterators.rs" 31 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all 9 | copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE 18 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "encode_unicode" 3 | keywords = ["unicode","UTF-8","UTF-16"] 4 | categories = ["encoding","no-std"] 5 | description = """ 6 | UTF-8 and UTF-16 character types, iterators and related methods for char, u8 and u16. 7 | """ 8 | readme = "README.md" 9 | version = "1.0.0" 10 | license = "Apache-2.0 OR MIT" 11 | repository = "https://github.com/tormol/encode_unicode" 12 | documentation = "https://docs.rs/encode_unicode/" 13 | authors = ["Torbjørn Birch Moltu "] 14 | edition = "2021" 15 | 16 | [dependencies.ascii] 17 | optional = true 18 | version = "^1.0.0" 19 | default-features = false # don't need std for the parts we use 20 | 21 | [target.'cfg(unix)'.dev-dependencies] 22 | lazy_static = "^1.0" 23 | 24 | [dev-dependencies.minreq] 25 | version = "^2.6" 26 | features = ["https-native"] 27 | 28 | [features] 29 | std = [] 30 | default = ["std"] 31 | 32 | [[bench]] 33 | name="length" 34 | required-features = ["std"] 35 | 36 | [badges.maintenance] 37 | status = "passively-maintained" 38 | # Too low activity for is-it-maintained-issue-resolution 39 | 40 | [package.metadata.docs.rs] 41 | features = ["ascii/std"] 42 | -------------------------------------------------------------------------------- /tests/exhaustive.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. 
This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | //! Tests that try all possible values for at least one parameter / byte / unit 10 | //! of the tested function. 11 | 12 | use core::char; 13 | extern crate encode_unicode; 14 | use encode_unicode::*; 15 | 16 | #[test] 17 | fn from_ascii() { 18 | for cp in 0u32..256 { 19 | assert_eq!(Utf8Char::from_ascii(cp as u8).is_ok(), cp & 0x80 == 0); 20 | if let Ok(u8c) = Utf8Char::from_ascii(cp as u8) { 21 | assert_eq!(u8c, Utf8Char::from(cp as u8 as char)); 22 | } 23 | } 24 | } 25 | 26 | #[test] 27 | #[cfg_attr(miri, ignore)] 28 | fn from_bmp() { 29 | for cp in 0u32..0x1_00_00 { 30 | assert_eq!( 31 | Utf16Char::from_bmp(cp as u16).ok(), 32 | char::from_u32(cp).map(Utf16Char::from) 33 | ); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/utf16char_decoding_iterators.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | #[macro_use] extern crate libfuzzer_sys; 3 | extern crate encode_unicode; 4 | 5 | use encode_unicode::{IterExt, SliceExt, Utf16Char}; 6 | use std::char; 7 | 8 | fuzz_target!(|data: &[u8]| { 9 | if data.len() % 2 != 0 { 10 | return; 11 | } 12 | let data = (0..data.len()/2).into_iter() 13 | .map(|i| ((data[i*2] as u16) << 8) | (data[i*2+1] as u16) ) 14 | .collect::>(); 15 | 16 | let from_units: Vec<_> = data.iter().to_utf16chars().collect(); 17 | let from_slice: Vec<_> = data.utf16char_indices().collect(); 18 | 19 | let mut surrogates = 0; 20 | for (i, (&ur, &(offset,sr,len))) in from_units.iter().zip(&from_slice).enumerate() { 21 | assert_eq!(sr, ur, "{} (data: +{})", i, surrogates); 22 | assert_eq!(offset, i+surrogates); 23 | let unit = data[i+surrogates]; 24 | if let Some(c) = char::from_u32(unit as u32) { 25 | assert_eq!(ur, Ok(Utf16Char::from(c)), "{} (data: +{})", i, surrogates); 26 | assert_eq!(len, 1); 27 | } else { 28 | 
assert_eq!(char::from_u32(unit as u32), None); 29 | if let Ok(u16c) = ur { 30 | surrogates += 1; 31 | assert_eq!(char::from_u32(data[i+surrogates] as u32), None); 32 | assert_eq!(len, 2); 33 | assert!(u16c.to_char() > '\u{ffff}'); 34 | } else { 35 | assert_eq!(len, 1); 36 | } 37 | } 38 | } 39 | assert_eq!(from_units.len(), data.len()-surrogates); 40 | assert_eq!(from_slice.len(), data.len()-surrogates); 41 | }); 42 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/utf8char_decoding_iterators.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | #[macro_use] 3 | extern crate libfuzzer_sys; 4 | extern crate encode_unicode; 5 | 6 | use encode_unicode::{IterExt, SliceExt, U8UtfExt, Utf8Char}; 7 | use encode_unicode::error::Utf8ErrorKind::*; 8 | use std::str; 9 | 10 | fuzz_target!(|data: &[u8]| { 11 | let from_bytes: Vec<_> = data.iter().to_utf8chars().collect(); 12 | 13 | let mut byte_start = 0; 14 | let mut item_start = 0; 15 | loop { 16 | let (valid_up_to, error_length) = match str::from_utf8(&data[byte_start..]) { 17 | Ok(s) => (s.len(), None), 18 | Err(e) => (e.valid_up_to(), e.error_len()), 19 | }; 20 | let valid_range = byte_start..byte_start+valid_up_to; 21 | let good_part = str::from_utf8(&data[valid_range]).unwrap(); 22 | let mut chars = 0; 23 | for (i,c) in good_part.chars().enumerate() { 24 | chars += 1; 25 | assert_eq!(from_bytes.get(item_start+i), Some(&Ok(Utf8Char::from(c)))); 26 | } 27 | 28 | let error_start = item_start + chars; 29 | if let Some(error_length) = error_length { 30 | let error_end = error_start + error_length; 31 | assert!(from_bytes[error_start..error_end].iter().all(|r| r.is_err() )); 32 | item_start = error_end; 33 | byte_start = byte_start + valid_up_to + error_length; 34 | } else if byte_start + valid_up_to == data.len() { 35 | assert_eq!(from_bytes.len(), error_start); 36 | break; 37 | } else { 38 | data[byte_start + 
valid_up_to].extra_utf8_bytes().unwrap(); 39 | assert_eq!(from_bytes.len() - error_start, data.len() - valid_up_to - byte_start); 40 | assert_eq!(from_bytes[error_start].map_err(|e| e.kind() ), Err(TooFewBytes)); 41 | break; 42 | } 43 | } 44 | 45 | let from_slice: Vec<_> = data.utf8char_indices().map(|(_,r,_)| r ).collect(); 46 | for (i, (&br, &sr)) in from_bytes.iter().zip(&from_slice).enumerate() { 47 | match sr { 48 | // the slice-based iterator might detect too short earlier, 49 | // but that should be the only difference 50 | Err(e) if e.kind() == TooFewBytes || e.kind() == InterruptedSequence 51 | => assert!(br.is_err(), "byte {}", i), 52 | _ => assert_eq!(sr, br, "byte {}", i), 53 | } 54 | } 55 | assert_eq!(from_slice.len(), from_bytes.len()); 56 | }); 57 | -------------------------------------------------------------------------------- /.cirrus.yml: -------------------------------------------------------------------------------- 1 | task: 2 | name: stable 3 | container: 4 | image: rust 5 | cpu: 1 6 | memory: 1G 7 | allow_failures: false 8 | env: 9 | RUST_BACKTRACE: 1 10 | cargo_cache: 11 | folder: $HOME/.cargo/registry 12 | fingerprint_script: cat Cargo.lock 2> /dev/null || true 13 | target_cache: 14 | folder: target 15 | fingerprint_script: cat Cargo.lock 2> /dev/null || true 16 | setup_script: 17 | - rustup component add clippy 18 | info_script: 19 | - rustc --version 20 | check_script: 21 | - cargo check --examples --tests --no-default-features 22 | - cargo check --examples --tests --no-default-features --features std 23 | - cargo check --examples --tests --no-default-features --features ascii 24 | - cargo build --examples --tests --all-features 25 | - cargo clippy --tests --examples --all-features 26 | test_script: 27 | - cargo test --all-features --no-fail-fast -- --test-threads=1 28 | before_cache_script: 29 | - rm -rf $HOME/.cargo/registry/index 30 | 31 | task: 32 | name: MSRV 33 | container: 34 | image: rust:1.56 35 | cpu: 1 36 | memory: 1G 37 | 
allow_failures: false 38 | env: 39 | RUST_BACKTRACE: 1 40 | cargo_cache: 41 | folder: $HOME/.cargo/registry 42 | fingerprint_script: cat Cargo.lock 2> /dev/null || true 43 | target_cache: 44 | folder: target 45 | fingerprint_script: cat Cargo.lock 2> /dev/null || true 46 | info_script: 47 | - rustc --version 48 | build_script: 49 | # Lock to the specified minor versions of dependencies 50 | # to test that they work with our MSRV. 51 | # But that doesn't cover recursive dependencies, 52 | # so avoid checking examples and tests because they build dev dependencies. 53 | # Tests and examples don't need to work at MSRV anyway. 54 | - sed -i 's/"^/"~/' Cargo.toml 55 | - cargo check --no-default-features 56 | - cargo check --no-default-features --features std 57 | - cargo check --no-default-features --features ascii 58 | - cargo check --all-features 59 | before_cache_script: 60 | - rm -rf $HOME/.cargo/registry/index 61 | 62 | task: 63 | name: nightly 64 | container: 65 | image: rustlang/rust:nightly 66 | cpu: 1 67 | memory: 1G 68 | allow_failures: false 69 | cargo_cache: 70 | folder: $HOME/.cargo/registry 71 | fingerprint_script: cat Cargo.lock 2> /dev/null || true 72 | # rustc version is so likely to have changed that build artefacts are not worth caching 73 | setup_script: 74 | - cargo install cargo-fuzz 75 | - rustup component add miri 76 | info_script: 77 | - rustc --version 78 | check_script: 79 | - cargo check --benches --no-default-features 80 | - cargo check --benches --no-default-features --features std 81 | - cargo check --benches --no-default-features --features ascii 82 | - cargo build --benches --all-features 83 | - cargo fuzz build 84 | # fuzz supports feature selection, 85 | # but --no-default-features doesn't seem to have any effect 86 | test_script: 87 | # the doc tests are fast and should cover a lot of code 88 | - cargo miri test --all-features --doc -- --test-threads=1 89 | before_cache_script: 90 | - rm -rf $HOME/.cargo/registry/index 91 |
-------------------------------------------------------------------------------- /examples/length_distribution.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | //! Counts the number of codepoints of each UTF-8 length in files 10 | 11 | use std::env::args_os; 12 | use std::fs::File; 13 | use std::io::{self, Read, stdin}; 14 | use std::borrow::Cow; 15 | extern crate encode_unicode; 16 | use encode_unicode::U8UtfExt; 17 | 18 | #[derive(Default)] 19 | struct Distribution { 20 | bytes: usize, 21 | utf8: [usize; 4], 22 | } 23 | 24 | fn read(file: &mut dyn Read) -> (Distribution, Option<io::Error>) { 25 | let mut r = Distribution::default(); 26 | let mut buf = [0u8; 4096]; 27 | loop { 28 | let read = match file.read(&mut buf) { 29 | Ok(0) => return (r, None), 30 | Ok(n) => n, 31 | Err(e) => return (r, Some(e)), 32 | }; 33 | r.bytes += read; 34 | for (o, &b) in buf[..read].iter().enumerate() { 35 | if let Ok(i) = b.extra_utf8_bytes() { 36 | r.utf8[i] += 1; 37 | if i == 3 { 38 | let min = o.saturating_sub(20); 39 | let max = if o+23 <= read {o+23} else {read}; 40 | println!("{}", String::from_utf8_lossy(&buf[min..max])); 41 | } 42 | } 43 | } 44 | } 45 | } 46 | 47 | fn display(name_pad: usize, name: Cow<str>, 48 | r: Distribution, err: Option<io::Error>) { 49 | let c = r.utf8; 50 | let characters = c[0]+c[1]+c[2]+c[3]; 51 | let s = [c[0], c[1]*2, c[2]*3, c[3]*4]; 52 | let p = [ 53 | (s[0]*100) as f32 / r.bytes as f32, 54 | (s[1]*100) as f32 / r.bytes as f32, 55 | (s[2]*100) as f32 / r.bytes as f32, 56 | (s[3]*100) as f32 / r.bytes as f32, 57 | ]; 58 | println!("{:>6$}: bytes: {:7}, UTF-8 distribution: [{:7}, {:6}, {:6}, {:6}]", 59 | name, r.bytes, s[0], s[1], s[2], s[3], name_pad 60 | ); 61 | println!("{5:6$} chars: {:7}, 
UTF-8 percentages: [{:>6.2}%, {:>5.2}%, {:>5.2}%, {:>5.2}%]", 62 | characters, p[0], p[1], p[2], p[3], "", name_pad 63 | ); 64 | if let Some(err) = err { 65 | println!("{1:2$} {}", err, "", name_pad); 66 | } 67 | } 68 | 69 | fn main() { 70 | let name_length = args_os().skip(1) 71 | .map(|path| path.to_string_lossy().chars().count() ) 72 | .max(); 73 | for path in args_os().skip(1) { 74 | let name = path.to_string_lossy(); 75 | let (r,err) = match File::open(&path) { 76 | Ok(mut file) => read(&mut file), 77 | Err(err) => { 78 | eprintln!("{}:\t{}", name, err); 79 | continue; 80 | } 81 | }; 82 | display(name_length.unwrap(), name, r, err); 83 | } 84 | if name_length.is_none() { 85 | let stdin = stdin(); 86 | let (r,err) = read(&mut stdin.lock()); 87 | display(0, Cow::Borrowed("stdin"), r, err); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /do.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e -o pipefail 3 | 4 | MSRV=1.56.1 5 | FUZZ_DURATION=60 6 | FUZZ_PAUSE=2 7 | 8 | if [[ ${1:0:1} == - || $1 == help ]] || (( $# > 1 )); then 9 | echo "A script to make it easy to check & lint & test everything." >&2 10 | echo "It assumes rustup is installed and that cargo +release works." >&2 11 | echo >&2 12 | echo "Usage: $0 ([setup|MSRV|check|test|ignored|clippy|miri|fuzz|bench|shellcheck|help])" >&2 13 | echo "If no argument is provided, all parts except ignored and help are run," >&2 14 | echo "but setup is only done if auto-detection fails." >&2 15 | exit 1 16 | fi 17 | 18 | # should have been a Makefile 19 | 20 | # core check, Minimum supported Rust version 21 | if [[ $1 == setup ]] || ! 
rustup show | grep --silent "$MSRV"; then 22 | rustup install "$MSRV" --no-self-update 23 | fi 24 | if [[ -z $1 || $1 == msrv || $1 == MSRV ]]; then # the usage text spells the argument MSRV, so accept both cases 25 | # FIXME modify Cargo.toml like on CI, and then restore it and Cargo.lock afterwards 26 | cargo "+$MSRV" build --all-features 27 | fi 28 | 29 | # check all feature combinations, stable 30 | if [[ $1 == setup ]] || ! rustup show | grep --silent stable; then 31 | rustup install stable --no-self-update 32 | fi 33 | if [[ -z $1 || $1 == check ]]; then 34 | cargo +stable check --examples --tests --no-default-features 35 | cargo +stable check --examples --tests --no-default-features --features std 36 | cargo +stable check --examples --tests --no-default-features --features ascii 37 | cargo +stable check --examples --tests --all-features 38 | fi 39 | 40 | # tests, stable 41 | if [[ -z $1 || $1 == test ]]; then 42 | cargo +stable test --all-features -- --quiet 43 | elif [[ $1 == ignored ]]; then 44 | cargo +stable test --all-features -- --quiet --ignored 45 | fi 46 | 47 | # clippy, nightly 48 | if [[ $1 == setup ]] || ! rustup show | grep --silent nightly; then 49 | rustup install nightly --no-self-update 50 | fi 51 | if [[ $1 == setup ]] || ! cargo +nightly help clippy >/dev/null 2>/dev/null; then 52 | rustup component add clippy --toolchain nightly 53 | fi 54 | if [[ -z $1 || $1 == clippy ]]; then 55 | cargo +nightly clippy --all-features --tests --benches --examples 56 | fi 57 | 58 | # miri, nightly 59 | if [[ $1 == setup ]] || ! cargo +nightly help miri >/dev/null 2>/dev/null; then 60 | rustup component add miri --toolchain nightly 61 | cargo +nightly miri setup 62 | fi 63 | if [[ -z $1 || $1 == miri ]]; then 64 | cargo +nightly miri test --all-features -- --quiet 65 | fi 66 | 67 | # fuzzing tests, nightly 68 | if [[ $1 == setup ]] || !
command -V cargo-fuzz >/dev/null 2>/dev/null; then 69 | cargo +nightly install cargo-fuzz 70 | fi 71 | if [[ -z $1 || $1 == fuzz ]]; then 72 | cargo +nightly fuzz build 73 | for fuzztest in $(cargo +nightly fuzz list); do 74 | sleep "$FUZZ_PAUSE" 75 | echo "Fuzzing $fuzztest" 76 | timeout "$FUZZ_DURATION" \ 77 | cargo +nightly fuzz run "$fuzztest" \ 78 | || true 79 | echo 80 | done 81 | fi 82 | 83 | # benchmarks, nightly 84 | if [[ -z $1 || $1 == bench ]]; then 85 | cargo +nightly check --benches --no-default-features 86 | cargo +nightly check --benches --no-default-features --features std 87 | cargo +nightly check --benches --no-default-features --features ascii 88 | cargo +nightly check --benches --all-features 89 | # need nocapture to not hide error if setup fails 90 | cargo +nightly bench --all-features -- --nocapture 91 | fi 92 | 93 | if [[ $1 == shellcheck || $1 == selfcheck ]] \ 94 | || ([[ -z $1 ]] && command -V shellcheck >/dev/null 2>/dev/null); then 95 | shellcheck "$0" 96 | fi 97 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2016-2022 Torbjørn Birch Moltu 2 | * Copyright 2018 Aljoscha Meyer 3 | * 4 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 7 | * copied, modified, or distributed except according to those terms. 8 | */ 9 | 10 | 11 | /*! 12 | Miscellaneous UTF-8 and UTF-16 types and methods. 13 | 14 | # Optional features: 15 | * `#![no_std]`-mode: There are a few differences: 16 | * `Error` doesn't exist, but `description()` is made available as an inherent impl. 17 | * `Extend`/`FromIterator`-implementations for `String`/`Vec`/`Vec` are missing. 18 | * There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` doesn't implement `Read`. 
19 | 20 | This feature is enabled by setting `default-features=false` in `Cargo.toml`: 21 | `encode_unicode = {version="1.0", default-features=false}` 22 | * Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate: 23 | Convert `Utf8Char` and `Utf16Char` to and from 24 | [`ascii::AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html). 25 | 26 | # Minimum supported Rust version 27 | 28 | The minimum supported Rust version for 1.0.\* releases is 1.56. 29 | Later 1.y.0 releases might require newer Rust versions, but the three most 30 | recent stable releases at the time of publishing will always be supported. 31 | For example this means that if the current stable Rust version is 1.66 when 32 | `encode_unicode` 1.1.0 is released, then `encode_unicode` 1.1.\* will 33 | not require a newer Rust version than 1.63. 34 | 35 | [crates.io page](https://crates.io/crates/encode_unicode) 36 | [github repository](https://github.com/tormol/encode_unicode) 37 | 38 | */ 39 | 40 | #![cfg_attr(not(feature="std"), no_std)] 41 | 42 | #![warn(missing_docs, unsafe_op_in_unsafe_fn)] 43 | #![allow( 44 | clippy::unusual_byte_groupings,// I sometimes group into UTF-8 control part and codepoint part 45 | clippy::derive_hash_xor_eq,// tested 46 | clippy::len_without_is_empty,// the character types are never empty 47 | clippy::needless_return,// `foo.bar();\n foo` looks unfinished 48 | clippy::redundant_closure_call,// not redundant in macros 49 | clippy::cast_lossless,// the sizes are part of the struct name and so won't change 50 | clippy::many_single_char_names,// the variables are in different scopes 51 | clippy::cmp_owned,// smaller than pointer, and no allocations anyway 52 | clippy::wrong_self_convention,// smaller than pointer 53 | clippy::needless_range_loop,// the suggested iterator chains are less intuitive 54 | clippy::identity_op,// applying a set of operations with varying arguments to many elements looks nice 55
| clippy::get_first,// .get(0), .get(1) is more readable 56 | clippy::question_mark,// I prefer it very explicit 57 | )] 58 | #![warn(clippy::doc_markdown, clippy::manual_filter_map)] 59 | // opt-in lints that might be interesting to recheck once in a while: 60 | //#![warn(clippy::unwrap_used)] 61 | 62 | mod errors; 63 | mod traits; 64 | mod utf8_char; 65 | mod utf8_iterators; 66 | mod utf16_char; 67 | mod utf16_iterators; 68 | mod decoding_iterators; 69 | 70 | pub use traits::{CharExt, U8UtfExt, U16UtfExt, StrExt, IterExt, SliceExt}; 71 | pub use utf8_char::Utf8Char; 72 | pub use utf16_char::Utf16Char; 73 | 74 | pub mod error {// keeping the public interface in one file 75 | //! Errors returned by various conversion methods in this crate. 76 | pub use crate::errors::{FromStrError, EmptyStrError}; 77 | pub use crate::errors::{CodepointError, NonAsciiError, NonBmpError}; 78 | pub use crate::errors::{Utf8Error, Utf8ErrorKind}; 79 | pub use crate::errors::{Utf16SliceError, Utf16ArrayError, Utf16TupleError}; 80 | pub use crate::errors::{Utf16FirstUnitError, Utf16PairError}; 81 | } 82 | 83 | pub mod iterator { 84 | //! Iterator types that you should rarely need to name 85 | pub use crate::utf8_iterators::{Utf8Iterator, Utf8CharSplitter, Utf8Chars, Utf8CharIndices}; 86 | pub use crate::utf16_iterators::{Utf16Iterator, Utf16CharSplitter, Utf16Chars, Utf16CharIndices}; 87 | pub use crate::decoding_iterators::{Utf8CharMerger, Utf8CharDecoder}; 88 | pub use crate::decoding_iterators::{Utf16CharMerger, Utf16CharDecoder}; 89 | } 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # encode_unicode 2 | 3 | UTF-8 and UTF-16 character types, iterators and related methods for `char`, `u8` and `u16`. 
4 | 5 | [![crates.io page](https://img.shields.io/crates/v/encode_unicode.svg)](https://crates.io/crates/encode_unicode/) ![License: Apache-2 or MIT](https://img.shields.io/crates/l/encode_unicode.svg) [![Documentation on docs.rs](https://docs.rs/encode_unicode/badge.svg)](https://docs.rs/encode_unicode/) [![CI build status](https://api.cirrus-ci.com/github/tormol/encode_unicode.svg)](https://cirrus-ci.com/github/tormol/encode_unicode) 6 | 7 | ## Features 8 | 9 | * **[`Utf8Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf8Char.html)**: 10 | A `char` stored as UTF-8. Can be borrowed as a `str` or `u8` slice. 11 | * **[`Utf16Char`](https://docs.rs/encode_unicode/latest/encode_unicode/struct.Utf16Char.html)**: 12 | A `char` stored as UTF-16. Can be borrowed as a `u16` slice. 13 | * [Conversion methods on `char`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.CharExt.html): 14 | * to and from UTF-8 as `[u8; 4]` or slice. 15 | * to and from UTF-16 as `(u16, Option<u16>)` or slice. 16 | * [Iterator adapters](https://docs.rs/encode_unicode/latest/encode_unicode/trait.IterExt.html) 17 | for converting between `u8`s and `Utf8Char`s or `u16`s and `Utf16Char`s. 18 | * Optimized [slice-based decoding iterators](https://docs.rs/encode_unicode/latest/encode_unicode/trait.SliceExt.html). 19 | * [Precise errors when decoding a char from UTF-8, UTF-16 or `u32` fails](https://docs.rs/encode_unicode/latest/encode_unicode/error/index.html). 20 | * Utility methods on [`u8`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U8UtfExt.html) 21 | and [`u16`](https://docs.rs/encode_unicode/latest/encode_unicode/trait.U16UtfExt.html). 22 | 23 | ## Minimum supported Rust version 24 | 25 | The minimum supported Rust version for 1.0.\* releases is 1.56. 26 | Later 1.y.0 releases might require newer Rust versions, but the three most 27 | recent stable releases at the time of publishing will always be supported.
28 | For example this means that if the current stable Rust version is 1.66 when 29 | encode_unicode 1.1.0 is released, then encode_unicode 1.1.\* will 30 | not require a newer Rust version than 1.63. 31 | 32 | ## Optional features 33 | 34 | * `#![no_std]`-mode: There are a few differences: 35 | * `Error` doesn't exist, but `description()` is made available as an inherent impl. 36 | * `Extend`/`FromIterator`-implementations for `String`/`Vec<u8>`/`Vec<u16>` are missing. 37 | * There is no `io`, so `Utf8Iterator` and `Utf8CharSplitter` don't implement `Read`. 38 | This feature is enabled by setting `default-features=false` in `Cargo.toml`: 39 | `encode_unicode = {version="1.0", default-features=false}`. 40 | * Integration with the [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) crate: 41 | Convert `Utf8Char` and `Utf16Char` to and from [`ascii::AsciiChar`](https://tomprogrammer.github.io/rust-ascii/ascii/enum.AsciiChar.html). 42 | 43 | ## License 44 | 45 | Licensed under either of 46 | 47 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 48 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 49 | 50 | at your option. 51 | 52 | ### Contribution 53 | 54 | Unless you explicitly state otherwise, any contribution intentionally 55 | submitted for inclusion in the work by you, as defined in the Apache-2.0 56 | license, shall be dual licensed as above, without any additional terms or 57 | conditions. 58 | 59 | ## Developing 60 | 61 | `do.sh` can be used to check all feature combinations, test everything, show output from benchmarks in case setup fails, run fuzz tests for a while and lint everything (except fuzz tests). 62 | It assumes [rustup](https://rustup.rs) is installed and that [`cargo +release`](https://rust-lang.github.io/rustup/concepts/index.html#how-rustup-works) works.
63 | (It is named the way it is to autocomplete fully from the first character after `./`.) 64 | 65 | ## History 66 | 67 | The original purpose of this crate was to provide standins for the then 68 | unstable `encode_utf8()` and `encode_utf16()` methods on `char`. 69 | The standins were removed in version 0.3 when Rust 1.15 stabilized the 70 | `encode_` methods, but the other stuff I added, such as iterators like 71 | those `encode_utf{8,16}()` returned for a while, might still be of use. 72 | -------------------------------------------------------------------------------- /benches/multiiterators.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | // Run with -- --nocapture to show error messages if setup fails. 
10 | // (or use ./do.sh) 11 | 12 | // uses /usr/share/dict/ for text to convert to Vec<Utf*Char> and iterate over 13 | #![cfg(all(unix, feature="std"))] 14 | #![feature(test)] 15 | extern crate test; 16 | use test::{Bencher, black_box}; 17 | #[macro_use] extern crate lazy_static; 18 | extern crate encode_unicode; 19 | use encode_unicode::{CharExt, Utf8Char, Utf16Char, IterExt}; 20 | 21 | fn read_or_exit(file: &str) -> String { 22 | let mut fd = std::fs::File::open(file).unwrap_or_else(|err| { 23 | if err.kind() == std::io::ErrorKind::NotFound { 24 | eprintln!("{} not found, skipping benchmarks.", file); 25 | std::process::exit(0); 26 | } else { 27 | eprintln!("Failed to open {}: {}.", file, err); 28 | std::process::exit(1); 29 | } 30 | }); 31 | let mut content = String::new(); 32 | std::io::Read::read_to_string(&mut fd, &mut content).unwrap_or_else(|err| { 33 | eprintln!("Failed to read {}: {}.", file, err); 34 | std::process::exit(1); 35 | }); 36 | content 37 | } 38 | 39 | lazy_static!{ 40 | // TODO find a big chinese file; `aptitude search '?provides(wordlist)'` didn't have one 41 | static ref ENGLISH: String = read_or_exit("/usr/share/dict/american-english"); 42 | static ref UTF8CHARS: Vec<Utf8Char> = ENGLISH.chars().map(|c| c.to_utf8() ).collect(); 43 | static ref UTF16CHARS: Vec<Utf16Char> = ENGLISH.chars().map(|c| c.to_utf16() ).collect(); 44 | } 45 | 46 | 47 | #[bench] 48 | fn utf16_split_all_single_mulititerator(b: &mut Bencher) { 49 | b.iter(|| { 50 | black_box(&*UTF16CHARS).iter().to_units().for_each(|u| assert!(u != 0) ); 51 | }); 52 | } 53 | #[bench] 54 | fn utf16_split_all_single_flatmap(b: &mut Bencher) { 55 | b.iter(|| { 56 | black_box(&*UTF16CHARS).iter().flat_map(|&u16c| u16c ).for_each(|u| assert!(u != 0) ); // flat_map, so this is not a copy of the cloned_flatten benchmark 57 | }); 58 | } 59 | #[bench] 60 | fn utf16_split_all_single_cloned_flatten(b: &mut Bencher) { 61 | b.iter(|| { 62 | black_box(&*UTF16CHARS).iter().cloned().flatten().for_each(|u| assert!(u != 0) ); 63 | }); 64 | } 65 | 66 | 67 | #[bench] 68 | fn 
utf8_split_mostly_ascii_multiiterator(b: &mut Bencher) { 69 | b.iter(|| { 70 | black_box(&*UTF8CHARS).iter().to_bytes().for_each(|b| assert!(b != 0) ); 71 | }); 72 | } 73 | #[bench] 74 | fn utf8_split_mostly_ascii_flatmap(b: &mut Bencher) { 75 | b.iter(|| { 76 | black_box(&*UTF8CHARS).iter().cloned().flatten().for_each(|b| assert!(b != 0) ); 77 | }); 78 | } 79 | #[bench] 80 | fn utf8_split_mostly_ascii_cloned_flatten(b: &mut Bencher) { 81 | b.iter(|| { 82 | black_box(&*UTF8CHARS).iter().cloned().flatten().for_each(|b| assert!(b != 0) ); 83 | }); 84 | } 85 | 86 | 87 | #[bench] 88 | fn utf8_extend_mostly_ascii_multiiterator(b: &mut Bencher) { 89 | b.iter(|| { 90 | let vec: Vec = black_box(&*UTF8CHARS).iter().to_bytes().collect(); 91 | assert_eq!(black_box(vec).len(), ENGLISH.len()); 92 | }); 93 | } 94 | #[bench] 95 | fn utf8_extend_mostly_ascii_custom(b: &mut Bencher) { 96 | b.iter(|| { 97 | let vec: Vec = black_box(&*UTF8CHARS).iter().collect(); 98 | assert_eq!(black_box(vec).len(), ENGLISH.len()); 99 | }); 100 | } 101 | #[bench] 102 | fn utf8_extend_mostly_ascii_custom_str(b: &mut Bencher) { 103 | b.iter(|| { 104 | let vec: String = black_box(&*UTF8CHARS).iter().cloned().collect(); 105 | assert_eq!(black_box(vec).len(), ENGLISH.len()); 106 | }); 107 | } 108 | 109 | #[bench] 110 | fn utf16_extend_all_single_multiiterator(b: &mut Bencher) { 111 | b.iter(|| { 112 | let vec: Vec = black_box(&*UTF16CHARS).iter().to_units().collect(); 113 | assert!(black_box(vec).len() < ENGLISH.len()); 114 | }); 115 | } 116 | #[bench] 117 | fn utf16_extend_all_single_custom(b: &mut Bencher) { 118 | b.iter(|| { 119 | let vec: Vec = black_box(&*UTF16CHARS).iter().collect(); 120 | assert!(black_box(vec).len() < ENGLISH.len()); 121 | }); 122 | } 123 | -------------------------------------------------------------------------------- /RELEASES.md: -------------------------------------------------------------------------------- 1 | Version 1.0.0 (2022-08-07) 2 | ========================== 3 | * 
Replace error types `InvalidUtf8Array`, `InvalidUtf8Slice`, `InvalidUtf8FirstByte` and `InvalidUtf8` with `Utf8Error` plus `Utf8ErrorKind`. 4 | Which of the new error kind variants is reported doesn't map 1:1 to the old enum variants: 5 | For example `Utf8ErrorKind::NonUtf8Byte` is returned for sequences that would previously have been reported as too high codepoint or overlong encoding. 6 | * Rename many other error types for consistency: 7 | * `InvalidCodepoint` -> `CodepointError` 8 | * `InvalidUtf16FirstUnit` -> `Utf16FirstUnitError` 9 | * `InvalidUtf16Array` -> `Utf16ArrayError` 10 | * `InvalidUtf16Slice` -> `Utf16SliceError` 11 | * `InvalidUtf16Tuple` -> `Utf16TupleError` 12 | * Change return type of `CodepointError::error_range()` to `RangeInclusive`. 13 | * Rename some error variants: 14 | * `Utf16SliceError::FirstLowSurrogate` -> `FirstIsTrailingSurrogate` 15 | * `Utf16SliceError::SecondNotLowSurrogate` -> `SecondIsNotTrailingSurrogate` 16 | * `Utf16TupleError::InvalidSecond` -> `SecondIsNotTrailingSurrogate` 17 | * Expose the error type of `Utf16Char::from_bmp()` and rename it to `NonBmpError`. 18 | * Remove re-exports of `Utf8CharIterator` and `Utf16CharIterator` from the crate root. 19 | (They are still exposed via the `iterator` module.) 20 | * Remove impls of the deprecated `AsciiExt` trait, 21 | and make the methods available in `#![no_std]`-mode. 22 | * Make many of the previously `AsciiExt` methods take self by value. 23 | * Drop support for pre-1.0 versions of the ascii crate. 24 | * Remove `iter_bytes()` and `iter_units()`. 25 | * Increase minimum Rust version to 1.56 and change the minimum Rust version policy. 26 | * Fix possible UB or panic in `Utf8Char::from_slice_start_unchecked()` when passed an empty slice. 27 | (relates to [#12](https://github.com/tormol/encode_unicode/issues/12).) 28 | * Make many methods `const fn`. 29 | * Add `const fn`s `Utf8Char::new()` and `Utf16Char::new()`. 
30 | 31 | Version 0.3.6 (2019-08-23) 32 | ========================== 33 | * Fix pointless undefined behavior in `Utf16Char.to_ascii_char()` (which is part of the ascii feature) 34 | * Widen ascii version requirement to include 1.\*. 35 | * Add `[u16; 2]` UTF-16 array alternatives to `(u16, Some(u16))` UTF-16 tuple methods. 36 | * Add `Utf16Char.is_bmp()`. 37 | 38 | Version 0.3.5 (2018-10-23) 39 | ========================== 40 | * Fix docs.rs build failure 41 | 42 | Version 0.3.4 (2018-10-23) 43 | ========================== 44 | * Fix UB in UTF-8 validation which led to invalid codepoints being accepted in release mode. 45 | * Add fallible decoding iterator adapters `Utf8CharMerger` and `Utf16CharMerger` 46 | and slice-based iterators `Utf8CharDecoder` and `Utf16CharDecoder` 47 | * Widen ascii version requirement from 0.8.\* to 0.8.0 - 0.10.\* 48 | * Implement creating / extending `String`s from `Utf16Char`-producing iterators 49 | 50 | Version 0.3.3 (2018-10-16) 51 | ========================== 52 | * Fix UTF-8 overlong check. (`from_array()` and `from_slice()` accepted two-byte encodings of ASCII characters >= '@', which includes all letters) 53 | * Implement `FromStr` for `Utf16Char` 54 | * Add `from_str_start()` to `Utf8Char` and `Utf16Char` 55 | * Add `Utf{8,16}Char{s,Indices}`: `str`-based iterators for `Utf8Char` and `Utf16Char` equivalent to `char`'s `Chars` and `CharIndices`. 56 | * Add `StrExt` with functions to create the above iterators. 57 | * Implement `FromIterator` and `Extend` for `Vec<{u8,u16}>` with reference-producing `Utf{8,16}Char` iterators too. 58 | * Add `Utf8CharSplitter` and `Utf16CharSplitter`: `Utf{8,16}Char`-to-`u{8,16}` iterator adapters. 59 | * Add `IterExt`, `iter_bytes()` and `iter_units()` to create the above splitting iterators. 60 | * Add `Utf8Char::from_ascii()`, `Utf16Char::from_bmp()` with `_unchecked` versions of both. 61 | * Add cross-type `PartialEq` and `PartialOrd` implementations. 
62 | * Change the `description()` for a few error types. 63 | 64 | Version 0.3.2 (2018-08-08) 65 | ========================== 66 | * Hide `AsciiExt` deprecation warning and add replacement methods. 67 | * Correct documentation for `U8UtfExt::extra_utf8_bytes()`. 68 | * Fix misspellings in some error descriptions. 69 | * Avoid potentially bad transmutes. 70 | 71 | Version 0.3.1 (2017-06-16) 72 | ========================== 73 | * Implement `Display` for `Utf8Char` and `Utf16Char`. 74 | 75 | Version 0.3.0 (2017-03-29) 76 | ========================== 77 | * Replace the "no_std" feature with opt-out "std". 78 | * Upgrade ascii to v0.8. 79 | * Make tests compile on stable. 80 | * Remove `CharExt::write_utf{8,16}()` because `encode_utf{8,16}()` has been stabilized. 81 | * Return a proper error from `U16UtfExt::utf16_needs_extra_unit()` instead of `None`. 82 | * Rename `U16UtfExt::utf_is_leading_surrogate()` to `is_utf16_leading_surrogate()`. 83 | * Rename `Utf16Char::from_slice()` to `from_slice_start()` and `CharExt::from_utf{8,16}_slice()` 84 | to `from_utf{8,16}_slice_start()` to be consistent with `Utf8Char`. 85 | * Fix a bug where `CharExt::from_slice()` would accept some trailing surrogates 86 | as standalone codepoints. 87 | 88 | Version 0.2.0 (2016-07-24) 89 | ========================== 90 | * Change `CharExt::write_utf{8,16}()` to panic instead of returning `None` 91 | if the slice is too short. 92 | * Fix bug where `CharExt::write_utf8()` and `Utf8Char::to_slice()` could change bytes it shouldn't. 93 | * Rename lots of errors with search and replace: 94 | * CodePoint -> Codepoint 95 | * Several -> Multiple 96 | * Update the ascii feature to use [ascii](https://tomprogrammer.github.io/rust-ascii/ascii/index.html) v0.7. 97 | * Support `#[no_std]`; see 70e090ee for differences. 98 | * Ungate impls of `AsciiExt`. (doesn't require ascii or nightly) 99 | * Make the tests compile (and pass) again. 100 | (They still require nightly). 
101 | 102 | Version 0.1.* (2016-04-07) 103 | ========================== 104 | First release. 105 | -------------------------------------------------------------------------------- /tests/iterators.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | //! Iterator tests 10 | 11 | #![cfg(feature="std")] 12 | 13 | #![allow( 14 | clippy::needless_collect,// test one thing at a time 15 | )] 16 | 17 | extern crate encode_unicode; 18 | 19 | use encode_unicode::{IterExt, SliceExt, CharExt, Utf8Char}; 20 | use encode_unicode::iterator::Utf8CharSplitter; 21 | use encode_unicode::error::Utf8ErrorKind::*; 22 | use encode_unicode::error::Utf16PairError::*; 23 | use std::io::Read; 24 | use std::cmp::min; 25 | 26 | #[test] fn utf8charmerger() { 27 | let slice = b"\xf0\xa1\x92X\xcc\xbb"; 28 | let mut iter = slice.iter().to_utf8chars(); 29 | assert_eq!(iter.size_hint(), (1, Some(6))); 30 | assert_eq!(format!("{:?}", &iter), 31 | format!("Utf8CharMerger {{ buffered: [], inner: {:?} }}", slice.iter())); 32 | 33 | assert_eq!(iter.next().map(|v| v.map_err(|e| e.kind() ) ), Some(Err(InterruptedSequence))); 34 | assert_eq!(iter.size_hint(), (0, Some(5))); 35 | assert_eq!( 36 | format!("{:?}", &iter), 37 | format!("Utf8CharMerger {{ buffered: [161, 146, 88], inner: {:?} }}", slice[4..].iter()) 38 | ); 39 | 40 | assert_eq!(iter.next().map(|v| v.map_err(|e| e.kind() ) ), Some(Err(UnexpectedContinuationByte))); 41 | assert_eq!(iter.into_inner().next(), Some(&b'\xcc')); 42 | } 43 | 44 | #[test] fn utf8chardecoder() { 45 | let slice = b"\xf4\xbf\x80\x80XY\xcc\xbbZ_"; 46 | let mut iter = slice.utf8char_indices(); 47 | assert_eq!(iter.size_hint(), (2, Some(10))); 48 | assert_eq!( 49 | format!("{:?}", &iter), 50 | 
format!("Utf8CharDecoder {{ bytes[0..]: {:?} }}", &slice) 51 | ); 52 | 53 | match iter.next() { 54 | Some((0, Err(e), 1)) => assert_eq!(e.kind(), TooHighCodepoint), 55 | wrong => panic!("Expected Some((0, Err(TooHighCodepoint), 1), got {:?}", wrong), 56 | } 57 | assert_eq!( 58 | format!("{:?}", &iter), 59 | format!("Utf8CharDecoder {{ bytes[1..]: {:?} }}", &slice[1..]) 60 | ); 61 | assert_eq!(iter.size_hint(), (2, Some(9))); 62 | assert_eq!(iter.count(), 8); 63 | } 64 | 65 | #[test] fn utf16charmerger() { 66 | let slice = [0xd800, 'x' as u16, 0xd900, 0xdfff, 'λ' as u16]; 67 | let mut iter = slice.iter().to_utf16chars(); 68 | assert_eq!(iter.size_hint(), (2, Some(5))); 69 | assert_eq!(format!("{:?}", &iter), 70 | format!("Utf16CharMerger {{ buffered: None, inner: {:?} }}", slice.iter())); 71 | 72 | assert_eq!(iter.next(), Some(Err(UnmatchedLeadingSurrogate))); 73 | assert_eq!(iter.size_hint(), (1, Some(4))); 74 | assert_eq!( 75 | format!("{:?}", &iter), 76 | format!("Utf16CharMerger {{ buffered: Some(120), inner: {:?} }}", slice[2..].iter()) 77 | ); 78 | 79 | assert_eq!(iter.into_inner().next(), Some(&0xd900)); 80 | } 81 | 82 | #[test] fn utf16chardecoder() { 83 | let slice = [0xd800, 'x' as u16, 0xd900, 0xdfff, 'λ' as u16]; 84 | let mut iter = slice.utf16char_indices(); 85 | assert_eq!(iter.size_hint(), (2, Some(5))); 86 | assert_eq!( 87 | format!("{:?}", &iter), 88 | format!("Utf16CharDecoder {{ units[0..]: {:?} }}", &slice) 89 | ); 90 | 91 | assert_eq!(iter.next(), Some((0, Err(UnmatchedLeadingSurrogate), 1))); 92 | assert_eq!( 93 | format!("{:?}", &iter), 94 | format!("Utf16CharDecoder {{ units[1..]: {:?} }}", &slice[1..]) 95 | ); 96 | assert_eq!(iter.size_hint(), (2, Some(4))); 97 | assert_eq!(iter.count(), 3); 98 | } 99 | 100 | 101 | 102 | /// Tests for ensuring that iterators which also implement Read support 103 | /// interleaving calls of `read()` and `next()`, and that they implement Read 104 | /// correctly (support any buffer size at any time). 
105 | 106 | #[test] fn read_single_ascii() { 107 | let uc = 'a'.to_utf8(); 108 | assert_eq!(uc.len(), 1); 109 | for chunk in 1..5 { 110 | let mut buf = [b'E'; 6]; 111 | let mut iter = uc.into_iter(); 112 | let mut written = 0; 113 | for _ in 0..4 { 114 | assert_eq!(iter.read(&mut buf[..0]).unwrap(), 0); 115 | let wrote = iter.read(&mut buf[written..written+chunk]).unwrap(); 116 | assert_eq!(wrote, min(1-written, chunk)); 117 | written += wrote; 118 | for &b in &buf[written..] {assert_eq!(b, b'E');} 119 | assert_eq!(buf[..written], AsRef::<[u8]>::as_ref(&uc)[..written]); 120 | } 121 | assert_eq!(written, 1); 122 | } 123 | } 124 | 125 | #[test] fn read_single_nonascii() { 126 | let uc = 'ä'.to_utf8(); 127 | assert_eq!(uc.len(), 2); 128 | for chunk in 1..5 { 129 | let mut buf = [b'E'; 6]; 130 | let mut iter = uc.into_iter(); 131 | let mut written = 0; 132 | for _ in 0..4 { 133 | assert_eq!(iter.read(&mut buf[..0]).unwrap(), 0); 134 | let wrote = iter.read(&mut buf[written..written+chunk]).unwrap(); 135 | assert_eq!(wrote, min(2-written, chunk)); 136 | written += wrote; 137 | for &b in &buf[written..] 
{assert_eq!(b, b'E');} 138 | assert_eq!(buf[..written], AsRef::<[u8]>::as_ref(&uc)[..written]); 139 | } 140 | assert_eq!(written, 2); 141 | } 142 | } 143 | 144 | 145 | #[test] fn utf8charsplitter_read_all_sizes() { 146 | let s = "1111\u{104444}\u{222}1\u{833}1111\u{100004}"; 147 | assert!(s.len()%3 == 1); 148 | let mut buf = vec![b'E'; s.len()+6]; 149 | for size in 2..6 {//s.len()+4 { 150 | let mut reader = Utf8CharSplitter::from(s.chars().map(|c| c.to_utf8() )); 151 | for (offset, part) in s.as_bytes().chunks(size).enumerate() { 152 | let read_to = if part.len() == size {(offset+1)*size} else {buf.len()}; 153 | assert_eq!(reader.read(&mut buf[offset*size..read_to]).unwrap(), part.len()); 154 | assert_eq!(&buf[..offset*size+part.len()], &s.as_bytes()[..offset*size+part.len()]); 155 | } 156 | assert_eq!(reader.read(&mut buf[..]).unwrap(), 0); 157 | assert!(buf[s.len()..].iter().all(|&b| b==b'E' )); 158 | } 159 | } 160 | 161 | #[test] fn utf8charsplitter_alternate_iter_read() { 162 | let s = "1111\u{104444}\u{222}1\u{833}1111\u{100004}"; 163 | let mut buf = [b'0'; 10]; 164 | for n in 0..2 { 165 | // need to collect to test size_hint() 166 | // because chars().size_hint() returns ((bytes+3)/4, Some(bytes)) 167 | let u8chars = s.chars().map(|c| c.to_utf8() ).collect::>(); 168 | let mut iter = Utf8CharSplitter::from(u8chars.into_iter()); 169 | for (i, byte) in s.bytes().enumerate() { 170 | let until_next = s.as_bytes()[i..].iter().take_while(|&b| (b>>6)==0b10u8 ).count(); 171 | let remaining_chars = s[i+until_next..].chars().count(); 172 | println!("{}. 
run: byte {:02} of {}, remaining: {:02}+{}: 0b{:08b} = {:?}", 173 | n, i, s.len(), remaining_chars, until_next, byte, byte as char); 174 | assert_eq!(iter.read(&mut[][..]).unwrap(), 0); 175 | if i % 2 == n { 176 | assert_eq!(iter.next(), Some(byte)); 177 | } else { 178 | assert_eq!(iter.read(&mut buf[..1]).unwrap(), 1); 179 | assert_eq!(buf[0], byte); 180 | } 181 | } 182 | assert_eq!(iter.size_hint(), (0, Some(0))); 183 | assert_eq!(iter.next(), None); 184 | assert_eq!(iter.read(&mut buf[..]).unwrap(), 0); 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/utf16_iterators.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-2019 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | use crate::traits::CharExt; 10 | use crate::utf16_char::Utf16Char; 11 | use crate::errors::EmptyStrError; 12 | extern crate core; 13 | use core::fmt; 14 | use core::borrow::Borrow; 15 | 16 | // Invalid values that says the field is consumed or empty. 17 | const FIRST_USED: u16 = 0x_dc_00; 18 | const SECOND_USED: u16 = 0; 19 | 20 | /// Iterate over the units of the UTF-16 representation of a codepoint. 
21 | #[derive(Clone)] 22 | pub struct Utf16Iterator { 23 | first: u16, 24 | second: u16, 25 | } 26 | impl From for Utf16Iterator { 27 | fn from(c: char) -> Self { 28 | Self::from(c.to_utf16()) 29 | } 30 | } 31 | impl From for Utf16Iterator { 32 | fn from(uc: Utf16Char) -> Self { 33 | let (first, second) = uc.to_tuple(); 34 | let second = second.unwrap_or(SECOND_USED); 35 | Utf16Iterator{first, second} 36 | } 37 | } 38 | impl Iterator for Utf16Iterator { 39 | type Item=u16; 40 | fn next(&mut self) -> Option { 41 | match (self.first, self.second) { 42 | (FIRST_USED, SECOND_USED) => { None }, 43 | (FIRST_USED, second ) => {self.second = SECOND_USED; Some(second)}, 44 | (first , _ ) => {self.first = FIRST_USED; Some(first )}, 45 | } 46 | } 47 | fn size_hint(&self) -> (usize, Option) { 48 | (self.len(), Some(self.len())) 49 | } 50 | } 51 | impl ExactSizeIterator for Utf16Iterator { 52 | fn len(&self) -> usize { 53 | (if self.first == FIRST_USED {0} else {1}) + 54 | (if self.second == SECOND_USED {0} else {1}) 55 | } 56 | } 57 | impl fmt::Debug for Utf16Iterator { 58 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 59 | let mut clone = self.clone(); 60 | match (clone.next(), clone.next()) { 61 | (Some(one), None) => write!(fmtr, "[{}]", one), 62 | (Some(a), Some(b)) => write!(fmtr, "[{}, {}]", a, b), 63 | (None, _) => write!(fmtr, "[]"), 64 | } 65 | } 66 | } 67 | 68 | 69 | 70 | /// Converts an iterator of `Utf16Char` (or `&Utf16Char`) 71 | /// to an iterator of `u16`s. 72 | /// 73 | /// Is equivalent to calling `.flatten()` or `.flat_map()` on the original iterator, 74 | /// but the returned iterator is about twice as fast. 75 | /// 76 | /// The exact number of units cannot be known in advance, but `size_hint()` 77 | /// gives the possible range. 
78 | /// 79 | /// # Examples 80 | /// 81 | /// From iterator of values: 82 | /// 83 | /// ``` 84 | /// use encode_unicode::{IterExt, CharExt}; 85 | /// 86 | /// let iterator = "foo".chars().map(|c| c.to_utf16() ); 87 | /// let mut units = [0; 4]; 88 | /// iterator.to_units().zip(&mut units).for_each(|(u,dst)| *dst = u ); 89 | /// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]); 90 | /// ``` 91 | /// 92 | /// From iterator of references: 93 | /// 94 | #[cfg_attr(feature="std", doc=" ```")] 95 | #[cfg_attr(not(feature="std"), doc=" ```no_compile")] 96 | /// use encode_unicode::{IterExt, CharExt, Utf16Char}; 97 | /// 98 | /// // (💣 takes two units) 99 | /// let chars: Vec = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect(); 100 | /// let units: Vec = chars.iter().to_units().collect(); 101 | /// let flat_map: Vec = chars.iter().cloned().flatten().collect(); 102 | /// assert_eq!(units, flat_map); 103 | /// ``` 104 | #[derive(Clone)] 105 | pub struct Utf16CharSplitter, I:Iterator> { 106 | inner: I, 107 | prev_second: u16, 108 | } 109 | impl, I:IntoIterator> 110 | From for Utf16CharSplitter { 111 | fn from(iterable: I) -> Self { 112 | Utf16CharSplitter { inner: iterable.into_iter(), prev_second: 0 } 113 | } 114 | } 115 | impl, I:Iterator> Utf16CharSplitter { 116 | /// Extracts the source iterator. 117 | /// 118 | /// Note that `iter.into_inner().to_units()` is not a no-op: 119 | /// If the last returned unit from `next()` was a leading surrogate, 120 | /// the trailing surrogate is lost. 
121 | pub fn into_inner(self) -> I { 122 | self.inner 123 | } 124 | } 125 | impl, I:Iterator> Iterator for Utf16CharSplitter { 126 | type Item = u16; 127 | fn next(&mut self) -> Option { 128 | if self.prev_second == 0 { 129 | self.inner.next().map(|u16c| { 130 | let units = u16c.borrow().to_array(); 131 | self.prev_second = units[1]; 132 | units[0] 133 | }) 134 | } else { 135 | let prev_second = self.prev_second; 136 | self.prev_second = 0; 137 | Some(prev_second) 138 | } 139 | } 140 | fn size_hint(&self) -> (usize,Option) { 141 | // Doesn't need to handle unlikely overflows correctly because 142 | // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) 143 | let (min, max) = self.inner.size_hint(); 144 | let add = if self.prev_second == 0 {0} else {1}; 145 | (min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) )) 146 | } 147 | } 148 | 149 | 150 | 151 | /// An iterator over the codepoints in a `str` represented as `Utf16Char`. 152 | #[derive(Clone)] 153 | pub struct Utf16CharIndices<'a>{ 154 | str: &'a str, 155 | index: usize, 156 | } 157 | impl<'a> From<&'a str> for Utf16CharIndices<'a> { 158 | fn from(s: &str) -> Utf16CharIndices { 159 | Utf16CharIndices{str: s, index: 0} 160 | } 161 | } 162 | impl<'a> Utf16CharIndices<'a> { 163 | /// Extract the remainder of the source `str`. 164 | /// 165 | /// # Examples 166 | /// 167 | /// ``` 168 | /// use encode_unicode::{StrExt, Utf16Char}; 169 | /// let mut iter = "abc".utf16char_indices(); 170 | /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c')))); 171 | /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a')))); 172 | /// assert_eq!(iter.as_str(), "b"); 173 | /// ``` 174 | pub fn as_str(&self) -> &'a str { 175 | &self.str[self.index..] 
176 | } 177 | } 178 | impl<'a> Iterator for Utf16CharIndices<'a> { 179 | type Item = (usize,Utf16Char); 180 | fn next(&mut self) -> Option<(usize,Utf16Char)> { 181 | match Utf16Char::from_str_start(&self.str[self.index..]) { 182 | Ok((u16c, bytes)) => { 183 | let item = (self.index, u16c); 184 | self.index += bytes; 185 | Some(item) 186 | }, 187 | Err(EmptyStrError) => None 188 | } 189 | } 190 | fn size_hint(&self) -> (usize,Option) { 191 | let len = self.str.len() - self.index; 192 | // For len+3 to overflow, the slice must fill all but two bytes of 193 | // addressable memory, and size_hint() doesn't need to be correct. 194 | (len.wrapping_add(3)/4, Some(len)) 195 | } 196 | } 197 | impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> { 198 | fn next_back(&mut self) -> Option<(usize,Utf16Char)> { 199 | if self.index < self.str.len() { 200 | let rev = self.str.bytes().rev(); 201 | let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); 202 | let starts = self.str.len() - len; 203 | let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap(); 204 | self.str = &self.str[..starts]; 205 | Some((starts, u16c)) 206 | } else { 207 | None 208 | } 209 | } 210 | } 211 | impl<'a> fmt::Debug for Utf16CharIndices<'a> { 212 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 213 | fmtr.debug_tuple("Utf16CharIndices") 214 | .field(&self.index) 215 | .field(&self.as_str()) 216 | .finish() 217 | } 218 | } 219 | 220 | 221 | /// An iterator over the codepoints in a `str` represented as `Utf16Char`. 222 | #[derive(Clone)] 223 | pub struct Utf16Chars<'a>(Utf16CharIndices<'a>); 224 | impl<'a> From<&'a str> for Utf16Chars<'a> { 225 | fn from(s: &str) -> Utf16Chars { 226 | Utf16Chars(Utf16CharIndices::from(s)) 227 | } 228 | } 229 | impl<'a> Utf16Chars<'a> { 230 | /// Extract the remainder of the source `str`. 
231 | /// 232 | /// # Examples 233 | /// 234 | /// ``` 235 | /// use encode_unicode::{StrExt, Utf16Char}; 236 | /// let mut iter = "abc".utf16chars(); 237 | /// assert_eq!(iter.next(), Some(Utf16Char::from('a'))); 238 | /// assert_eq!(iter.next_back(), Some(Utf16Char::from('c'))); 239 | /// assert_eq!(iter.as_str(), "b"); 240 | /// ``` 241 | pub fn as_str(&self) -> &'a str { 242 | self.0.as_str() 243 | } 244 | } 245 | impl<'a> Iterator for Utf16Chars<'a> { 246 | type Item = Utf16Char; 247 | fn next(&mut self) -> Option { 248 | self.0.next().map(|(_,u16c)| u16c ) 249 | } 250 | fn size_hint(&self) -> (usize,Option) { 251 | self.0.size_hint() 252 | } 253 | } 254 | impl<'a> DoubleEndedIterator for Utf16Chars<'a> { 255 | fn next_back(&mut self) -> Option { 256 | self.0.next_back().map(|(_,u16c)| u16c ) 257 | } 258 | } 259 | impl<'a> fmt::Debug for Utf16Chars<'a> { 260 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 261 | fmtr.debug_tuple("Utf16Chars") 262 | .field(&self.as_str()) 263 | .finish() 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /tests/errs.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2016-2022 Torbjørn Birch Moltu 2 | * Copyright 2018 Aljoscha Meyer 3 | * 4 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 7 | * copied, modified, or distributed except according to those terms. 8 | */ 9 | 10 | //! Test that methods gives the correct error. 11 | //! Some also test a bit more because it's easy. 
12 | 13 | extern crate core; 14 | use core::char; 15 | extern crate encode_unicode; 16 | use encode_unicode::*; 17 | use encode_unicode::error::*; 18 | use encode_unicode::error::CodepointError::*; 19 | use encode_unicode::error::Utf8ErrorKind::*; 20 | 21 | 22 | #[test] fn from_u32() { 23 | for c in 0xd800..0xe000 { 24 | assert_eq!(char::from_u32_detailed(c), Err(Utf16Reserved)); 25 | } 26 | let mut c = 0x11_00_00; 27 | loop { 28 | assert_eq!(char::from_u32_detailed(c), Err(TooHigh)); 29 | // Don't test every value. (Range.step_by() is unstable) 30 | match c.checked_add(0x10_11_11) { 31 | Some(next) => c = next, 32 | None => break, 33 | } 34 | } 35 | } 36 | 37 | fn kind(result: Result) -> Result { 38 | result.map_err(|e| e.kind() ) 39 | } 40 | 41 | 42 | #[test] fn utf8_extra_bytes() { 43 | for c in 0..256 { 44 | assert_eq!( kind((c as u8).extra_utf8_bytes()), match c { 45 | 0b_1000_0000..=0b_1011_1111 => Err(UnexpectedContinuationByte), 46 | 0b_1100_0000..=0b_1100_0001 => Err(NonUtf8Byte), 47 | 0b_1111_0101..=0b_1111_0111 => Err(NonUtf8Byte), 48 | 0b_1111_1000..=0b_1111_1111 => Err(NonUtf8Byte), 49 | 0b_0000_0000..=0b_0111_1111 => Ok(0), 50 | 0b_1100_0010..=0b_1101_1111 => Ok(1), 51 | 0b_1110_0000..=0b_1110_1111 => Ok(2), 52 | 0b_1111_0000..=0b_1111_0100 => Ok(3), 53 | _ => unreachable!(), 54 | }); 55 | } 56 | 57 | for c in 0..256 { 58 | assert_eq!((c as u8).extra_utf8_bytes_unchecked(), match c { 59 | 0b_0000_0000..=0b_0111_1111 => 0, 60 | 0b_1100_0000..=0b_1101_1111 => 1, 61 | 0b_1110_0000..=0b_1110_1111 => 2, 62 | 0b_1111_0000..=0b_1111_0111 => 3, 63 | 0b_1000_0000..=0b_1011_1111 => 0, 64 | 0b_1111_1111 => 7, 65 | _ => continue, 66 | }); 67 | } 68 | } 69 | 70 | #[test] 71 | #[cfg_attr(miri, ignore)] 72 | fn utf16_extra_unit() { 73 | for c in 0..0x1_00_00 { 74 | assert_eq!( (c as u16).utf16_needs_extra_unit(), match c { 75 | 0b_0000_0000_0000_0000..=0b_1101_0111_1111_1111 => Ok(false), 76 | 0b_1101_1000_0000_0000..=0b_1101_1011_1111_1111 => Ok(true), 77 | 
0b_1101_1100_0000_0000..=0b_1101_1111_1111_1111 => Err(Utf16FirstUnitError), 78 | 0b_1110_0000_0000_0000..=0b_1111_1111_1111_1111 => Ok(false), 79 | _ => unreachable!(), 80 | }); 81 | } 82 | } 83 | 84 | 85 | #[test] 86 | #[cfg_attr(miri, ignore)] 87 | fn from_utf16_tuple() { 88 | use encode_unicode::error::Utf16TupleError::*; 89 | for u in 0xdc00..0xe000 { 90 | let close = if u%3==0 {u-100} else {u+100}; 91 | let doesnt_matter = if u%2==0 {Some(close)} else {None}; 92 | assert_eq!(char::from_utf16_tuple((u,doesnt_matter)), Err(FirstIsTrailingSurrogate)); 93 | } 94 | for u in (0..0xd800).chain(0xe000..0x10000) { 95 | assert_eq!( 96 | char::from_utf16_tuple((u as u16, Some((0x100+u) as u16))), 97 | Err(SuperfluousSecond) 98 | ); 99 | } 100 | for u in 0xd800..0xdc00 { 101 | assert_eq!(char::from_utf16_tuple((u,None)), Err(MissingSecond)); 102 | assert_eq!(char::from_utf16_tuple((u,Some(u - 0x2ff))), Err(SecondIsNotTrailingSurrogate)); 103 | } 104 | } 105 | 106 | #[test] fn from_utf16_slice_start() { 107 | use encode_unicode::error::Utf16SliceError::*; 108 | assert_eq!(char::from_utf16_slice_start(&[]), Err(EmptySlice)); 109 | let mut buf = [0; 6]; 110 | for u in 0xd800..0xdc00 { 111 | buf[0] = u; 112 | assert_eq!(char::from_utf16_slice_start(&buf[..1]), Err(MissingSecond)); 113 | buf[1] = u; 114 | let pass = 2 + (u as usize % (buf.len()-2)); 115 | assert_eq!(char::from_utf16_slice_start(&buf[..pass]), Err(SecondIsNotTrailingSurrogate)); 116 | } 117 | for u in 0xdc00..0xe000 { 118 | buf[0] = u; 119 | let close = if u%3==0 {u-100} else {u+100}; 120 | let pass = 1 + (u as usize % (buf.len()-1)); 121 | buf[pass] = close; 122 | assert_eq!(char::from_utf16_slice_start(&buf[..pass]), Err(FirstIsTrailingSurrogate)); 123 | } 124 | } 125 | 126 | #[test] fn utf8_overlong() { 127 | let overlongs = [ 128 | [0xf0,0x8f], [0xf0,0x87], [0xf0,0x80], // 4-byte 129 | [0xe0,0x9f], [0xe0,0x8f], [0xe0,0x80], // 3-byte 130 | ]; 131 | for o in overlongs.iter() { 132 | for &last in &[0x80, 
0xbf] { 133 | let arr = [o[0], o[1], last, last]; 134 | assert_eq!(kind(char::from_utf8_slice_start(&arr)), Err(OverlongEncoding)); 135 | assert_eq!(kind(char::from_utf8_array(arr)), Err(OverlongEncoding)); 136 | assert_eq!(kind(Utf8Char::from_slice_start(&arr)), Err(OverlongEncoding)); 137 | assert_eq!(kind(Utf8Char::from_array(arr)), Err(OverlongEncoding)); 138 | } 139 | } 140 | 141 | let non_utf8 = [ 142 | [0xc1,0xbf], [0xc1,0x92], [0xc1,0x80], // 2-byte 143 | [0xc0,0xbf], [0xc0,0x9f], [0xc0,0x80], // 2-byte 144 | ]; 145 | for non in non_utf8.iter() { 146 | for &last in &[0x80, 0xbf] { 147 | let arr = [non[0], non[1], last, last]; 148 | assert_eq!(kind(char::from_utf8_slice_start(&arr)), Err(NonUtf8Byte)); 149 | assert_eq!(kind(char::from_utf8_array(arr)), Err(NonUtf8Byte)); 150 | assert_eq!(kind(Utf8Char::from_slice_start(&arr)), Err(NonUtf8Byte)); 151 | assert_eq!(kind(Utf8Char::from_array(arr)), Err(NonUtf8Byte)); 152 | } 153 | } 154 | } 155 | 156 | #[test] fn from_str_start() { 157 | assert_eq!(Utf8Char::from_str_start(""), Err(EmptyStrError)); 158 | assert_eq!(Utf16Char::from_str_start(""), Err(EmptyStrError)); 159 | } 160 | 161 | #[test] fn utf8_codepoint_is_too_high() { 162 | assert_eq!(kind(Utf8Char::from_array([0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint)); 163 | assert_eq!(kind(char::from_utf8_array([0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint)); 164 | assert_eq!(kind(Utf8Char::from_slice_start(&[0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint)); 165 | assert_eq!(kind(char::from_utf8_slice_start(&[0xf4, 0x90, 0x80, 0x80])), Err(TooHighCodepoint)); 166 | 167 | assert_eq!(kind(Utf8Char::from_array([0xf4, 0xa4, 0xb0, 0x9f])), Err(TooHighCodepoint)); 168 | assert_eq!(kind(char::from_utf8_array([0xf4, 0xa4, 0xb0, 0x9f])), Err(TooHighCodepoint)); 169 | assert_eq!(kind(Utf8Char::from_slice_start(&[0xf4, 0xa4, 0xb0, 0x9f])), Err(TooHighCodepoint)); 170 | assert_eq!(kind(char::from_utf8_slice_start(&[0xf4, 0xa4, 0xb8, 0x9f])), Err(TooHighCodepoint)); 
171 | 172 | assert_eq!(kind(Utf8Char::from_array([0xf5, 0x88, 0x99, 0xaa])), Err(NonUtf8Byte)); 173 | assert_eq!(kind(char::from_utf8_array([0xf5, 0xaa, 0xbb, 0x88])), Err(NonUtf8Byte)); 174 | assert_eq!(kind(Utf8Char::from_slice_start(&[0xf5, 0x99, 0xaa, 0xbb])), Err(NonUtf8Byte)); 175 | assert_eq!(kind(char::from_utf8_slice_start(&[0xf5, 0xbb, 0x88, 0x99])), Err(NonUtf8Byte)); 176 | } 177 | 178 | #[test] fn utf8_codepoint_is_utf16_reserved() { 179 | assert_eq!(kind(Utf8Char::from_array([0xed, 0xa0, 0x80, 0xff])), Err(Utf16ReservedCodepoint)); 180 | assert_eq!(kind(char::from_utf8_array([0xed, 0xa0, 0x8f, 0x00])), Err(Utf16ReservedCodepoint)); 181 | assert_eq!(kind(Utf8Char::from_slice_start(&[0xed, 0xa0, 0xbe, 0xa5])), Err(Utf16ReservedCodepoint)); 182 | assert_eq!(kind(char::from_utf8_slice_start(&[0xed, 0xa0, 0xbf])), Err(Utf16ReservedCodepoint)); 183 | assert_eq!(kind(Utf8Char::from_array([0xed, 0xbf, 0x80, 0xff])), Err(Utf16ReservedCodepoint)); 184 | assert_eq!(kind(char::from_utf8_array([0xed, 0xbf, 0x8f, 0x00])), Err(Utf16ReservedCodepoint)); 185 | assert_eq!(kind(Utf8Char::from_slice_start(&[0xed, 0xbf, 0xbe, 0xa5])), Err(Utf16ReservedCodepoint)); 186 | assert_eq!(kind(char::from_utf8_slice_start(&[0xed, 0xbf, 0xbf])), Err(Utf16ReservedCodepoint)); 187 | } 188 | 189 | #[test] fn utf8_first_is_continuation_byte() { 190 | for first in 0x80..0xc0 { 191 | let arr = [first, first<<2, first<<4, first<<6]; 192 | assert_eq!(kind(Utf8Char::from_array(arr)), Err(UnexpectedContinuationByte)); 193 | assert_eq!(kind(char::from_utf8_array(arr)), Err(UnexpectedContinuationByte)); 194 | let len = (1 + first%3) as usize; 195 | assert_eq!(kind(Utf8Char::from_slice_start(&arr[..len])), Err(UnexpectedContinuationByte)); 196 | assert_eq!(kind(char::from_utf8_slice_start(&arr[..len])), Err(UnexpectedContinuationByte)); 197 | } 198 | } 199 | 200 | #[test] fn utf8_too_long() { 201 | for first in 0xf8..0x100 { 202 | let arr = [first as u8, 0x88, 0x80, 0x80]; 203 | 
assert_eq!(kind(Utf8Char::from_array(arr)), Err(NonUtf8Byte)); 204 | assert_eq!(kind(char::from_utf8_array(arr)), Err(NonUtf8Byte)); 205 | let arr = [first as u8, 0x88, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80]; 206 | let slice = &arr[..if first&1 == 0 {1} else {8}]; 207 | assert_eq!(kind(Utf8Char::from_slice_start(slice)), Err(NonUtf8Byte)); 208 | assert_eq!(kind(char::from_utf8_slice_start(slice)), Err(NonUtf8Byte)); 209 | } 210 | } 211 | 212 | #[test] fn utf8_not_continuation_byte() { 213 | for first in 0xc2..0xf4 { 214 | let mut arr = [first, 0x90, 0xa0, 0xb0]; 215 | let extra = first.extra_utf8_bytes().unwrap(); 216 | for corrupt in (1..extra).rev() { 217 | for &bad in &[0x00, 0x3f, 0x40, 0x7f, 0xc0, 0xff] { 218 | arr[corrupt] = bad; 219 | assert_eq!(kind(Utf8Char::from_array(arr)), Err(InterruptedSequence), "{:?}", arr); 220 | assert_eq!(kind(char::from_utf8_array(arr)), Err(InterruptedSequence)); 221 | let slice = if first&1 == 0 {&arr[..1+extra]} else {&arr}; 222 | assert_eq!(kind(Utf8Char::from_slice_start(slice)), Err(InterruptedSequence), "{:?}", slice); 223 | assert_eq!(kind(char::from_utf8_slice_start(slice)), Err(InterruptedSequence)); 224 | } 225 | } 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /src/utf8_iterators.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-2020 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | use crate::utf8_char::Utf8Char; 10 | use crate::errors::EmptyStrError; 11 | extern crate core; 12 | use core::{u32, u64}; 13 | use core::ops::Not; 14 | use core::fmt; 15 | use core::borrow::Borrow; 16 | #[cfg(feature="std")] 17 | use std::io::{Read, Error as ioError}; 18 | 19 | 20 | 21 | /// Read or iterate over the bytes of the UTF-8 representation of a codepoint. 22 | #[derive(Clone)] 23 | pub struct Utf8Iterator (u32); 24 | 25 | impl From for Utf8Iterator { 26 | fn from(uc: Utf8Char) -> Self { 27 | let used = u32::from_le_bytes(uc.to_array().0); 28 | // uses u64 because shifting an u32 by 32 bits is a no-op. 
29 | let unused_set = (u64::MAX << (uc.len() as u64*8)) as u32; 30 | Utf8Iterator(used | unused_set) 31 | } 32 | } 33 | impl From for Utf8Iterator { 34 | fn from(c: char) -> Self { 35 | Self::from(Utf8Char::from(c)) 36 | } 37 | } 38 | impl Iterator for Utf8Iterator { 39 | type Item=u8; 40 | fn next(&mut self) -> Option { 41 | let next = self.0 as u8; 42 | if next == 0xff { 43 | None 44 | } else { 45 | self.0 = (self.0 >> 8) | 0xff_00_00_00; 46 | Some(next) 47 | } 48 | } 49 | fn size_hint(&self) -> (usize, Option) { 50 | (self.len(), Some(self.len())) 51 | } 52 | } 53 | impl ExactSizeIterator for Utf8Iterator { 54 | fn len(&self) -> usize {// not straightforward, but possible 55 | let unused_bytes = self.0.not().leading_zeros() / 8; 56 | 4 - unused_bytes as usize 57 | } 58 | } 59 | #[cfg(feature="std")] 60 | impl Read for Utf8Iterator { 61 | /// Always returns Ok 62 | fn read(&mut self, buf: &mut[u8]) -> Result { 63 | // Cannot call self.next() until I know I can write the result. 64 | for (i, dst) in buf.iter_mut().enumerate() { 65 | match self.next() { 66 | Some(b) => *dst = b, 67 | None => return Ok(i), 68 | } 69 | } 70 | Ok(buf.len()) 71 | } 72 | } 73 | impl fmt::Debug for Utf8Iterator { 74 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 75 | let mut content = [0; 4]; 76 | let mut i = 0; 77 | for b in self.clone() { 78 | content[i] = b; 79 | i += 1; 80 | } 81 | write!(fmtr, "{:?}", &content[..i]) 82 | } 83 | } 84 | 85 | 86 | 87 | /// Converts an iterator of `Utf8Char` (or `&Utf8Char`) 88 | /// to an iterator of `u8`s. 89 | /// 90 | /// Is equivalent to calling `.flatten()` or `.flat_map()` on the original iterator, 91 | /// but the returned iterator is ~40% faster. 92 | /// 93 | /// The iterator also implements `Read` (if the `std` feature isn't disabled). 94 | /// Reading will never produce an error, and calls to `.read()` and `.next()` 95 | /// can be mixed. 
96 | /// 97 | /// The exact number of bytes cannot be known in advance, but `size_hint()` 98 | /// gives the possible range. 99 | /// (min: all remaining characters are ASCII, max: all require four bytes) 100 | /// 101 | /// # Examples 102 | /// 103 | /// From iterator of values: 104 | /// 105 | /// ``` 106 | /// use encode_unicode::{IterExt, CharExt}; 107 | /// 108 | /// let iterator = "foo".chars().map(|c| c.to_utf8() ); 109 | /// let mut bytes = [0; 4]; 110 | /// iterator.to_bytes().zip(&mut bytes).for_each(|(b,dst)| *dst = b ); 111 | /// assert_eq!(&bytes, b"foo\0"); 112 | /// ``` 113 | /// 114 | /// From iterator of references: 115 | /// 116 | #[cfg_attr(feature="std", doc=" ```")] 117 | #[cfg_attr(not(feature="std"), doc=" ```no_compile")] 118 | /// use encode_unicode::{IterExt, CharExt, Utf8Char}; 119 | /// 120 | /// let chars: Vec = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect(); 121 | /// let bytes: Vec = chars.iter().to_bytes().collect(); 122 | /// let flat_map: Vec = chars.iter().cloned().flatten().collect(); 123 | /// assert_eq!(bytes, flat_map); 124 | /// ``` 125 | /// 126 | /// `Read`ing from it: 127 | /// 128 | #[cfg_attr(feature="std", doc=" ```")] 129 | #[cfg_attr(not(feature="std"), doc=" ```no_compile")] 130 | /// use encode_unicode::{IterExt, CharExt}; 131 | /// use std::io::Read; 132 | /// 133 | /// let s = "Ååh‽"; 134 | /// assert_eq!(s.len(), 8); 135 | /// let mut buf = [b'E'; 9]; 136 | /// let mut reader = s.chars().map(|c| c.to_utf8() ).to_bytes(); 137 | /// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8); 138 | /// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0); 139 | /// assert_eq!(&buf[..8], s.as_bytes()); 140 | /// assert_eq!(buf[8], b'E'); 141 | /// ``` 142 | #[derive(Clone)] 143 | pub struct Utf8CharSplitter, I:Iterator> { 144 | inner: I, 145 | prev: u32, 146 | } 147 | impl, I:IntoIterator> 148 | From for Utf8CharSplitter { 149 | fn from(iterable: I) -> Self { 150 | Utf8CharSplitter { inner: iterable.into_iter(), prev: 0 } 
151 | } 152 | } 153 | impl, I:Iterator> Utf8CharSplitter { 154 | /// Extracts the source iterator. 155 | /// 156 | /// Note that `iter.into_inner().to_bytes()` is not a no-op: 157 | /// If the last returned byte from `next()` was not an ASCII character, 158 | /// the remaining bytes of that codepoint is lost. 159 | pub fn into_inner(self) -> I { 160 | self.inner 161 | } 162 | } 163 | impl, I:Iterator> Iterator for Utf8CharSplitter { 164 | type Item = u8; 165 | fn next(&mut self) -> Option { 166 | if self.prev == 0 { 167 | self.inner.next().map(|u8c| { 168 | let array = u8c.borrow().to_array().0; 169 | self.prev = u32::from_le_bytes(array) >> 8; 170 | array[0] 171 | }) 172 | } else { 173 | let next = self.prev as u8; 174 | self.prev >>= 8; 175 | Some(next) 176 | } 177 | } 178 | fn size_hint(&self) -> (usize,Option) { 179 | // Doesn't need to handle unlikely overflows correctly because 180 | // size_hint() cannot be relied upon anyway. (the trait isn't unsafe) 181 | let (min, max) = self.inner.size_hint(); 182 | let add = 4 - (self.prev.leading_zeros() / 8) as usize; 183 | (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) )) 184 | } 185 | } 186 | #[cfg(feature="std")] 187 | impl, I:Iterator> Read for Utf8CharSplitter { 188 | /// Always returns `Ok` 189 | fn read(&mut self, buf: &mut[u8]) -> Result { 190 | let mut i = 0; 191 | // write remaining bytes of previous codepoint 192 | while self.prev != 0 && i < buf.len() { 193 | buf[i] = self.prev as u8; 194 | self.prev >>= 8; 195 | i += 1; 196 | } 197 | // write whole characters 198 | while i < buf.len() { 199 | let bytes = match self.inner.next() { 200 | Some(u8c) => u8c.borrow().to_array().0, 201 | None => break 202 | }; 203 | buf[i] = bytes[0]; 204 | i += 1; 205 | if bytes[1] != 0 { 206 | let len = bytes[0].not().leading_zeros() as usize; 207 | let mut written = 1; 208 | while written < len { 209 | if i < buf.len() { 210 | buf[i] = bytes[written]; 211 | i += 1; 212 | written += 1; 213 | } else 
{ 214 | let bytes_as_u32 = u32::from_le_bytes(bytes); 215 | self.prev = bytes_as_u32 >> (8*written); 216 | return Ok(i); 217 | } 218 | } 219 | } 220 | } 221 | Ok(i) 222 | } 223 | } 224 | 225 | 226 | 227 | /// An iterator over the `Utf8Char` of a string slice, and their positions. 228 | /// 229 | /// This struct is created by the `utf8char_indices()` method from [`StrExt`](../trait.StrExt.html) 230 | /// trait. See its documentation for more. 231 | #[derive(Clone)] 232 | pub struct Utf8CharIndices<'a>{ 233 | str: &'a str, 234 | index: usize, 235 | } 236 | impl<'a> From<&'a str> for Utf8CharIndices<'a> { 237 | fn from(s: &str) -> Utf8CharIndices { 238 | Utf8CharIndices{str: s, index: 0} 239 | } 240 | } 241 | impl<'a> Utf8CharIndices<'a> { 242 | /// Extract the remainder of the source `str`. 243 | /// 244 | /// # Examples 245 | /// 246 | /// ``` 247 | /// use encode_unicode::{StrExt, Utf8Char}; 248 | /// let mut iter = "abc".utf8char_indices(); 249 | /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c')))); 250 | /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a')))); 251 | /// assert_eq!(iter.as_str(), "b"); 252 | /// ``` 253 | pub fn as_str(&self) -> &'a str { 254 | &self.str[self.index..] 255 | } 256 | } 257 | impl<'a> Iterator for Utf8CharIndices<'a> { 258 | type Item = (usize,Utf8Char); 259 | fn next(&mut self) -> Option<(usize,Utf8Char)> { 260 | match Utf8Char::from_str_start(&self.str[self.index..]) { 261 | Ok((u8c, len)) => { 262 | let item = (self.index, u8c); 263 | self.index += len; 264 | Some(item) 265 | }, 266 | Err(EmptyStrError) => None 267 | } 268 | } 269 | fn size_hint(&self) -> (usize,Option) { 270 | let len = self.str.len() - self.index; 271 | // For len+3 to overflow, the slice must fill all but two bytes of 272 | // addressable memory, and size_hint() doesn't need to be correct. 
273 | (len.wrapping_add(3)/4, Some(len)) 274 | } 275 | } 276 | impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> { 277 | fn next_back(&mut self) -> Option<(usize,Utf8Char)> { 278 | // Cannot refactor out the unwrap without switching to ::from_slice() 279 | // since slicing the str panics if not on a boundary. 280 | if self.index < self.str.len() { 281 | let rev = self.str.bytes().rev(); 282 | let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count(); 283 | let starts = self.str.len() - len; 284 | let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap(); 285 | self.str = &self.str[..starts]; 286 | Some((starts, u8c)) 287 | } else { 288 | None 289 | } 290 | } 291 | } 292 | impl<'a> fmt::Debug for Utf8CharIndices<'a> { 293 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 294 | fmtr.debug_tuple("Utf8CharIndices") 295 | .field(&self.index) 296 | .field(&self.as_str()) 297 | .finish() 298 | } 299 | } 300 | 301 | 302 | /// An iterator over the codepoints in a `str` represented as `Utf8Char`. 303 | #[derive(Clone)] 304 | pub struct Utf8Chars<'a>(Utf8CharIndices<'a>); 305 | impl<'a> From<&'a str> for Utf8Chars<'a> { 306 | fn from(s: &str) -> Utf8Chars { 307 | Utf8Chars(Utf8CharIndices::from(s)) 308 | } 309 | } 310 | impl<'a> Utf8Chars<'a> { 311 | /// Extract the remainder of the source `str`. 
312 | /// 313 | /// # Examples 314 | /// 315 | /// ``` 316 | /// use encode_unicode::{StrExt, Utf8Char}; 317 | /// let mut iter = "abc".utf8chars(); 318 | /// assert_eq!(iter.next(), Some(Utf8Char::from('a'))); 319 | /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c'))); 320 | /// assert_eq!(iter.as_str(), "b"); 321 | /// ``` 322 | pub fn as_str(&self) -> &'a str { 323 | self.0.as_str() 324 | } 325 | } 326 | impl<'a> Iterator for Utf8Chars<'a> { 327 | type Item = Utf8Char; 328 | fn next(&mut self) -> Option { 329 | self.0.next().map(|(_,u8c)| u8c ) 330 | } 331 | fn size_hint(&self) -> (usize,Option) { 332 | self.0.size_hint() 333 | } 334 | } 335 | impl<'a> DoubleEndedIterator for Utf8Chars<'a> { 336 | fn next_back(&mut self) -> Option { 337 | self.0.next_back().map(|(_,u8c)| u8c ) 338 | } 339 | } 340 | impl<'a> fmt::Debug for Utf8Chars<'a> { 341 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 342 | fmtr.debug_tuple("Utf8Chars")// name this type, not the wrapped Utf8CharIndices 343 | .field(&self.as_str()) 344 | .finish() 345 | } 346 | } 347 | -------------------------------------------------------------------------------- /benches/length.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | // Run with -- --nocapture to show error messages if setup fails.
10 | // (or use ./do.sh) 11 | 12 | #![cfg(feature="std")] 13 | #![feature(test)] 14 | extern crate test; 15 | use test::{Bencher, black_box}; 16 | 17 | use std::fs; 18 | use std::path::Path; 19 | use std::io::ErrorKind; 20 | use std::thread::sleep; 21 | use std::time::Duration; 22 | use std::collections::HashMap; 23 | extern crate minreq; 24 | #[macro_use] extern crate lazy_static; 25 | extern crate encode_unicode; 26 | use encode_unicode::{CharExt, Utf8Char, U8UtfExt, Utf16Char, U16UtfExt}; 27 | 28 | // Setup; need longish strings to make benchmarks representative and 29 | // reduce overhead (might get cache misses now though) 30 | // Therefore we download a few wikipedia articles in different languages. 31 | // Downloading a fixed revision of the articles doesn't prevent the HTML from 32 | // changing due to changes in templates or rendering. 33 | 34 | fn load_wikipedia(language: &str, article: &str, english: &str, revision: usize) -> String { 35 | let cache_path = Path::new("benches").join("texts"); 36 | let cache_path = cache_path.to_str().unwrap(); 37 | let name = format!("{}_{}.html", language, english); 38 | let path = Path::new(cache_path).join(&name); 39 | let path = path.to_str().unwrap(); 40 | match fs::read_to_string(path) { 41 | Ok(content) => return content, 42 | Err(ref e) if e.kind() == ErrorKind::NotFound => {},//continue 43 | Err(ref e) if e.kind() == ErrorKind::InvalidData => { 44 | panic!("{} exists but is not UTF-8", &name); 45 | }, 46 | Err(e) => panic!("{} exists but cannot be read ({})", path, e), 47 | } 48 | let mut article_ascii = String::new(); 49 | for c in article.chars() { 50 | if c.is_ascii() { 51 | article_ascii.push(c); 52 | } else {// percent-encode every UTF-8 byte as %XX (not the code point) 53 | let encoded: String = c.to_string().bytes().map(|b| format!("%{:02X}", b) ).collect(); 54 | article_ascii.push_str(encoded.as_str()); 55 | } 56 | } 57 | let url = format!("https://{}.m.wikipedia.org/w/index.php?title={}&oldid={}", 58 | language, article_ascii, revision 59 | ); 60 | println!("Downloading {} and saving to {}", &url, path); 61
| let response = minreq::get(&url).send().unwrap_or_else(|e| { 62 | panic!("Cannot get {}: {}", url, e); 63 | }); 64 | if response.status_code != 200 { 65 | panic!("Bad URL {}: {} {}", url, response.status_code, response.reason_phrase); 66 | } 67 | let content = String::from_utf8(response.into_bytes()).unwrap_or_else(|_| { 68 | panic!("Response from {} is not UTF-8", url); 69 | }); 70 | if let Err(e) = fs::create_dir_all(cache_path) { 71 | eprintln!("Warning: failed to create directory {}: {}", cache_path, e); 72 | } else if let Err(e) = fs::write(&path, &content) { 73 | eprintln!("Warning: failed to save {}: {}", path, e); 74 | } 75 | sleep(Duration::from_secs(1)); 76 | content 77 | } 78 | const ARTICLES: &[(&str, &str, &str, usize)] = &[ 79 | ("en", "United_Kingdom", "United_Kingdom", 855522252),// 99,7% ASCII 80 | ("es", "España", "Spain", 109861222),// 1,75% 2-byte characters 81 | ("ru", "Россия", "Russia", 94607243),// 36% 2-byte characters 82 | ("zh", "中國", "China", 50868604),// 30% 3-byte characters 83 | ]; 84 | lazy_static!{ 85 | static ref STRINGS: HashMap<&'static str, String> = { 86 | let mut content = HashMap::new(); 87 | for &(language, article, english, revision) in ARTICLES { 88 | content.insert(language, load_wikipedia(language, article, english, revision)); 89 | } 90 | // make one string with only ASCII 91 | let only_ascii = content.values() 92 | .map(|v| (v, v.bytes().filter(|b| b.is_ascii() ).count()) ) 93 | .max_by_key(|&(_,len)| len ) 94 | .map(|(v,_)| v.bytes().filter(|b| b.is_ascii() ).map(|b| b as char ).collect() ) 95 | .unwrap(); 96 | content.insert("ascii", only_ascii); 97 | content 98 | }; 99 | static ref EQUAL_CHARS: HashMap<&'static str, &'static str> = { 100 | let (least, chars) = STRINGS.iter() 101 | .map(|(l,s)| (l, s.chars().count()) ) 102 | .min_by_key(|&(_,chars)| chars ) 103 | .unwrap(); 104 | println!("chars: {} (limited by {})", chars, least); 105 | STRINGS.iter().map(|(&language, string)| { 106 | let cut = 
string.char_indices() 107 | .nth(chars) 108 | .map_or(string.len(), |(i,_)| i ); 109 | let string = &string[..cut]; 110 | assert_eq!(string.chars().count(), chars); 111 | (language, string) 112 | }).collect() 113 | }; 114 | static ref EQUAL_BYTES: HashMap<&'static str, String> = { 115 | let (least, bytes) = STRINGS.iter() 116 | .map(|(l,s)| (l, s.len()) ) 117 | .min_by_key(|&(_,bytes)| bytes ) 118 | .unwrap(); 119 | println!("bytes: {} (limited by {})", bytes, least); 120 | STRINGS.iter().map(|(&language, string)| { 121 | let mut remaining = bytes; 122 | // take just so many characters that their length is exactly $bytes 123 | // slicing won't work if !string.is_char_boundary(bytes), 124 | let string = string.chars().filter(|c| { 125 | match remaining.checked_sub(c.len_utf8()) { 126 | Some(after) => {remaining = after; true}, 127 | None => false 128 | } 129 | }).collect::(); 130 | assert_eq!(string.len(), bytes); 131 | (language, string) 132 | }).collect() 133 | }; 134 | static ref EQUAL_UNITS: HashMap<&'static str, String> = { 135 | let (least, units) = STRINGS.iter() 136 | .map(|(l,s)| (l, s.chars().map(|c| c.len_utf16() ).sum::()) ) 137 | .min_by_key(|&(_,units)| units ) 138 | .unwrap(); 139 | println!("units: {} (limited by {})", units, least); 140 | STRINGS.iter().map(|(&language, string)| { 141 | let mut remaining = units; 142 | let string = string.chars().filter(|c| { 143 | match remaining.checked_sub(c.len_utf16()) { 144 | Some(after) => {remaining = after; true}, 145 | None => false 146 | } 147 | }).collect::(); 148 | assert_eq!(string.chars().map(|c| c.len_utf16() ).sum::(), units); 149 | (language, string) 150 | }).collect() 151 | }; 152 | } 153 | 154 | 155 | 156 | /////////////////////////// 157 | // benchmarks begin here // 158 | /////////////////////////// 159 | 160 | fn utf8char_len(language: &str, b: &mut Bencher) { 161 | let string = &EQUAL_BYTES[language]; 162 | let chars: Vec = string.chars().map(|c| c.to_utf8() ).collect(); 163 | let bytes =
string.len(); 164 | b.iter(|| { 165 | let sum: usize = black_box(&chars).iter().map(|u8c| u8c.len() ).sum(); 166 | assert_eq!(sum, bytes); 167 | }); 168 | } 169 | #[bench] fn utf8char_len_ascii(b: &mut Bencher) {utf8char_len("ascii", b)} 170 | #[bench] fn utf8char_len_en(b: &mut Bencher) {utf8char_len("en", b)} 171 | #[bench] fn utf8char_len_es(b: &mut Bencher) {utf8char_len("es", b)} 172 | #[bench] fn utf8char_len_ru(b: &mut Bencher) {utf8char_len("ru", b)} 173 | #[bench] fn utf8char_len_zh(b: &mut Bencher) {utf8char_len("zh", b)} 174 | 175 | fn utf8_extra_bytes_unchecked(language: &str, b: &mut Bencher) { 176 | let string = &EQUAL_CHARS[language]; 177 | let chars = string.chars().count(); 178 | let string = string.as_bytes(); 179 | b.iter(|| { 180 | let mut i = 0; 181 | let mut loops = 0; 182 | while i < string.len() { 183 | i += string[i].extra_utf8_bytes_unchecked(); 184 | i += 1; 185 | loops += 1; 186 | } 187 | assert_eq!(loops, chars); 188 | }); 189 | } 190 | #[bench] fn utf8_extra_bytes_unchecked_ascii(b: &mut Bencher) {utf8_extra_bytes_unchecked("ascii", b)} 191 | #[bench] fn utf8_extra_bytes_unchecked_en(b: &mut Bencher) {utf8_extra_bytes_unchecked("en", b)} 192 | #[bench] fn utf8_extra_bytes_unchecked_es(b: &mut Bencher) {utf8_extra_bytes_unchecked("es", b)} 193 | #[bench] fn utf8_extra_bytes_unchecked_ru(b: &mut Bencher) {utf8_extra_bytes_unchecked("ru", b)} 194 | #[bench] fn utf8_extra_bytes_unchecked_zh(b: &mut Bencher) {utf8_extra_bytes_unchecked("zh", b)} 195 | 196 | fn utf8_extra_bytes(language: &str, b: &mut Bencher) { 197 | let string = &EQUAL_CHARS[language]; 198 | let chars = string.chars().count(); 199 | let string = string.as_bytes(); 200 | b.iter(|| { 201 | let mut i = 0; 202 | let mut loops = 0; 203 | let mut errors = 0; 204 | while i < string.len() { 205 | match string[i].extra_utf8_bytes() { 206 | Ok(n) => i += n, 207 | Err(_) => errors += 1, 208 | } 209 | i += 1; 210 | loops += 1; 211 | } 212 | assert_eq!(loops, chars); 213 | 
assert_eq!(errors, 0); 214 | }); 215 | } 216 | #[bench] fn utf8_extra_bytes_ascii(b: &mut Bencher) {utf8_extra_bytes("ascii", b)} 217 | #[bench] fn utf8_extra_bytes_en(b: &mut Bencher) {utf8_extra_bytes("en", b)} 218 | #[bench] fn utf8_extra_bytes_es(b: &mut Bencher) {utf8_extra_bytes("es", b)} 219 | #[bench] fn utf8_extra_bytes_ru(b: &mut Bencher) {utf8_extra_bytes("ru", b)} 220 | #[bench] fn utf8_extra_bytes_zh(b: &mut Bencher) {utf8_extra_bytes("zh", b)} 221 | 222 | 223 | fn utf16char_len(language: &str, b: &mut Bencher) { 224 | let string = &EQUAL_UNITS[language]; 225 | let chars: Vec = string.chars().map(|c| c.to_utf16() ).collect(); 226 | let units = string.chars().map(|c| c.len_utf16() ).sum::(); 227 | b.iter(|| { 228 | let sum: usize = black_box(&chars).iter().map(|u8c| u8c.len() ).sum(); 229 | assert_eq!(sum, units); 230 | }); 231 | } 232 | #[bench] fn utf16char_len_ascii(b: &mut Bencher) {utf16char_len("ascii", b)} 233 | #[bench] fn utf16char_len_en(b: &mut Bencher) {utf16char_len("en", b)} 234 | #[bench] fn utf16char_len_es(b: &mut Bencher) {utf16char_len("es", b)} 235 | #[bench] fn utf16char_len_ru(b: &mut Bencher) {utf16char_len("ru", b)} 236 | #[bench] fn utf16char_len_zh(b: &mut Bencher) {utf16char_len("zh", b)} 237 | 238 | fn utf16_is_leading_surrogate(language: &str, b: &mut Bencher) { 239 | let string = &EQUAL_UNITS[language]; 240 | let chars = string.chars().count(); 241 | let string: Vec = string.chars().map(|c| c.to_utf16() ).collect(); 242 | b.iter(|| { 243 | let mut i = 0; 244 | let mut loops = 0; 245 | while i < string.len() { 246 | i += if string[i].is_utf16_leading_surrogate() {2} else {1}; 247 | loops += 1; 248 | } 249 | assert_eq!(loops, chars); 250 | }); 251 | } 252 | #[bench] fn utf16_is_leading_surrogate_ascii(b: &mut Bencher) {utf16_is_leading_surrogate("ascii", b)} 253 | #[bench] fn utf16_is_leading_surrogate_en(b: &mut Bencher) {utf16_is_leading_surrogate("en", b)} 254 | #[bench] fn utf16_is_leading_surrogate_es(b: &mut Bencher)
{utf16_is_leading_surrogate("es", b)} 255 | #[bench] fn utf16_is_leading_surrogate_ru(b: &mut Bencher) {utf16_is_leading_surrogate("ru", b)} 256 | #[bench] fn utf16_is_leading_surrogate_zh(b: &mut Bencher) {utf16_is_leading_surrogate("zh", b)} 257 | 258 | fn utf16_needs_extra_unit(language: &str, b: &mut Bencher) { 259 | let string = &EQUAL_UNITS[language]; 260 | let chars = string.chars().count(); 261 | let string: Vec = string.chars().map(|c| c.to_utf16() ).collect(); 262 | b.iter(|| { 263 | let mut i = 0; 264 | let mut loops = 0; 265 | let mut errors = 0; 266 | while i < string.len() { 267 | i += match string[i].utf16_needs_extra_unit() { 268 | Ok(true) => 2, 269 | Ok(false) => 1, 270 | Err(_) => {errors+=1; 1} 271 | }; 272 | loops += 1; 273 | } 274 | assert_eq!(loops, chars); 275 | assert_eq!(errors, 0); 276 | }); 277 | } 278 | #[bench] fn utf16_needs_extra_unit_ascii(b: &mut Bencher) {utf16_needs_extra_unit("ascii", b)} 279 | #[bench] fn utf16_needs_extra_unit_en(b: &mut Bencher) {utf16_needs_extra_unit("en", b)} 280 | #[bench] fn utf16_needs_extra_unit_es(b: &mut Bencher) {utf16_needs_extra_unit("es", b)} 281 | #[bench] fn utf16_needs_extra_unit_ru(b: &mut Bencher) {utf16_needs_extra_unit("ru", b)} 282 | #[bench] fn utf16_needs_extra_unit_zh(b: &mut Bencher) {utf16_needs_extra_unit("zh", b)} 283 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2016-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | 10 | //! Boilerplate-y error types. 11 | //! 12 | //! The discriminant values of the enums might change in minor releases. 13 | //! 
(to reduce the size of the `Result<>` types they are returned in) 14 | 15 | extern crate core; 16 | use core::fmt::{self,Display,Formatter}; 17 | use core::ops::RangeInclusive; 18 | #[cfg(feature="std")] 19 | use std::error::Error; 20 | 21 | 22 | macro_rules! description {($err:ty, $desc:expr) => { 23 | #[cfg(not(feature="std"))] 24 | impl $err { 25 | #[allow(missing_docs)] 26 | pub fn description(&self) -> &'static str { 27 | ($desc)(self) 28 | } 29 | } 30 | #[cfg(feature="std")] 31 | impl Error for $err { 32 | fn description(&self) -> &'static str { 33 | ($desc)(self) 34 | } 35 | } 36 | impl Display for $err { 37 | fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result { 38 | #![allow(deprecated)] // calling our own function 39 | write!(fmtr, "{}", self.description()) 40 | } 41 | } 42 | }} 43 | 44 | 45 | macro_rules! single_cause {($(#[$doc:meta])* $err:ident => $desc:expr) => { 46 | $(#[$doc])* 47 | #[derive(Clone,Copy, Debug, PartialEq,Eq)] 48 | pub struct $err; 49 | description!{$err, |_| $desc } 50 | }} 51 | 52 | 53 | single_cause!{ 54 | /// Error returned by [`U16UtfExt::utf16_needs_extra_unit()`](../trait.U16UtfExt.html#tymethod.utf16_needs_extra_unit) 55 | /// when called on an `u16` that's a trailing surrogate. 56 | Utf16FirstUnitError => "is a trailing surrogate" 57 | } 58 | 59 | single_cause!{ 60 | /// Error returned by [`Utf8Char::from_ascii()`](../struct.Utf8Char.html#method.from_ascii) 61 | /// for bytes that are not ASCII characters. 62 | NonAsciiError => "not an ASCII character" 63 | } 64 | 65 | single_cause!{ 66 | /// Error returned by [`Utf16Char::from_bmp()`](../struct.Utf16Char.html#method.from_bmp) 67 | /// for units that are not a standalone codepoint. 
68 | NonBmpError => "not a codepoint in the basic multilingual plane" 69 | } 70 | 71 | single_cause!{ 72 | /// Error returned by [`Utf8Char::from_str_start()`](../struct.Utf8Char.html#method.from_str_start) 73 | /// and [`Utf16Char::from_str_start()`](../struct.Utf16Char.html#method.from_str_start) 74 | /// when called with an empty string. 75 | EmptyStrError => "is empty" 76 | } 77 | 78 | 79 | 80 | macro_rules! simple {($(#[$tydoc:meta])* $err:ident { 81 | $( $(#[$vardoc:meta])* $variant:ident => $string:expr, )+ 82 | } ) => { 83 | $(#[$tydoc])* 84 | #[derive(Clone,Copy, Debug, PartialEq,Eq)] 85 | pub enum $err { 86 | $( $(#[$vardoc])* $variant, )* 87 | } 88 | description!{$err, |e: &$err| match *e {$($err::$variant => $string),*} } 89 | }} 90 | 91 | 92 | simple!{ 93 | /// Error returned when an `u32` is not a valid unicode codepoint. 94 | CodepointError { 95 | /// It's reserved for UTF-16 surrogate pairs. 96 | Utf16Reserved => "is reserved for UTF-16 surrogate pairs", 97 | /// It's higher than the highest codepoint (which is 0x10ffff). 98 | TooHigh => "is higher than the highest codepoint", 99 | }} 100 | use CodepointError::*; 101 | impl CodepointError { 102 | /// Get the range of values for which this error would be given. 103 | pub const fn error_range(self) -> RangeInclusive {match self { 104 | Utf16Reserved => 0xd8_00..=0xdf_ff, 105 | TooHigh => 0x00_10_ff_ff..=0xff_ff_ff_ff, 106 | }} 107 | } 108 | 109 | 110 | simple!{ 111 | /// Error returned when an `[u16; 2]` doesn't form a valid UTF-16 codepoint. 112 | Utf16ArrayError { 113 | /// The first element is a trailing / low surrogate, which is never valid. 114 | FirstIsTrailingSurrogate => "the first element is a trailing surrogate", 115 | /// The second element is needed, but is not a trailing surrogate. 116 | SecondIsNotTrailingSurrogate => "the second element is needed but is not a trailing surrogate", 117 | }} 118 | 119 | simple!{ 120 | /// Error returned when one or two `u16`s are not valid UTF-16. 
121 | /// 122 | /// They are returned in sinking precedence; 123 | /// The condition that causes the first variant to be returned is checked 124 | /// for before the condition the next variant is returned for. 125 | Utf16TupleError { 126 | /// The first unit is a trailing / low surrogate, which is never valid. 127 | FirstIsTrailingSurrogate => "the first unit is a trailing surrogate", 128 | /// The provided second unit is not necessary. 129 | SuperfluousSecond => "the second unit is superfluous", 130 | /// The first and only unit requires a second unit. 131 | MissingSecond => "the first unit requires a second unit", 132 | /// The second unit is needed and was provided, but is not a trailing surrogate. 133 | SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate", 134 | }} 135 | 136 | 137 | simple!{ 138 | /// Error returned when a slice of `u16`s doesn't start with valid UTF-16. 139 | Utf16SliceError { 140 | /// The slice is empty. 141 | EmptySlice => "the slice is empty", 142 | /// The first unit is a trailing surrogate. 143 | FirstIsTrailingSurrogate => "the first unit is a trailing surrogate", 144 | /// The first and only unit requires a second unit. 145 | MissingSecond => "the first and only unit requires a second one", 146 | /// The first unit requires a second one, but it's not a trailing surrogate. 147 | SecondIsNotTrailingSurrogate => "the required second unit is not a trailing surrogate", 148 | }} 149 | 150 | simple!{ 151 | /// Error returned by [`Utf16CharDecoder`](../iterator/struct.Utf16CharMerger.html#impl-Iterator) 152 | /// when it encounters an invalid sequence. 153 | Utf16PairError { 154 | /// A trailing surrogate was not preceeded by a leading surrogate. 155 | UnexpectedTrailingSurrogate => "a trailing surrogate was not preceeded by a leading surrogate", 156 | /// A leading surrogate was followed by an unit that was not a trailing surrogate. 
157 | UnmatchedLeadingSurrogate => "a leading surrogate was followed by an unit that was not a trailing surrogate", 158 | /// A trailing surrogate was expected when the end was reached. 159 | Incomplete => "a trailing surrogate was expected when the end was reached", 160 | }} 161 | 162 | 163 | simple!{ 164 | /// Error returned when [`Utf8Char::from_str()`](../struct.Utf8Char.html#impl-FromStr) 165 | /// or [`Utf16Char::from_str()`](../struct.Utf16Char.html#impl-FromStr) fails. 166 | FromStrError { 167 | /// `Utf8Char` and `Utf16Char` cannot store more than a single codepoint. 168 | MultipleCodepoints => "contains more than one codepoint", 169 | /// `Utf8Char` and `Utf16Char` cannot be empty. 170 | Empty => "is empty", 171 | } 172 | } 173 | 174 | 175 | 176 | /// Error returned when an invalid UTF-8 sequence is encountered. 177 | /// 178 | /// See [`Utf8ErrorKind`](enum.Utf8ErrorKind.html) for the types of errors 179 | /// that this type can be returned for. 180 | #[derive(Clone,Copy, Debug, PartialEq,Eq)] 181 | pub struct Utf8Error { 182 | pub(crate) kind: Utf8ErrorKind, 183 | } 184 | impl Utf8Error { 185 | /// Get the type of error. 186 | pub const fn kind(&self) -> Utf8ErrorKind { 187 | self.kind 188 | } 189 | 190 | #[cfg(not(feature="std"))] 191 | #[allow(missing_docs)] 192 | pub const fn description(&self) -> &'static str { 193 | utf8_error_description(self.kind) 194 | } 195 | } 196 | #[cfg(feature="std")] 197 | impl Error for Utf8Error { 198 | fn description(&self) -> &'static str { 199 | utf8_error_description(self.kind) 200 | } 201 | } 202 | impl Display for Utf8Error { 203 | fn fmt(&self, fmtr: &mut Formatter) -> fmt::Result { 204 | fmtr.write_str(utf8_error_description(self.kind)) 205 | } 206 | } 207 | 208 | /// The types of errors that can occur when decoding a UTF-8 codepoint. 209 | /// 210 | /// The variants are more technical than what an end user is likely interested 211 | /// in, but might be useful for deciding how to handle the error. 
212 | /// 213 | /// They can be grouped into three categories: 214 | /// * Will happen regularly if decoding chunked or buffered text: `TooFewBytes`. 215 | /// * Input might be binary, a different encoding or corrupted: `UnexpectedContinuationByte` 216 | /// and `InterruptedSequence`. 217 | /// (Broken UTF-8 sequence). 218 | /// * Less likely to happen accidentally and might be malicious: 219 | /// `OverlongEncoding`, `Utf16ReservedCodepoint` and `TooHighCodepoint`. 220 | /// Note that these can still be caused by certain valid latin-1 strings 221 | /// such as `"Á©"` (`b"\xC1\xA9"`). 222 | #[derive(Clone,Copy, Debug, PartialEq,Eq)] 223 | pub enum Utf8ErrorKind { 224 | /// There are too few bytes to decode the codepoint. 225 | /// 226 | /// This can happen when a slice is empty or too short, or an iterator 227 | /// returned `None` while in the middle of a codepoint. 228 | /// This error is never produced by functions accepting fixed-size 229 | /// `[u8; 4]` arrays. 230 | /// 231 | /// If decoding text coming chunked (such as in buffers passed to `Read`), 232 | /// the remaining bytes should be carried over into the next chunk or buffer. 233 | /// (including the byte this error was produced for.) 234 | TooFewBytes, 235 | /// A byte which is never used by well-formed UTF-8 was encountered. 236 | /// 237 | /// This means that the input is using a different encoding, 238 | /// is corrupted or binary. 239 | /// 240 | /// This error is returned when a byte in the following ranges 241 | /// is encountered anywhere in an UTF-8 sequence: 242 | /// 243 | /// * `192` and `193` (`0b1100_000x`): Indicates an overlong encoding 244 | /// of a single-byte, ASCII, character, and should therefore never occur. 245 | /// * `248..` (`0b1111_1xxx`): Sequences cannot be longer than 4 bytes. 246 | /// * `245..=247` (`0b1111_0101 | 0b1111_0110 | 0b1111_0111`): Indicates a too high 247 | /// codepoint. (above `\u10ffff`) 248 | NonUtf8Byte, 249 | /// The first byte is not a valid start of a codepoint. 
250 | /// 251 | /// This might happen as a result of slicing into the middle of a codepoint, 252 | /// the input not being UTF-8 encoded or being corrupted. 253 | /// Errors of this type coming right after another error should probably 254 | /// be ignored, unless returned more than three times in a row. 255 | /// 256 | /// This error is returned when the first byte has a value in the range 257 | /// `128..=191` (`0b1000_0000..=0b1011_1111`). 258 | UnexpectedContinuationByte, 259 | /// The byte at index 1..=3 should be a continuation byte, 260 | /// but doesn't fit the pattern `0b10xx_xxxx`. 261 | /// 262 | /// When the input slice or iterator has too few bytes, 263 | /// [`TooFewBytes`](#variant.TooFewBytes) is returned instead. 264 | InterruptedSequence, 265 | /// The encoding of the codepoint has so many leading zeroes that it 266 | /// could be a byte shorter. 267 | /// 268 | /// [Successfully decoding this can present a security issue](https://tools.ietf.org/html/rfc3629#section-10): 269 | /// Doing so could allow an attacker to circumvent input validation that 270 | /// only checks for ASCII characters, and input characters or strings that 271 | /// would otherwise be rejected, such as `/../`. 272 | /// 273 | /// This error is only returned for 3 and 4-byte encodings; 274 | /// `NonUtf8Byte` is returned for bytes that start longer or shorter 275 | /// overlong encodings. 276 | OverlongEncoding, 277 | /// The codepoint is reserved for UTF-16 surrogate pairs. 278 | /// 279 | /// (`Utf8Char` cannot be used to work with the 280 | /// [WTF-8](https://simonsapin.github.io/wtf-8) encoding for UCS-2 strings.) 281 | /// 282 | /// This error is returned for codepoints in the range `\ud800`..=`\udfff`. 283 | /// (which are three bytes long as UTF-8) 284 | Utf16ReservedCodepoint, 285 | /// The codepoint is higher than `\u10ffff`, which is the highest codepoint 286 | /// unicode permits. 
287 | TooHighCodepoint, 288 | } 289 | const fn utf8_error_description(kind: Utf8ErrorKind) -> &'static str { 290 | match kind { 291 | Utf8ErrorKind::TooFewBytes => "too few bytes", 292 | Utf8ErrorKind::NonUtf8Byte => "not UTF-8", 293 | Utf8ErrorKind::UnexpectedContinuationByte => "not UTF-8", 294 | Utf8ErrorKind::InterruptedSequence => "not UTF-8", 295 | Utf8ErrorKind::OverlongEncoding => "malformed input", 296 | Utf8ErrorKind::Utf16ReservedCodepoint => "malformed input", 297 | Utf8ErrorKind::TooHighCodepoint => "invalid character", 298 | } 299 | } 300 | impl PartialEq for Utf8Error { 301 | fn eq(&self, kind: &Utf8ErrorKind) -> bool { 302 | self.kind == *kind 303 | } 304 | } 305 | impl PartialEq for Utf8ErrorKind { 306 | fn eq(&self, error: &Utf8Error) -> bool { 307 | *self == error.kind 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /tests/oks.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2016-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | //! Test that every method gives the correct result for valid values. 10 | //! Except iterators, which are stateful. 
11 | 12 | #![cfg(feature="std")] 13 | #![allow( 14 | clippy::eq_op, // testing the comparison 15 | )] 16 | 17 | use std::char; 18 | use std::str::{self,FromStr}; 19 | use std::cmp::Ordering; 20 | use std::hash::{Hash,Hasher}; 21 | use std::collections::hash_map::DefaultHasher; 22 | use std::iter::FromIterator; 23 | extern crate encode_unicode; 24 | use encode_unicode::*; 25 | 26 | 27 | #[test] 28 | fn equal_defaults() { 29 | assert_eq!(Utf8Char::default().to_char(), char::default()); 30 | assert_eq!(Utf16Char::default().to_char(), char::default()); 31 | } 32 | 33 | #[test] 34 | fn same_size_as_char() { 35 | use std::mem::size_of; 36 | assert_eq!(size_of::(), size_of::()); 37 | assert_eq!(size_of::(), size_of::()); 38 | } 39 | 40 | #[test] 41 | fn utf16chars_to_string() { 42 | let s = "aå\u{10ffff}‽\u{100000}\u{fee1}"; 43 | let u16cs = s.chars().map(Utf16Char::from).collect::>(); 44 | 45 | let mut from_refs: String = u16cs.iter().collect(); 46 | assert_eq!(&from_refs, s); 47 | from_refs.extend(&u16cs); 48 | assert_eq!(&from_refs[s.len()..], s); 49 | 50 | let mut from_vals: String = u16cs.iter().cloned().collect(); 51 | assert_eq!(&from_vals, s); 52 | from_vals.extend(u16cs); 53 | assert_eq!(&from_vals[s.len()..], s); 54 | } 55 | 56 | 57 | const EDGES_AND_BETWEEN: [char;19] = [ 58 | '\u{0}',// min 59 | '\u{3b}',// middle ASCII 60 | 'A',// min ASCII uppercase 61 | 'N',// middle ASCII uppercase 62 | 'Z',// max ASCII uppercase 63 | 'a',// min ASCII lowercase 64 | 'm',// middle ASCII lowercase 65 | 'z',// max ASCII lowercase 66 | '\u{7f}',// max ASCII and 1-byte UTF-8 67 | '\u{80}',// min 2-byte UTF-8 68 | '\u{111}',// middle 69 | '\u{7ff}',// max 2-byte UTF-8 70 | '\u{800}',// min 3-byte UTF-8 71 | '\u{d7ff}',// before reserved 72 | '\u{e000}',// after reserved 73 | '\u{ffff}',// max UTF-16 single and 3-byte UTF-8 74 | '\u{10000}',// min UTF-16 surrogate and 4-byte UTF-8 75 | '\u{abcde}',// middle 76 | '\u{10ffff}',// max 77 | ]; 78 | 79 | fn eq_cmp_hash(c: char) -> 
(Utf8Char, Utf16Char) { 80 | fn hash(v: T) -> u64 { 81 | #[allow(deprecated)] 82 | let mut hasher = DefaultHasher::new(); 83 | v.hash(&mut hasher); 84 | hasher.finish() 85 | } 86 | let u8c = c.to_utf8(); 87 | assert_eq!(u8c.to_char(), c); 88 | assert_eq!(u8c, u8c); 89 | assert_eq!(hash(u8c), hash(u8c)); 90 | assert_eq!(u8c.cmp(&u8c), Ordering::Equal); 91 | assert!(u8c.eq_ignore_ascii_case(&u8c)); 92 | let u16c = c.to_utf16(); 93 | assert_eq!(u16c.to_char(), c); 94 | assert_eq!(u16c, u16c); 95 | assert_eq!(hash(u16c), hash(c)); 96 | assert_eq!(u16c.cmp(&u16c), Ordering::Equal); 97 | assert!(u16c.eq_ignore_ascii_case(&u16c)); 98 | 99 | assert_eq!(u8c, c); 100 | assert_eq!(c, u8c); 101 | assert_eq!(u16c, c); 102 | assert_eq!(c, u16c); 103 | assert_eq!(u8c, u16c); 104 | assert_eq!(u16c, u8c); 105 | assert_eq!(u8c == c as u8, c <= '\u{7F}'); 106 | assert_eq!(u16c == c as u8, c <= '\u{FF}'); 107 | assert_eq!(u16c == c as u16, c <= '\u{FFFF}'); 108 | 109 | assert_eq!(u8c.partial_cmp(&c), Some(Ordering::Equal)); 110 | assert_eq!(c.partial_cmp(&u8c), Some(Ordering::Equal)); 111 | assert_eq!(u16c.partial_cmp(&c), Some(Ordering::Equal)); 112 | assert_eq!(c.partial_cmp(&u16c), Some(Ordering::Equal)); 113 | assert_eq!(u8c.partial_cmp(&u16c), Some(Ordering::Equal)); 114 | assert_eq!(u16c.partial_cmp(&u8c), Some(Ordering::Equal)); 115 | 116 | 117 | for &other in &EDGES_AND_BETWEEN { 118 | let u8other = other.to_utf8(); 119 | assert_eq!(u8c == u8other, c == other); 120 | assert_eq!(hash(u8c)==hash(u8other), hash(c)==hash(other)); 121 | assert_eq!(u8c.cmp(&u8other), c.cmp(&other)); 122 | assert_eq!(u8c.eq_ignore_ascii_case(&u8other), c.eq_ignore_ascii_case(&other)); 123 | assert_eq!(u8c.partial_cmp(&other), c.partial_cmp(&other)); 124 | assert_eq!(c.partial_cmp(&u8other), c.partial_cmp(&other)); 125 | assert_eq!(u8other.partial_cmp(&c), other.partial_cmp(&c)); 126 | assert_eq!(other.partial_cmp(&u8c), other.partial_cmp(&c)); 127 | assert_eq!(u8c == other as u8, other as u8 <= 127 
&& c == other as u8 as char); 128 | 129 | let u16other = other.to_utf16(); 130 | assert_eq!(u16c == u16other, c == other); 131 | assert_eq!(hash(u16c)==hash(u16other), hash(c)==hash(other)); 132 | assert_eq!(u16c.cmp(&u16other), c.cmp(&other)); 133 | assert_eq!(u16c.eq_ignore_ascii_case(&u16other), c.eq_ignore_ascii_case(&other)); 134 | assert_eq!(u16c.partial_cmp(&other), c.partial_cmp(&other)); 135 | assert_eq!(c.partial_cmp(&u16other), c.partial_cmp(&other)); 136 | assert_eq!(u16other.partial_cmp(&c), other.partial_cmp(&c)); 137 | assert_eq!(other.partial_cmp(&u16c), other.partial_cmp(&c)); 138 | assert_eq!(u16c == other as u8, c == other as u8 as char); 139 | assert_eq!(u16c == other as u16, c as u32 == other as u16 as u32); 140 | 141 | assert_eq!(u8c == u16other, c == other); 142 | assert_eq!(u16c == u8other, c == other); 143 | assert_eq!(u8c.partial_cmp(&u16other), c.partial_cmp(&other)); 144 | assert_eq!(u16c.partial_cmp(&u8other), c.partial_cmp(&other)); 145 | assert_eq!(u8other.partial_cmp(&u16c), other.partial_cmp(&c)); 146 | assert_eq!(u16other.partial_cmp(&u8c), other.partial_cmp(&c)); 147 | } 148 | (u8c, u16c) 149 | } 150 | 151 | fn iterators(c: char) { 152 | let mut iter = c.iter_utf8_bytes(); 153 | let mut buf = [0; 4]; 154 | let mut iter_ref = c.encode_utf8(&mut buf[..]).as_bytes().iter(); 155 | for _ in 0..6 { 156 | assert_eq!(iter.size_hint(), iter_ref.size_hint()); 157 | assert_eq!(format!("{:?}", iter), format!("{:?}", iter_ref.as_slice())); 158 | assert_eq!(iter.next(), iter_ref.next().cloned()); 159 | } 160 | 161 | let mut iter = c.iter_utf16_units(); 162 | let mut buf = [0; 2]; 163 | let mut iter_ref = c.encode_utf16(&mut buf[..]).iter(); 164 | for _ in 0..4 { 165 | assert_eq!(iter.size_hint(), iter_ref.size_hint()); 166 | assert_eq!(format!("{:?}", iter), format!("{:?}", iter_ref.as_slice())); 167 | assert_eq!(iter.next(), iter_ref.next().cloned()); 168 | } 169 | } 170 | 171 | fn test(c: char) { 172 | assert_eq!(char::from_u32(c as u32), 
Some(c)); 173 | assert_eq!(char::from_u32_detailed(c as u32), Ok(c)); 174 | assert_eq!(unsafe{ char::from_u32_unchecked(c as u32) }, c); 175 | let (u8c, u16c) = eq_cmp_hash(c); 176 | iterators(c); 177 | assert_eq!(Utf16Char::from(u8c), u16c); 178 | assert_eq!(Utf8Char::from(u16c), u8c); 179 | let utf8_len = c.len_utf8(); 180 | let utf16_len = c.len_utf16(); 181 | let mut as_str = c.to_string(); 182 | 183 | // UTF-8 184 | let mut buf = [0; 4]; 185 | let reference = c.encode_utf8(&mut buf[..]).as_bytes(); 186 | let len = reference.len(); // short name because it is used in many places. 187 | assert_eq!(len, utf8_len); 188 | assert_eq!(reference[0].extra_utf8_bytes(), Ok(len-1)); 189 | assert_eq!(reference[0].extra_utf8_bytes_unchecked(), len-1); 190 | assert_eq!(AsRef::<[u8]>::as_ref(&u8c), reference); 191 | 192 | let (arr,arrlen) = u8c.to_array(); 193 | assert_eq!(arrlen, len); 194 | assert_eq!(Utf8Char::from_array(arr), Ok(u8c)); 195 | assert_eq!(Utf8Char::new(c), u8c); 196 | assert_eq!(c.to_utf8_array(), (arr, len)); 197 | 198 | let str_ = str::from_utf8(reference).unwrap(); 199 | let ustr = Utf8Char::from_str(str_).unwrap(); 200 | assert_eq!(ustr.to_array().0, arr);// bitwise equality 201 | assert_eq!(char::from_utf8_array(arr), Ok(c)); 202 | let mut longer = [0xff; 5]; // 0xff is never valid 203 | longer[..len].copy_from_slice(reference); 204 | assert_eq!(char::from_utf8_slice_start(reference), Ok((c,len))); 205 | assert_eq!(char::from_utf8_slice_start(&longer), Ok((c,len))); 206 | assert_eq!(Utf8Char::from_slice_start(reference), Ok((u8c,len))); 207 | assert_eq!(Utf8Char::from_slice_start(&longer), Ok((u8c,len))); 208 | for other in &mut longer[len..] 
{*other = b'?'} 209 | assert_eq!(Utf8Char::from_str(str_), Ok(u8c)); 210 | assert_eq!(Utf8Char::from_str_start(str_), Ok((u8c,len))); 211 | assert_eq!(Utf8Char::from_str_start(str::from_utf8(&longer).unwrap()), Ok((u8c,len))); 212 | unsafe { 213 | // Hopefully make bugs easier to catch by making reads into unallocated memory by filling 214 | // a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes. 215 | // I have no idea whether this works. 216 | let mut boxed = Box::new([0xffu8; 16]); 217 | let start = boxed.len()-len; // reach the end 218 | boxed[start..].copy_from_slice(reference); 219 | let slice = &boxed[start..]; 220 | assert_eq!(Utf8Char::from_slice_start_unchecked(slice), (u8c,len)); 221 | } 222 | assert_eq!(&Vec::::from_iter(Some(u8c))[..], reference); 223 | assert_eq!(&String::from_iter(Some(u8c))[..], str_); 224 | assert_eq!(format!("{:?}", u8c), format!("{:?}", c)); 225 | assert_eq!(format!("{}", u8c), format!("{}", c)); 226 | assert_eq!(u8c.is_ascii(), c.is_ascii()); 227 | assert_eq!(u8c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase()); 228 | assert_eq!(u8c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase()); 229 | 230 | // UTF-16 231 | let mut buf = [0; 2]; 232 | let reference = c.encode_utf16(&mut buf[..]); 233 | let len = reference.len(); 234 | assert_eq!(len, utf16_len); 235 | assert_eq!(reference[0].utf16_needs_extra_unit(), Ok(len==2)); 236 | assert_eq!(reference[0].is_utf16_leading_surrogate(), len==2); 237 | assert_eq!(u16c.as_ref(), reference); 238 | assert_eq!(Utf16Char::new(c), u16c); 239 | let mut longer = [0; 3]; 240 | longer[..len].copy_from_slice(reference); 241 | assert_eq!(char::from_utf16_slice_start(reference), Ok((c,len))); 242 | assert_eq!(char::from_utf16_slice_start(&longer), Ok((c,len))); 243 | assert_eq!(Utf16Char::from_slice_start(reference), Ok((u16c,len))); 244 | assert_eq!(Utf16Char::from_slice_start(&longer), Ok((u16c,len))); 245 | assert_eq!(Utf16Char::from_str(&as_str), Ok(u16c)); 246 
| as_str.push(c); 247 | assert_eq!(Utf16Char::from_str_start(&as_str), Ok((u16c,utf8_len))); 248 | unsafe { 249 | // Hopefully make bugs easier to catch by making reads into unallocated memory by filling 250 | // a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes. 251 | // I have no idea whether this works. 252 | let mut boxed = Box::new([0u16; 8]); 253 | let start = boxed.len()-len; // reach the end 254 | boxed[start..].copy_from_slice(reference); 255 | let slice = &boxed[start..]; 256 | assert_eq!(Utf16Char::from_slice_start_unchecked(slice), (u16c,len)); 257 | } 258 | let array = c.to_utf16_array(); 259 | let tuple = c.to_utf16_tuple(); 260 | assert_eq!(&array[..reference.len()], reference); 261 | assert_eq!(tuple, (reference[0],reference.get(1).cloned())); 262 | assert_eq!(char::from_utf16_array(array), Ok(c)); 263 | assert_eq!(char::from_utf16_tuple(tuple), Ok(c)); 264 | assert_eq!(c.to_utf16().to_char(), c); 265 | assert_eq!(&Vec::::from_iter(Some(u16c))[..], reference); 266 | assert_eq!(format!("{:?}", u16c), format!("{:?}", c)); 267 | assert_eq!(format!("{}", u16c), format!("{}", c)); 268 | assert_eq!(u16c.is_ascii(), c.is_ascii()); 269 | assert_eq!(u16c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase()); 270 | assert_eq!(u16c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase()); 271 | } 272 | 273 | 274 | #[test] 275 | fn edges_and_middle() { 276 | for &c in &EDGES_AND_BETWEEN { 277 | test(c); 278 | } 279 | } 280 | 281 | 282 | // Test EVERY codepoint. 283 | // By splitting into multiple tests we get multithreading for free. 284 | macro_rules! 
test_codepoint_range {($name:ident, $range:expr) => { 285 | #[test] 286 | #[ignore] 287 | fn $name() { 288 | for cp in $range { 289 | let c = char::from_u32(cp).expect("not a valid char"); 290 | test(c); 291 | } 292 | } 293 | }} 294 | test_codepoint_range!{all_0000_d800, 0x0000..0xd800} 295 | test_codepoint_range!{all_e000_10000, 0xe000..0x10000} 296 | test_codepoint_range!{all_10000_20000, 0x10000..0x20000} 297 | test_codepoint_range!{all_20000_30000, 0x20000..0x30000} 298 | test_codepoint_range!{all_30000_40000, 0x30000..0x40000} 299 | test_codepoint_range!{all_40000_50000, 0x40000..0x50000} 300 | test_codepoint_range!{all_50000_60000, 0x50000..0x60000} 301 | test_codepoint_range!{all_60000_70000, 0x60000..0x70000} 302 | test_codepoint_range!{all_70000_80000, 0x70000..0x80000} 303 | test_codepoint_range!{all_80000_90000, 0x80000..0x90000} 304 | test_codepoint_range!{all_90000_a0000, 0x90000..0xa0000} 305 | test_codepoint_range!{all_a0000_b0000, 0xa0000..0xb0000} 306 | test_codepoint_range!{all_b0000_c0000, 0xb0000..0xc0000} 307 | test_codepoint_range!{all_c0000_d0000, 0xc0000..0xd0000} 308 | test_codepoint_range!{all_d0000_e0000, 0xd0000..0xe0000} 309 | test_codepoint_range!{all_e0000_f0000, 0xe0000..0xf0000} 310 | test_codepoint_range!{all_f0000_100000, 0xf0000..0x100000} 311 | test_codepoint_range!{all_100000_110000, 0x100000..0x110000} 312 | -------------------------------------------------------------------------------- /src/decoding_iterators.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2018-2020 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 7 | */ 8 | 9 | //! Iterators that turn multiple `u8`s or `u16`s into `Utf*Char`s, but can fail. 10 | //! 11 | //! To be predictable, all errors consume one element each. 12 | //! 13 | //! 
The iterator adaptors produce neither offset nor element length to work 14 | //! well with other adaptors, 15 | //! while the slice iterators yield both to make more advanced use cases easy. 16 | 17 | use crate::errors::{Utf16FirstUnitError, Utf16PairError, Utf8Error}; 18 | use crate::errors::Utf16SliceError::*; 19 | use crate::errors::Utf16PairError::*; 20 | use crate::errors::Utf8ErrorKind::*; 21 | use crate::utf8_char::Utf8Char; 22 | use crate::utf16_char::Utf16Char; 23 | use crate::traits::U16UtfExt; 24 | extern crate core; 25 | use core::borrow::Borrow; 26 | use core::fmt::{self, Debug}; 27 | use core::iter::Chain; 28 | use core::option; 29 | 30 | 31 | /// Decodes UTF-8 characters from a byte iterator into `Utf8Char`s. 32 | /// 33 | /// See [`IterExt::to_utf8chars()`](../trait.IterExt.html#tymethod.to_utf8chars) 34 | /// for examples and error handling. 35 | #[derive(Clone, Default)] 36 | pub struct Utf8CharMerger, I:Iterator> { 37 | iter: I, 38 | /// number of bytes that were read before an error was detected 39 | after_err_leftover: u8, 40 | /// stack because it simplifies popping. 41 | after_err_stack: [u8; 3], 42 | } 43 | impl, I:Iterator, T:IntoIterator> 44 | From for Utf8CharMerger { 45 | fn from(t: T) -> Self { 46 | Utf8CharMerger { 47 | iter: t.into_iter(), 48 | after_err_leftover: 0, 49 | after_err_stack: [0; 3], 50 | } 51 | } 52 | } 53 | impl, I:Iterator> Utf8CharMerger { 54 | /// Extract the inner iterator. 55 | /// 56 | /// If the last item produced by `.next()` was an `Err`, 57 | /// up to three following bytes might be missing. 58 | /// The exact number of missing bytes for each error type should not be relied on. 
///
    /// # Examples
    ///
    /// Three bytes swallowed:
    /// ```
    /// # use encode_unicode::IterExt;
    /// let mut merger = b"\xf4\xa1\xb2FS".iter().to_utf8chars();
    /// assert!(merger.next().unwrap().is_err());
    /// let mut inner: std::slice::Iter<u8> = merger.into_inner();
    /// assert_eq!(inner.next(), Some(&b'S')); // b'\xa1', b'\xb2' and b'F' disappeared
    /// ```
    ///
    /// All bytes present:
    /// ```
    /// # use encode_unicode::IterExt;
    /// let mut merger = b"\xb0FS".iter().to_utf8chars();
    /// assert!(merger.next().unwrap().is_err());
    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
    /// ```
    ///
    /// Two bytes missing:
    /// ```
    /// # use encode_unicode::IterExt;
    /// let mut merger = b"\xe0\x80\x80FS".iter().to_utf8chars();
    /// assert!(merger.next().unwrap().is_err());
    /// assert_eq!(merger.into_inner().next(), Some(&b'F'));
    /// ```
    pub fn into_inner(self) -> I {
        self.iter
    }

    /// Stashes `bytes[1..len]` so that the following calls to `next()`
    /// examine them again before pulling anything new from the inner iterator.
    fn save(&mut self, bytes: &[u8;4], len: usize) {
        // bytes[0] has been reported as an error and is forgotten.
        // The others are pushed in reverse, so that popping the stack yields
        // them in the order they were read.
        for &after_err in bytes[1..len].iter().rev() {
            self.after_err_stack[self.after_err_leftover as usize] = after_err;
            self.after_err_leftover += 1;
        }
    }

    /// Reads `len-1` bytes from the inner iterator into `bytes[1..len]`.
    ///
    /// # Errors
    ///
    /// Returns `InterruptedSequence` if one of the bytes is not a continuation
    /// byte, and `TooFewBytes` if the inner iterator runs out.
    /// In both cases the bytes read so far are saved for later calls to `next()`.
    fn extra(&mut self, bytes: &mut[u8;4], len: usize) -> Result<(),Utf8Error> {
        // Bytes are only saved by `save()`, which is called either from here
        // or from `next()` after this function has succeeded.
        // Every byte fetched here is checked to be a continuation byte before
        // the next one is fetched, so only the last byte retrieved can be a
        // non-continuation byte.
        // That last byte is also the last one to be popped from the stack.
        //
        // Before this function is called, at least one byte has been retrieved.
        // If that byte was a continuation byte, `next()` produces an error
        // and won't call this function.
        // Therefore the stack is empty at this point, which means that
        // self.iter can be used directly and that pushing starts at index zero.
        debug_assert_eq!(self.after_err_leftover, 0, "first: {:#02x}, stack: {:?}", bytes[0], self.after_err_stack);
        for i in 1..len {
            if let Some(extra) = self.iter.next() {
                let extra = *extra.borrow();
                bytes[i] = extra;
                if extra & 0b1100_0000 != 0b1000_0000 {
                    // not a continuation byte: it might start a new sequence,
                    // so it is saved together with the ones before it.
                    self.save(bytes, i+1);
                    return Err(Utf8Error{ kind: InterruptedSequence })
                }
            } else {
                self.save(bytes, i);
                return Err(Utf8Error{ kind: TooFewBytes });
            }
        }
        Ok(())
    }
}
impl<B:Borrow<u8>, I:Iterator<Item=B>> Iterator for Utf8CharMerger<B,I> {
    type Item = Result<Utf8Char,Utf8Error>;
    /// Decodes the next codepoint, or reports an error for a single byte.
    ///
    /// After an error, the bytes that were read while detecting it
    /// (except the first one) are examined again by the following calls.
    fn next(&mut self) -> Option<Self::Item> {
        // Bytes saved after an error take precedence over the inner iterator.
        let first: u8 = if self.after_err_leftover != 0 {
            self.after_err_leftover -= 1;
            self.after_err_stack[self.after_err_leftover as usize]
        } else if let Some(next) = self.iter.next() {
            *next.borrow()
        } else {
            return None;
        };

        let mut bytes = [first, 0, 0, 0];
        let ok = match first {
            0b0000_0000..=0b0111_1111 => {/*1 and */Ok(())},
            0b1100_0010..=0b1101_1111 => {//2 and not overlong
                self.extra(&mut bytes, 2) // no extra validation required
            },
            0b1110_0000..=0b1110_1111 => {//3
                if let Err(e) = self.extra(&mut bytes, 3) {
                    Err(e)
                } else if bytes[0] == 0b1110_0000  &&  bytes[1] <= 0b10_011111 {
                    self.save(&bytes, 3);
                    Err(Utf8Error{ kind: OverlongEncoding })
                } else if bytes[0] == 0b1110_1101  &&  bytes[1] & 0b11_100000 == 0b10_100000 {
                    self.save(&bytes, 3);
                    Err(Utf8Error{ kind: Utf16ReservedCodepoint })
                } else {
                    Ok(())
                }
            },
            0b1111_0000..=0b1111_0100 => {//4
                if let Err(e) = self.extra(&mut bytes, 4) {
                    Err(e)
                } else if bytes[0] == 0b11110_000  &&  bytes[1] <= 0b10_001111 {
                    self.save(&bytes, 4);
                    Err(Utf8Error{ kind: OverlongEncoding })
                } else if bytes[0] == 0b11110_100  &&  bytes[1] > 0b10_001111 {
                    self.save(&bytes, 4);
                    Err(Utf8Error{ kind: TooHighCodepoint })
                } else {
                    Ok(())
                }
            },
            0b1000_0000..=0b1011_1111 => {// continuation byte
                Err(Utf8Error{ kind: UnexpectedContinuationByte })
            },
            0b1100_0000..=0b1100_0001 => {// 2 and overlong
                Err(Utf8Error{ kind: NonUtf8Byte })
            },
            0b1111_0101..=0b1111_0111 => {// 4 and too high codepoint
                Err(Utf8Error{ kind: NonUtf8Byte })
            },
            0b1111_1000..=0b1111_1111 => {
                Err(Utf8Error{ kind: NonUtf8Byte })
            },
        };
        // SAFETY: `ok` is only `Ok` when the match above accepted `first` and
        // the continuation bytes filled in by `extra()`; the bytes `extra()`
        // didn't write are still the zeroes `bytes` was initialized with.
        Some(ok.map(|()| unsafe { Utf8Char::from_array_unchecked(bytes) } ))
    }
    fn size_hint(&self) -> (usize,Option<usize>) {
        let (iter_min, iter_max) = self.iter.size_hint();
        // cannot be exact, so KISS
        let min = iter_min / 4; // don't bother rounding up or accounting for after_err
        // handle edge case of max > usize::MAX-3 just in case.
        // Using wrapping_add() wouldn't violate any API contract as the trait isn't unsafe.
        let max = iter_max.and_then(|max| {
            max.checked_add(self.after_err_leftover as usize)
        });
        (min, max)
    }
}
impl<B:Borrow<u8>, I:Iterator<Item=B>+Debug> Debug for Utf8CharMerger<B,I> {
    /// Shows the saved bytes in the order they will be examined,
    /// followed by the inner iterator.
    fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result {
        let leftover = self.after_err_leftover as usize;
        let mut in_order = [0u8; 3];
        // The stack is popped from the end, so reverse it for display.
        for (dst, &src) in in_order.iter_mut().zip(self.after_err_stack[..leftover].iter().rev()) {
            *dst = src;
        }
        fmtr.debug_struct("Utf8CharMerger")
            .field("buffered", &&in_order[..leftover])
            .field("inner", &self.iter)
            .finish()
    }
}


/// An [`Utf8CharMerger`](struct.Utf8CharMerger.html) that also produces
/// offsets and lengths, but can only iterate over slices.
217 | /// 218 | /// See [`SliceExt::utf8char_indices()`](../trait.SliceExt.html#tymethod.utf8char_indices) 219 | /// for examples and error handling. 220 | #[derive(Clone, Default)] 221 | pub struct Utf8CharDecoder<'a> { 222 | slice: &'a[u8], 223 | index: usize, 224 | } 225 | impl<'a> From<&'a[u8]> for Utf8CharDecoder<'a> { 226 | fn from(s: &[u8]) -> Utf8CharDecoder { 227 | Utf8CharDecoder { slice: s, index: 0 } 228 | } 229 | } 230 | impl<'a> Utf8CharDecoder<'a> { 231 | /// Extract the remainder of the source slice. 232 | /// 233 | /// # Examples 234 | /// 235 | /// Unlike `Utf8CharMerger::into_inner()`, bytes directly after an error 236 | /// are never swallowed: 237 | /// ``` 238 | /// # use encode_unicode::SliceExt; 239 | /// let mut iter = b"\xf4\xa1\xb2FS".utf8char_indices(); 240 | /// assert!(iter.next().unwrap().1.is_err()); 241 | /// assert_eq!(iter.as_slice(), b"\xa1\xb2FS"); 242 | /// ``` 243 | pub fn as_slice(&self) -> &'a[u8] { 244 | &self.slice[self.index..] 245 | } 246 | } 247 | impl<'a> Iterator for Utf8CharDecoder<'a> { 248 | type Item = (usize, Result, usize); 249 | fn next(&mut self) -> Option { 250 | let start = self.index; 251 | match Utf8Char::from_slice_start(&self.slice[self.index..]) { 252 | Ok((u8c, len)) => { 253 | self.index += len; 254 | Some((start, Ok(u8c), len)) 255 | }, 256 | Err(_) if self.slice.len() <= self.index => None, 257 | Err(e) => { 258 | self.index += 1; 259 | Some((start, Err(e), 1)) 260 | } 261 | } 262 | } 263 | #[inline] 264 | fn size_hint(&self) -> (usize,Option) { 265 | let bytes = self.slice.len() - self.index; 266 | // Cannot be exact, so KISS and don't bother rounding up. 267 | // The slice is unlikely be full of 4-byte codepoints, so buffers 268 | // allocated with the lower bound will have to be grown anyway. 
269 | (bytes/4, Some(bytes)) 270 | } 271 | } 272 | impl<'a> DoubleEndedIterator for Utf8CharDecoder<'a> { 273 | fn next_back(&mut self) -> Option { 274 | if self.index < self.slice.len() { 275 | let extras = self.slice.iter() 276 | .rev() 277 | .take_while(|&b| b & 0b1100_0000 == 0b1000_0000 ) 278 | .count(); 279 | let starts = self.slice.len() - (extras+1); 280 | match Utf8Char::from_slice_start(&self.slice[starts..]) { 281 | Ok((u8c,len)) if len == 1+extras => { 282 | self.slice = &self.slice[..starts]; 283 | Some((starts, Ok(u8c), len)) 284 | }, 285 | // This enures errors for every byte in both directions, 286 | // but means overlong and codepoint errors will be turned into 287 | // tooshort errors. 288 | Err(e) if extras == 0 => { 289 | self.slice = &self.slice[..self.slice.len()-1]; 290 | Some((self.slice.len()-1, Err(e), 1)) 291 | }, 292 | _ => { 293 | self.slice = &self.slice[..self.slice.len()-1]; 294 | Some((self.slice.len()-1, Err(Utf8Error{ kind: UnexpectedContinuationByte }), 1)) 295 | }, 296 | } 297 | } else { 298 | None 299 | } 300 | } 301 | } 302 | impl<'a> Debug for Utf8CharDecoder<'a> { 303 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 304 | write!(fmtr, "Utf8CharDecoder {{ bytes[{}..]: {:?} }}", self.index, self.as_slice()) 305 | } 306 | } 307 | 308 | 309 | 310 | /// Decodes UTF-16 characters from a `u16` iterator into `Utf16Char`s. 311 | /// 312 | /// See [`IterExt::to_utf16chars()`](../trait.IterExt.html#tymethod.to_utf16chars) 313 | /// for examples and error handling. 314 | #[derive(Clone, Default)] 315 | pub struct Utf16CharMerger, I:Iterator> { 316 | iter: I, 317 | /// Used when a trailing surrogate was expected, the u16 can be any value. 318 | prev: Option, 319 | } 320 | impl, I:Iterator, T:IntoIterator> 321 | From for Utf16CharMerger { 322 | fn from(t: T) -> Self { 323 | Utf16CharMerger { iter: t.into_iter(), prev: None } 324 | } 325 | } 326 | impl, I:Iterator> Utf16CharMerger { 327 | /// Extract the inner iterator. 
328 | /// 329 | /// If the last item produced was an `Err`, the first unit might be missing. 330 | /// 331 | /// # Examples 332 | /// 333 | /// Unit right after an error missing 334 | /// ``` 335 | /// # use encode_unicode::IterExt; 336 | /// # use encode_unicode::error::Utf16PairError; 337 | /// let mut merger = [0xd901, 'F' as u16, 'S' as u16].iter().to_utf16chars(); 338 | /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate))); 339 | /// let mut inner: std::slice::Iter = merger.into_inner(); 340 | /// assert_eq!(inner.next(), Some('S' as u16).as_ref()); // 'F' was consumed by Utf16CharMerger 341 | /// ``` 342 | /// 343 | /// Error that doesn't swallow any units 344 | /// ``` 345 | /// # use encode_unicode::IterExt; 346 | /// # use encode_unicode::error::Utf16PairError; 347 | /// let mut merger = [0xde00, 'F' as u16, 'S' as u16].iter().to_utf16chars(); 348 | /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnexpectedTrailingSurrogate))); 349 | /// let mut inner: std::slice::Iter = merger.into_inner(); 350 | /// assert_eq!(inner.next(), Some('F' as u16).as_ref()); // not consumed 351 | /// ``` 352 | pub fn into_inner(self) -> I { 353 | self.iter 354 | } 355 | /// Returns an iterator over the remaining units. 356 | /// Unlike `into_inner()` this will never drop any units. 357 | /// 358 | /// The exact type of the returned iterator should not be depended on. 
359 | /// 360 | /// # Examples 361 | /// 362 | /// ``` 363 | /// # use encode_unicode::IterExt; 364 | /// # use encode_unicode::error::Utf16PairError; 365 | /// let slice = [0xd901, 'F' as u16, 'S' as u16]; 366 | /// let mut merger = slice.iter().to_utf16chars(); 367 | /// assert_eq!(merger.next(), Some(Err(Utf16PairError::UnmatchedLeadingSurrogate))); 368 | /// let mut remaining = merger.into_remaining_units(); 369 | /// assert_eq!(remaining.next(), Some('F' as u16).as_ref()); 370 | /// ``` 371 | pub fn into_remaining_units(self) -> Chain,I> { 372 | self.prev.into_iter().chain(self.iter) 373 | } 374 | } 375 | impl, I:Iterator> Iterator for Utf16CharMerger { 376 | type Item = Result; 377 | fn next(&mut self) -> Option { 378 | let first = self.prev.take().or_else(|| self.iter.next() ); 379 | first.map(|first| unsafe { 380 | match first.borrow().utf16_needs_extra_unit() { 381 | Ok(false) => Ok(Utf16Char::from_array_unchecked([*first.borrow(), 0])), 382 | Ok(true) => match self.iter.next() { 383 | Some(second) => match second.borrow().utf16_needs_extra_unit() { 384 | Err(Utf16FirstUnitError) => Ok(Utf16Char::from_tuple_unchecked(( 385 | *first.borrow(), 386 | Some(*second.borrow()) 387 | ))), 388 | Ok(_) => { 389 | self.prev = Some(second); 390 | Err(Utf16PairError::UnmatchedLeadingSurrogate) 391 | } 392 | }, 393 | None => Err(Utf16PairError::Incomplete) 394 | }, 395 | Err(Utf16FirstUnitError) => Err(Utf16PairError::UnexpectedTrailingSurrogate), 396 | } 397 | }) 398 | } 399 | fn size_hint(&self) -> (usize,Option) { 400 | let (iter_min, iter_max) = self.iter.size_hint(); 401 | // cannot be exact, so KISS 402 | let min = iter_min / 2; // don't bother rounding up or accounting for self.prev 403 | let max = match (iter_max, &self.prev) { 404 | (Some(max), &Some(_)) => max.checked_add(1), 405 | (max, _) => max, 406 | }; 407 | (min, max) 408 | } 409 | } 410 | impl, I:Iterator+Debug> Debug for Utf16CharMerger { 411 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 
412 | fmtr.debug_struct("Utf16CharMerger") 413 | .field("buffered", &self.prev.as_ref().map(|b| *b.borrow() )) 414 | .field("inner", &self.iter) 415 | .finish() 416 | } 417 | } 418 | 419 | 420 | /// An [`Utf16CharMerger`](struct.Utf16CharMerger.html) that also produces 421 | /// offsets and lengths, but can only iterate over slices. 422 | /// 423 | /// See [`SliceExt::utf16char_indices()`](../trait.SliceExt.html#tymethod.utf16char_indices) 424 | /// for examples and error handling. 425 | #[derive(Clone, Default)] 426 | pub struct Utf16CharDecoder<'a> { 427 | slice: &'a[u16], 428 | index: usize, 429 | } 430 | impl<'a> From<&'a[u16]> for Utf16CharDecoder<'a> { 431 | fn from(s: &'a[u16]) -> Self { 432 | Utf16CharDecoder{ slice: s, index: 0 } 433 | } 434 | } 435 | impl<'a> Utf16CharDecoder<'a> { 436 | /// Extract the remainder of the source slice. 437 | /// 438 | /// # Examples 439 | /// 440 | /// Unlike `Utf16CharMerger::into_inner()`, the unit after an error is never swallowed: 441 | /// ``` 442 | /// # use encode_unicode::SliceExt; 443 | /// # use encode_unicode::error::Utf16PairError; 444 | /// let mut iter = [0xd901, 'F' as u16, 'S' as u16].utf16char_indices(); 445 | /// assert_eq!(iter.next(), Some((0, Err(Utf16PairError::UnmatchedLeadingSurrogate), 1))); 446 | /// assert_eq!(iter.as_slice(), &['F' as u16, 'S' as u16]); 447 | /// ``` 448 | pub fn as_slice(&self) -> &[u16] { 449 | &self.slice[self.index..] 
450 | } 451 | } 452 | impl<'a> Iterator for Utf16CharDecoder<'a> { 453 | type Item = (usize,Result,usize); 454 | #[inline] 455 | fn next(&mut self) -> Option { 456 | let start = self.index; 457 | match Utf16Char::from_slice_start(self.as_slice()) { 458 | Ok((u16c,len)) => { 459 | self.index += len; 460 | Some((start, Ok(u16c), len)) 461 | }, 462 | Err(EmptySlice) => None, 463 | Err(FirstIsTrailingSurrogate) => { 464 | self.index += 1; 465 | Some((start, Err(UnexpectedTrailingSurrogate), 1)) 466 | }, 467 | Err(SecondIsNotTrailingSurrogate) => { 468 | self.index += 1; 469 | Some((start, Err(UnmatchedLeadingSurrogate), 1)) 470 | }, 471 | Err(MissingSecond) => { 472 | self.index = self.slice.len(); 473 | Some((start, Err(Incomplete), 1)) 474 | } 475 | } 476 | } 477 | #[inline] 478 | fn size_hint(&self) -> (usize,Option) { 479 | let units = self.slice.len() - self.index; 480 | // Cannot be exact, so KISS and don't bother rounding up. 481 | // The slice is unlikely be full of surrogate pairs, so buffers 482 | // allocated with the lower bound will have to be grown anyway. 483 | (units/2, Some(units)) 484 | } 485 | } 486 | impl<'a> Debug for Utf16CharDecoder<'a> { 487 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 488 | write!(fmtr, "Utf16CharDecoder {{ units[{}..]: {:?} }}", self.index, self.as_slice()) 489 | } 490 | } 491 | -------------------------------------------------------------------------------- /src/utf8_char.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2016-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 
7 | */ 8 | 9 | use crate::errors::{FromStrError, EmptyStrError, NonAsciiError, Utf8Error}; 10 | use crate::utf8_iterators::Utf8Iterator; 11 | use crate::traits::{CharExt, U8UtfExt}; 12 | use crate::utf16_char::Utf16Char; 13 | extern crate core; 14 | use core::{hash, fmt, str, ptr}; 15 | use core::cmp::Ordering; 16 | use core::borrow::Borrow; 17 | use core::ops::Deref; 18 | #[cfg(feature="std")] 19 | use core::iter::FromIterator; 20 | #[cfg(feature="ascii")] 21 | extern crate ascii; 22 | #[cfg(feature="ascii")] 23 | use ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError}; 24 | 25 | 26 | // I don't think there is any good default value for char, but char does. 27 | #[derive(Default)] 28 | // char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either. 29 | // The default impl of Ord for arrays works out because longer codepoints 30 | // start with more ones, so if they're equal, the length is the same, 31 | // breaks down for values above 0x1f_ff_ff but those can only be created by unsafe code. 32 | #[derive(PartialEq,Eq, PartialOrd,Ord)] 33 | 34 | #[derive(Clone,Copy)] 35 | 36 | 37 | /// An unicode codepoint stored as UTF-8. 38 | /// 39 | /// It can be borrowed as a `str`, and has the same size as `char`. 40 | pub struct Utf8Char { 41 | bytes: [u8; 4], 42 | } 43 | 44 | 45 | ///////////////////// 46 | //conversion traits// 47 | ///////////////////// 48 | impl str::FromStr for Utf8Char { 49 | type Err = FromStrError; 50 | /// Create an `Utf8Char` from a string slice. 51 | /// The string must contain exactly one codepoint. 
52 | /// 53 | /// # Examples 54 | /// 55 | /// ``` 56 | /// use encode_unicode::error::FromStrError::*; 57 | /// use encode_unicode::Utf8Char; 58 | /// use std::str::FromStr; 59 | /// 60 | /// assert_eq!(Utf8Char::from_str("a"), Ok(Utf8Char::from('a'))); 61 | /// assert_eq!(Utf8Char::from_str("🂠"), Ok(Utf8Char::from('🂠'))); 62 | /// assert_eq!(Utf8Char::from_str(""), Err(Empty)); 63 | /// assert_eq!(Utf8Char::from_str("ab"), Err(MultipleCodepoints)); 64 | /// assert_eq!(Utf8Char::from_str("é"), Err(MultipleCodepoints));// 'e'+u301 combining mark 65 | /// ``` 66 | fn from_str(s: &str) -> Result { 67 | if s.is_empty() { 68 | Err(FromStrError::Empty) 69 | } else if s.len() != 1+s.as_bytes()[0].extra_utf8_bytes_unchecked() { 70 | Err(FromStrError::MultipleCodepoints) 71 | } else { 72 | let mut bytes = [0; 4]; 73 | bytes[..s.len()].copy_from_slice(s.as_bytes()); 74 | Ok(Utf8Char{bytes}) 75 | } 76 | } 77 | } 78 | impl From for Utf8Char { 79 | fn from(utf16: Utf16Char) -> Utf8Char { 80 | match utf16.to_tuple() { 81 | (ascii @ 0..=0x00_7f, _) => { 82 | Utf8Char{ bytes: [ascii as u8, 0, 0, 0] } 83 | }, 84 | (unit @ 0..=0x07_ff, _) => { 85 | let byte2 = 0x80 | (unit & 0x00_3f) as u8; 86 | let byte1 = 0xc0 | ((unit & 0x07_c0) >> 6) as u8; 87 | Utf8Char{ bytes: [byte1, byte2, 0, 0] } 88 | }, 89 | (unit, None) => { 90 | let byte3 = 0x80 | (unit & 0x00_3f) as u8; 91 | let byte2 = 0x80 | ((unit & 0x0f_c0) >> 6) as u8; 92 | let byte1 = 0xe0 | ((unit & 0xf0_00) >> 12) as u8; 93 | Utf8Char{ bytes: [byte1, byte2, byte3, 0] } 94 | }, 95 | (first, Some(second)) => { 96 | let first = first + (0x01_00_00u32 >> 10) as u16; 97 | let byte4 = 0x80 | (second & 0x00_3f) as u8; 98 | let byte3 = 0x80 | ((second & 0x03_c0) >> 6) as u8 99 | | (( first & 0x00_03) << 4) as u8; 100 | let byte2 = 0x80 | (( first & 0x00_fc) >> 2) as u8; 101 | let byte1 = 0xf0 | (( first & 0x07_00) >> 8) as u8; 102 | Utf8Char{ bytes: [byte1, byte2, byte3, byte4] } 103 | } 104 | } 105 | } 106 | } 107 | impl From for 
Utf8Char { 108 | fn from(c: char) -> Self { 109 | Utf8Char::new(c) 110 | } 111 | } 112 | impl From for char { 113 | fn from(uc: Utf8Char) -> char { 114 | uc.to_char() 115 | } 116 | } 117 | impl IntoIterator for Utf8Char { 118 | type Item=u8; 119 | type IntoIter=Utf8Iterator; 120 | /// Iterate over the byte values. 121 | fn into_iter(self) -> Utf8Iterator { 122 | Utf8Iterator::from(self) 123 | } 124 | } 125 | 126 | #[cfg(feature="std")] 127 | impl Extend for Vec { 128 | fn extend>(&mut self, iter: I) { 129 | let iter = iter.into_iter(); 130 | self.reserve(iter.size_hint().0); 131 | for u8c in iter { 132 | // twice as fast as self.extend_from_slice(u8c.as_bytes()); 133 | self.push(u8c.bytes[0]); 134 | for &extra in &u8c.bytes[1..] { 135 | if extra != 0 { 136 | self.push(extra); 137 | } 138 | } 139 | } 140 | } 141 | } 142 | #[cfg(feature="std")] 143 | impl<'a> Extend<&'a Utf8Char> for Vec { 144 | fn extend>(&mut self, iter: I) { 145 | self.extend(iter.into_iter().cloned()) 146 | } 147 | } 148 | #[cfg(feature="std")] 149 | impl Extend for String { 150 | fn extend>(&mut self, iter: I) { 151 | unsafe { self.as_mut_vec().extend(iter) } 152 | } 153 | } 154 | #[cfg(feature="std")] 155 | impl<'a> Extend<&'a Utf8Char> for String { 156 | fn extend>(&mut self, iter: I) { 157 | self.extend(iter.into_iter().cloned()) 158 | } 159 | } 160 | #[cfg(feature="std")] 161 | impl FromIterator for String { 162 | fn from_iter>(iter: I) -> String { 163 | let mut string = String::new(); 164 | string.extend(iter); 165 | return string; 166 | } 167 | } 168 | #[cfg(feature="std")] 169 | impl<'a> FromIterator<&'a Utf8Char> for String { 170 | fn from_iter>(iter: I) -> String { 171 | iter.into_iter().cloned().collect() 172 | } 173 | } 174 | #[cfg(feature="std")] 175 | impl FromIterator for Vec { 176 | fn from_iter>(iter: I) -> Self { 177 | iter.into_iter().collect::().into_bytes() 178 | } 179 | } 180 | #[cfg(feature="std")] 181 | impl<'a> FromIterator<&'a Utf8Char> for Vec { 182 | fn 
from_iter>(iter: I) -> Self { 183 | iter.into_iter().cloned().collect::().into_bytes() 184 | } 185 | } 186 | 187 | 188 | ///////////////// 189 | //getter traits// 190 | ///////////////// 191 | impl AsRef<[u8]> for Utf8Char { 192 | fn as_ref(&self) -> &[u8] { 193 | &self.bytes[..self.len()] 194 | } 195 | } 196 | impl AsRef for Utf8Char { 197 | fn as_ref(&self) -> &str { 198 | unsafe{ str::from_utf8_unchecked( self.as_ref() ) } 199 | } 200 | } 201 | impl Borrow<[u8]> for Utf8Char { 202 | fn borrow(&self) -> &[u8] { 203 | self.as_ref() 204 | } 205 | } 206 | impl Borrow for Utf8Char { 207 | fn borrow(&self) -> &str { 208 | self.as_ref() 209 | } 210 | } 211 | impl Deref for Utf8Char { 212 | type Target = str; 213 | fn deref(&self) -> &Self::Target { 214 | self.as_ref() 215 | } 216 | } 217 | 218 | 219 | //////////////// 220 | //ascii traits// 221 | //////////////// 222 | #[cfg(feature="ascii")] 223 | /// Requires the feature "ascii". 224 | impl From for Utf8Char { 225 | fn from(ac: AsciiChar) -> Self { 226 | Utf8Char{ bytes: [ac.as_byte(),0,0,0] } 227 | } 228 | } 229 | #[cfg(feature="ascii")] 230 | /// Requires the feature "ascii". 
231 | impl ToAsciiChar for Utf8Char { 232 | fn to_ascii_char(self) -> Result { 233 | self.bytes[0].to_ascii_char() 234 | } 235 | unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { 236 | unsafe { self.bytes[0].to_ascii_char_unchecked() } 237 | } 238 | } 239 | 240 | 241 | ///////////////////////////////////////////////////////// 242 | //Genaral traits that cannot be derived to emulate char// 243 | ///////////////////////////////////////////////////////// 244 | impl hash::Hash for Utf8Char { 245 | fn hash(&self, state: &mut H) { 246 | self.to_char().hash(state); 247 | } 248 | } 249 | impl fmt::Debug for Utf8Char { 250 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 251 | fmt::Debug::fmt(&self.to_char(), fmtr) 252 | } 253 | } 254 | impl fmt::Display for Utf8Char { 255 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 256 | fmtr.write_str(self.as_str()) 257 | } 258 | } 259 | 260 | 261 | //////////////////////////////// 262 | //Comparisons with other types// 263 | //////////////////////////////// 264 | impl PartialEq for Utf8Char { 265 | fn eq(&self, u32c: &char) -> bool { 266 | *self == Utf8Char::from(*u32c) 267 | } 268 | } 269 | impl PartialEq for char { 270 | fn eq(&self, u8c: &Utf8Char) -> bool { 271 | Utf8Char::from(*self) == *u8c 272 | } 273 | } 274 | impl PartialOrd for Utf8Char { 275 | fn partial_cmp(&self, u32c: &char) -> Option { 276 | self.partial_cmp(&Self::from(*u32c)) 277 | } 278 | } 279 | impl PartialOrd for char { 280 | fn partial_cmp(&self, u8c: &Utf8Char) -> Option { 281 | Utf8Char::from(*self).partial_cmp(u8c) 282 | } 283 | } 284 | 285 | impl PartialEq for Utf8Char { 286 | fn eq(&self, u16c: &Utf16Char) -> bool { 287 | *self == Self::from(*u16c) 288 | } 289 | } 290 | impl PartialOrd for Utf8Char { 291 | fn partial_cmp(&self, u16c: &Utf16Char) -> Option { 292 | self.partial_cmp(&Self::from(*u16c)) 293 | } 294 | } 295 | // The other direction is implemented in utf16_char.rs 296 | 297 | /// Only considers the byte equal if both 
it and the `Utf8Char` represents ASCII characters. 298 | /// 299 | /// There is no impl in the opposite direction, as this should only be used to 300 | /// compare `Utf8Char`s against constants. 301 | /// 302 | /// # Examples 303 | /// 304 | /// ``` 305 | /// # use encode_unicode::Utf8Char; 306 | /// assert!(Utf8Char::from('8') == b'8'); 307 | /// assert!(Utf8Char::from_array([0xf1,0x80,0x80,0x80]).unwrap() != 0xf1); 308 | /// assert!(Utf8Char::from('\u{ff}') != 0xff); 309 | /// assert!(Utf8Char::from('\u{80}') != 0x80); 310 | /// ``` 311 | impl PartialEq for Utf8Char { 312 | fn eq(&self, byte: &u8) -> bool { 313 | self.bytes[0] == *byte && self.bytes[1] == 0 314 | } 315 | } 316 | #[cfg(feature = "ascii")] 317 | /// `Utf8Char`s that are not ASCII never compare equal. 318 | impl PartialEq for Utf8Char { 319 | #[inline] 320 | fn eq(&self, ascii: &AsciiChar) -> bool { 321 | self.bytes[0] == *ascii as u8 322 | } 323 | } 324 | #[cfg(feature = "ascii")] 325 | /// `Utf8Char`s that are not ASCII never compare equal. 326 | impl PartialEq for AsciiChar { 327 | #[inline] 328 | fn eq(&self, u8c: &Utf8Char) -> bool { 329 | u8c == self 330 | } 331 | } 332 | #[cfg(feature = "ascii")] 333 | /// `Utf8Char`s that are not ASCII always compare greater. 334 | impl PartialOrd for Utf8Char { 335 | #[inline] 336 | fn partial_cmp(&self, ascii: &AsciiChar) -> Option { 337 | self.bytes[0].partial_cmp(ascii) 338 | } 339 | } 340 | #[cfg(feature = "ascii")] 341 | /// `Utf8Char`s that are not ASCII always compare greater. 342 | impl PartialOrd for AsciiChar { 343 | #[inline] 344 | fn partial_cmp(&self, u8c: &Utf8Char) -> Option { 345 | self.partial_cmp(&u8c.bytes[0]) 346 | } 347 | } 348 | 349 | 350 | /////////////////////////////////////////////////////// 351 | //pub impls that should be together for nicer rustdoc// 352 | /////////////////////////////////////////////////////// 353 | impl Utf8Char { 354 | /// A `const fn` alternative to the trait-based `Utf8Char::from(char)`. 
355 | /// 356 | /// # Example 357 | /// 358 | /// ``` 359 | /// # use encode_unicode::Utf8Char; 360 | /// const REPLACEMENT_CHARACTER: Utf8Char = Utf8Char::new('\u{fffd}'); 361 | /// ``` 362 | pub const fn new(c: char) -> Self { 363 | if c.is_ascii() { 364 | Utf8Char{bytes: [c as u8, 0, 0, 0]} 365 | } else { 366 | // How many extra UTF-8 bytes that are needed to represent an 367 | // UTF-32 codepoint with a number of bits. 368 | // Stored as a bit-packed array using two bits per value. 369 | // 0..=7 bits = no extra bytes 370 | // +4 = 8..=11 bits = one xtra byte (5+6 bits) 371 | // +5 = 12..=16 bits = two extra bytes (4+6+6 bits) 372 | // +5 = 17..=21 bits = three extra bytes (3+6+6+6 bits) 373 | const EXTRA_BYTES: u64 = 0b11_11_11_11_11__10_10_10_10_10__01_01_01_01__00_00_00_00_00_00_00__00; 374 | let bits_used = 32 - (c as u32).leading_zeros(); 375 | let len = 1 + ((EXTRA_BYTES >> (bits_used*2)) & 0b11); 376 | // copied from CharExt::to_utf8_array() 377 | let mut c = c as u32; 378 | let mut parts = 0;// convert to 6-bit bytes 379 | parts |= c & 0x3f; c>>=6; 380 | parts<<=8; parts |= c & 0x3f; c>>=6; 381 | parts<<=8; parts |= c & 0x3f; c>>=6; 382 | parts<<=8; parts |= c & 0x3f; 383 | parts |= 0x80_80_80_80;// set the most significant bit 384 | parts >>= 8*(4-len);// right-align bytes 385 | // Now, unused bytes are zero, (which matters for Utf8Char.eq()) 386 | // and the rest are 0b10xx_xxxx 387 | 388 | // set header on first byte 389 | parts |= (0xff_00u32 >> len) & 0xff;// store length 390 | parts &= !(1u32 << (7-len));// clear the next bit after it 391 | 392 | Utf8Char {bytes: parts.to_le_bytes()} 393 | } 394 | } 395 | 396 | /// Create an `Utf8Char` from the first codepoint in a `str`. 397 | /// 398 | /// Returns an error if the `str` is empty. 
399 | /// 400 | /// # Examples 401 | /// 402 | /// ``` 403 | /// use encode_unicode::Utf8Char; 404 | /// 405 | /// assert_eq!(Utf8Char::from_str_start("a"), Ok((Utf8Char::from('a'),1))); 406 | /// assert_eq!(Utf8Char::from_str_start("ab"), Ok((Utf8Char::from('a'),1))); 407 | /// assert_eq!(Utf8Char::from_str_start("🂠 "), Ok((Utf8Char::from('🂠'),4))); 408 | /// assert_eq!(Utf8Char::from_str_start("é"), Ok((Utf8Char::from('e'),1)));// 'e'+u301 combining mark 409 | /// assert!(Utf8Char::from_str_start("").is_err()); 410 | /// ``` 411 | pub fn from_str_start(src: &str) -> Result<(Self,usize),EmptyStrError> { 412 | unsafe { 413 | if src.is_empty() { 414 | Err(EmptyStrError) 415 | } else { 416 | Ok(Utf8Char::from_slice_start_unchecked(src.as_bytes())) 417 | } 418 | } 419 | } 420 | /// Create an `Utf8Char` of the first codepoint in an UTF-8 slice. 421 | /// Also returns the length of the UTF-8 sequence for the codepoint. 422 | /// 423 | /// If the slice is from a `str`, use `::from_str_start()` to skip UTF-8 validation. 424 | /// 425 | /// # Errors 426 | /// 427 | /// Returns an `Err` if the slice is empty, doesn't start with a valid 428 | /// UTF-8 sequence or is too short for the sequence. 
429 | /// 430 | /// # Examples 431 | /// 432 | /// ``` 433 | /// use encode_unicode::Utf8Char; 434 | /// use encode_unicode::error::Utf8ErrorKind::*; 435 | /// 436 | /// assert_eq!(Utf8Char::from_slice_start(&[b'A', b'B', b'C']), Ok((Utf8Char::from('A'),1))); 437 | /// assert_eq!(Utf8Char::from_slice_start(&[0xdd, 0xbb]), Ok((Utf8Char::from('\u{77b}'),2))); 438 | /// 439 | /// assert_eq!(Utf8Char::from_slice_start(&[]).unwrap_err().kind(), TooFewBytes); 440 | /// assert_eq!(Utf8Char::from_slice_start(&[0xf0, 0x99]).unwrap_err().kind(), TooFewBytes); 441 | /// assert_eq!(Utf8Char::from_slice_start(&[0xee, b'F', 0x80]).unwrap_err().kind(), InterruptedSequence); 442 | /// assert_eq!(Utf8Char::from_slice_start(&[0xee, 0x99, 0x0f]).unwrap_err().kind(), InterruptedSequence); 443 | /// ``` 444 | pub fn from_slice_start(src: &[u8]) -> Result<(Self,usize),Utf8Error> { 445 | char::from_utf8_slice_start(src).map(|(_,len)| { 446 | let mut bytes = [0; 4]; 447 | bytes[..len].copy_from_slice(&src[..len]); 448 | (Utf8Char{bytes}, len) 449 | }) 450 | } 451 | /// A `from_slice_start()` that doesn't validate the codepoint. 452 | /// 453 | /// # Safety 454 | /// 455 | /// The slice must be non-empty and start with a valid UTF-8 codepoint. 456 | /// Invalid or incomplete values might cause reads of uninitalized memory. 457 | pub unsafe fn from_slice_start_unchecked(src: &[u8]) -> (Self,usize) { 458 | unsafe { 459 | let len = 1+src.get_unchecked(0).extra_utf8_bytes_unchecked(); 460 | let mut bytes = [0; 4]; 461 | ptr::copy_nonoverlapping(src.as_ptr(), bytes.as_mut_ptr() as *mut u8, len); 462 | (Utf8Char{bytes}, len) 463 | } 464 | } 465 | /// Create an `Utf8Char` from a byte array after validating it. 466 | /// 467 | /// The codepoint must start at the first byte. 468 | /// Unused bytes are set to zero by this function and so can be anything. 469 | /// 470 | /// # Errors 471 | /// 472 | /// Returns an `Err` if the array doesn't start with a valid UTF-8 sequence. 
473 | /// 474 | /// # Examples 475 | /// 476 | /// ``` 477 | /// use encode_unicode::Utf8Char; 478 | /// use encode_unicode::error::Utf8ErrorKind::*; 479 | /// 480 | /// assert_eq!(Utf8Char::from_array([b'A', 0, 0, 0]), Ok(Utf8Char::from('A'))); 481 | /// assert_eq!(Utf8Char::from_array([0xf4, 0x8b, 0xbb, 0xbb]), Ok(Utf8Char::from('\u{10befb}'))); 482 | /// assert_eq!(Utf8Char::from_array([b'A', b'B', b'C', b'D']), Ok(Utf8Char::from('A'))); 483 | /// assert_eq!(Utf8Char::from_array([0, 0, 0xcc, 0xbb]), Ok(Utf8Char::from('\0'))); 484 | /// 485 | /// assert_eq!(Utf8Char::from_array([0xef, b'F', 0x80, 0x80]).unwrap_err().kind(), InterruptedSequence); 486 | /// assert_eq!(Utf8Char::from_array([0xc1, 0x80, 0, 0]).unwrap_err().kind(), NonUtf8Byte); 487 | /// assert_eq!(Utf8Char::from_array([0xe0, 0x9a, 0xbf, 0]).unwrap_err().kind(), OverlongEncoding); 488 | /// assert_eq!(Utf8Char::from_array([0xf4, 0xaa, 0x99, 0x88]).unwrap_err().kind(), TooHighCodepoint); 489 | /// ``` 490 | pub fn from_array(utf8: [u8;4]) -> Result { 491 | // perform all validation 492 | char::from_utf8_array(utf8)?; 493 | let extra = utf8[0].extra_utf8_bytes_unchecked() as u32; 494 | // zero unused bytes in one operation by transmuting the arrary to 495 | // u32, apply an endian-corrected mask and transmute back 496 | let mask = u32::from_le(0xff_ff_ff_ff >> (8*(3-extra))); 497 | let unused_zeroed = mask & u32::from_ne_bytes(utf8); // native endian 498 | Ok(Utf8Char{ bytes: unused_zeroed.to_ne_bytes() }) 499 | } 500 | /// Zero-cost constructor. 501 | /// 502 | /// # Safety 503 | /// 504 | /// Must contain a valid codepoint starting at the first byte, with the 505 | /// unused bytes zeroed. 506 | /// Bad values can easily lead to undefined behavior. 507 | #[inline] 508 | pub const unsafe fn from_array_unchecked(utf8: [u8;4]) -> Self { 509 | Utf8Char{ bytes: utf8 } 510 | } 511 | /// Create an `Utf8Char` from a single byte. 512 | /// 513 | /// The byte must be an ASCII character. 
514 | /// 515 | /// # Errors 516 | /// 517 | /// Returns `NonAsciiError` if the byte greater than 127. 518 | /// 519 | /// # Examples 520 | /// 521 | /// ``` 522 | /// # use encode_unicode::Utf8Char; 523 | /// assert_eq!(Utf8Char::from_ascii(b'a').unwrap(), 'a'); 524 | /// assert!(Utf8Char::from_ascii(128).is_err()); 525 | /// ``` 526 | pub const fn from_ascii(ascii: u8) -> Result { 527 | [Ok(Utf8Char{ bytes: [ascii, 0, 0, 0] }), Err(NonAsciiError)][(ascii >> 7) as usize] 528 | } 529 | /// Create an `Utf8Char` from a single byte without checking that it's a 530 | /// valid codepoint on its own, which is only true for ASCII characters. 531 | /// 532 | /// # Safety 533 | /// 534 | /// The byte must be less than 128. 535 | #[inline] 536 | pub const unsafe fn from_ascii_unchecked(ascii: u8) -> Self { 537 | Utf8Char{ bytes: [ascii, 0, 0, 0] } 538 | } 539 | 540 | /// The number of bytes this character needs. 541 | /// 542 | /// Is between 1 and 4 (inclusive) and identical to `.as_ref().len()` or 543 | /// `.as_char().len_utf8()`. 544 | #[inline] 545 | pub const fn len(self) -> usize { 546 | // Invariants of the extra bytes enambles algorithms that 547 | // `u8.extra_utf8_bytes_unchecked()` cannot use. 548 | // Some of them turned out to require fewer x86 instructions: 549 | 550 | // Exploits that unused bytes are zero and calculates the number of 551 | // trailing zero bytes. 552 | // Setting a bit in the first byte prevents the function from returning 553 | // 0 for '\0' (which has 32 leading zeros). 554 | // trailing and leading is swapped below to optimize for little-endian 555 | // architectures. 556 | (4 - (u32::from_le_bytes(self.bytes)|1).leading_zeros()/8) as usize 557 | 558 | // Exploits that the extra bytes have their most significant bit set if 559 | // in use. 
560 | // Takes fewer instructions than the one above if popcnt can be used, 561 | // (which it cannot by default, 562 | // set RUSTFLAGS='-C target-cpu=native' to enable) 563 | //let all = u32::from_ne_bytes(self.bytes); 564 | //let msb_mask = u32::from_be(0x00808080); 565 | //let add_one = u32::from_be(0x80000000); 566 | //((all & msb_mask) | add_one).count_ones() as usize 567 | } 568 | // There is no .is_emty() because this type is never empty. 569 | 570 | /// Checks that the codepoint is an ASCII character. 571 | pub const fn is_ascii(self) -> bool { 572 | self.bytes[0].is_ascii() 573 | } 574 | /// Checks that two characters are an ASCII case-insensitive match. 575 | /// 576 | /// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`. 577 | pub const fn eq_ignore_ascii_case(&self, other: &Self) -> bool { 578 | if self.is_ascii() { 579 | self.bytes[0].eq_ignore_ascii_case(&other.bytes[0]) 580 | } else { 581 | // [u8; 4] can't be const compared as of Rust 1.60, but u32 can 582 | u32::from_le_bytes(self.bytes) == u32::from_le_bytes(other.bytes) 583 | } 584 | } 585 | /// Converts the character to its ASCII upper case equivalent. 586 | /// 587 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', 588 | /// but non-ASCII letters are unchanged. 589 | pub const fn to_ascii_uppercase(mut self) -> Self { 590 | self.bytes[0] = self.bytes[0].to_ascii_uppercase(); 591 | self 592 | } 593 | /// Converts the character to its ASCII lower case equivalent. 594 | /// 595 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', 596 | /// but non-ASCII letters are unchanged. 597 | pub const fn to_ascii_lowercase(mut self) -> Self { 598 | self.bytes[0] = self.bytes[0].to_ascii_lowercase(); 599 | self 600 | } 601 | /// Converts the character to its ASCII upper case equivalent in-place. 602 | /// 603 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', 604 | /// but non-ASCII letters are unchanged. 
605 | #[inline] 606 | pub fn make_ascii_uppercase(&mut self) { 607 | self.bytes[0].make_ascii_uppercase() 608 | } 609 | /// Converts the character to its ASCII lower case equivalent in-place. 610 | /// 611 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', 612 | /// but non-ASCII letters are unchanged. 613 | #[inline] 614 | pub fn make_ascii_lowercase(&mut self) { 615 | self.bytes[0].make_ascii_lowercase(); 616 | } 617 | 618 | /// Convert from UTF-8 to UTF-32 619 | pub fn to_char(self) -> char { 620 | unsafe { char::from_utf8_exact_slice_unchecked(&self.bytes[..self.len()]) } 621 | } 622 | /// Write the internal representation to a slice, 623 | /// and then returns the number of bytes written. 624 | /// 625 | /// # Panics 626 | /// 627 | /// Will panic the buffer is too small; 628 | /// You can get the required length from `.len()`, 629 | /// but a buffer of length four is always large enough. 630 | pub fn to_slice(self, dst: &mut[u8]) -> usize { 631 | if self.len() > dst.len() { 632 | panic!("The provided buffer is too small."); 633 | } 634 | dst[..self.len()].copy_from_slice(&self.bytes[..self.len()]); 635 | self.len() 636 | } 637 | /// Expose the internal array and the number of used bytes. 638 | pub const fn to_array(self) -> ([u8;4],usize) { 639 | (self.bytes, self.len()) 640 | } 641 | /// Return a `str` view of the array the codepoint is stored as. 642 | /// 643 | /// Is an unambiguous version of `.as_ref()`. 644 | pub fn as_str(&self) -> &str { 645 | self.deref() 646 | } 647 | } 648 | -------------------------------------------------------------------------------- /src/utf16_char.rs: -------------------------------------------------------------------------------- 1 | /* Copyright 2016-2022 Torbjørn Birch Moltu 2 | * 3 | * Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be 6 | * copied, modified, or distributed except according to those terms. 
7 | */ 8 | 9 | use crate::utf16_iterators::Utf16Iterator; 10 | use crate::traits::{CharExt, U16UtfExt}; 11 | use crate::utf8_char::Utf8Char; 12 | use crate::errors::{Utf16SliceError, Utf16ArrayError, Utf16TupleError}; 13 | use crate::errors::{NonBmpError, EmptyStrError, FromStrError}; 14 | extern crate core; 15 | use core::{hash,fmt}; 16 | use core::cmp::Ordering; 17 | use core::borrow::Borrow; 18 | use core::ops::Deref; 19 | use core::str::FromStr; 20 | #[cfg(feature="std")] 21 | use core::iter::FromIterator; 22 | #[cfg(feature="ascii")] 23 | use core::char; 24 | #[cfg(feature="ascii")] 25 | extern crate ascii; 26 | #[cfg(feature="ascii")] 27 | use ascii::{AsciiChar,ToAsciiChar,ToAsciiCharError}; 28 | 29 | 30 | // I don't think there is any good default value for char, but char does. 31 | #[derive(Default)] 32 | // char doesn't do anything more advanced than u32 for Eq/Ord, so we shouldn't either. 33 | // When it's a single unit, the second is zero, so Eq works. 34 | // #[derive(Ord)] however, breaks on surrogate pairs. 35 | #[derive(PartialEq,Eq)] 36 | #[derive(Clone,Copy)] 37 | 38 | 39 | /// An unicode codepoint stored as UTF-16. 40 | /// 41 | /// It can be borrowed as an `u16` slice, and has the same size as `char`. 42 | pub struct Utf16Char { 43 | units: [u16; 2], 44 | } 45 | 46 | 47 | ///////////////////// 48 | //conversion traits// 49 | ///////////////////// 50 | impl FromStr for Utf16Char { 51 | type Err = FromStrError; 52 | /// Create an `Utf16Char` from a string slice. 53 | /// The string must contain exactly one codepoint. 
54 | /// 55 | /// # Examples 56 | /// 57 | /// ``` 58 | /// use encode_unicode::error::FromStrError::*; 59 | /// use encode_unicode::Utf16Char; 60 | /// use std::str::FromStr; 61 | /// 62 | /// assert_eq!(Utf16Char::from_str("a"), Ok(Utf16Char::from('a'))); 63 | /// assert_eq!(Utf16Char::from_str("🂠"), Ok(Utf16Char::from('🂠'))); 64 | /// assert_eq!(Utf16Char::from_str(""), Err(Empty)); 65 | /// assert_eq!(Utf16Char::from_str("ab"), Err(MultipleCodepoints)); 66 | /// assert_eq!(Utf16Char::from_str("é"), Err(MultipleCodepoints));// 'e'+u301 combining mark 67 | /// ``` 68 | fn from_str(s: &str) -> Result { 69 | match Utf16Char::from_str_start(s) { 70 | Ok((u16c,bytes)) if bytes == s.len() => Ok(u16c), 71 | Ok((_,_)) => Err(FromStrError::MultipleCodepoints), 72 | Err(EmptyStrError) => Err(FromStrError::Empty), 73 | } 74 | } 75 | } 76 | impl From for Utf16Char { 77 | fn from(c: char) -> Self { 78 | let (first, second) = c.to_utf16_tuple(); 79 | Utf16Char{ units: [first, second.unwrap_or(0)] } 80 | } 81 | } 82 | impl From for Utf16Char { 83 | fn from(utf8: Utf8Char) -> Utf16Char { 84 | let (b, utf8_len) = utf8.to_array(); 85 | match utf8_len { 86 | 1 => Utf16Char{ units: [b[0] as u16, 0] }, 87 | 4 => {// need surrogate 88 | let mut first = 0xd800 - (0x01_00_00u32 >> 10) as u16; 89 | first += (b[0] as u16 & 0x07) << 8; 90 | first += (b[1] as u16 & 0x3f) << 2; 91 | first += (b[2] as u16 & 0x30) >> 4; 92 | let mut second = 0xdc00; 93 | second |= (b[2] as u16 & 0x0f) << 6; 94 | second |= b[3] as u16 & 0x3f; 95 | Utf16Char{ units: [first, second] } 96 | }, 97 | _ => { // 2 or 3 98 | let mut unit = ((b[0] as u16 & 0x1f) << 6) | (b[1] as u16 & 0x3f); 99 | if utf8_len == 3 { 100 | unit = (unit << 6) | (b[2] as u16 & 0x3f); 101 | } 102 | Utf16Char{ units: [unit, 0] } 103 | }, 104 | } 105 | } 106 | } 107 | impl From for char { 108 | fn from(uc: Utf16Char) -> char { 109 | char::from_utf16_array_unchecked(uc.to_array()) 110 | } 111 | } 112 | impl IntoIterator for Utf16Char { 113 | 
type Item=u16; 114 | type IntoIter=Utf16Iterator; 115 | /// Iterate over the units. 116 | fn into_iter(self) -> Utf16Iterator { 117 | Utf16Iterator::from(self) 118 | } 119 | } 120 | 121 | #[cfg(feature="std")] 122 | impl Extend for Vec { 123 | fn extend>(&mut self, iter: I) { 124 | let iter = iter.into_iter(); 125 | self.reserve(iter.size_hint().0); 126 | for u16c in iter { 127 | self.push(u16c.units[0]); 128 | if u16c.units[1] != 0 { 129 | self.push(u16c.units[1]); 130 | } 131 | } 132 | } 133 | } 134 | #[cfg(feature="std")] 135 | impl<'a> Extend<&'a Utf16Char> for Vec { 136 | fn extend>(&mut self, iter: I) { 137 | self.extend(iter.into_iter().cloned()) 138 | } 139 | } 140 | #[cfg(feature="std")] 141 | impl FromIterator for Vec { 142 | fn from_iter>(iter: I) -> Self { 143 | let mut vec = Vec::new(); 144 | vec.extend(iter); 145 | return vec; 146 | } 147 | } 148 | #[cfg(feature="std")] 149 | impl<'a> FromIterator<&'a Utf16Char> for Vec { 150 | fn from_iter>(iter: I) -> Self { 151 | Self::from_iter(iter.into_iter().cloned()) 152 | } 153 | } 154 | 155 | #[cfg(feature="std")] 156 | impl Extend for String { 157 | fn extend>(&mut self, iter: I) { 158 | self.extend(iter.into_iter().map(Utf8Char::from)); 159 | } 160 | } 161 | #[cfg(feature="std")] 162 | impl<'a> Extend<&'a Utf16Char> for String { 163 | fn extend>(&mut self, iter: I) { 164 | self.extend(iter.into_iter().cloned()); 165 | } 166 | } 167 | #[cfg(feature="std")] 168 | impl FromIterator for String { 169 | fn from_iter>(iter: I) -> Self { 170 | let mut s = String::new(); 171 | s.extend(iter); 172 | return s; 173 | } 174 | } 175 | #[cfg(feature="std")] 176 | impl<'a> FromIterator<&'a Utf16Char> for String { 177 | fn from_iter>(iter: I) -> Self { 178 | Self::from_iter(iter.into_iter().cloned()) 179 | } 180 | } 181 | 182 | 183 | ///////////////// 184 | //getter traits// 185 | ///////////////// 186 | impl AsRef<[u16]> for Utf16Char { 187 | #[inline] 188 | fn as_ref(&self) -> &[u16] { 189 | &self.units[..self.len()] 190 
| } 191 | } 192 | impl Borrow<[u16]> for Utf16Char { 193 | #[inline] 194 | fn borrow(&self) -> &[u16] { 195 | self.as_ref() 196 | } 197 | } 198 | impl Deref for Utf16Char { 199 | type Target = [u16]; 200 | #[inline] 201 | fn deref(&self) -> &[u16] { 202 | self.as_ref() 203 | } 204 | } 205 | 206 | 207 | //////////////// 208 | //ascii traits// 209 | //////////////// 210 | #[cfg(feature="ascii")] 211 | /// Requires the feature "ascii". 212 | impl From for Utf16Char { 213 | #[inline] 214 | fn from(ac: AsciiChar) -> Self { 215 | Utf16Char{ units: [ac.as_byte() as u16, 0] } 216 | } 217 | } 218 | #[cfg(feature="ascii")] 219 | /// Requires the feature "ascii". 220 | impl ToAsciiChar for Utf16Char { 221 | #[inline] 222 | fn to_ascii_char(self) -> Result { 223 | self.units[0].to_ascii_char() 224 | } 225 | #[inline] 226 | unsafe fn to_ascii_char_unchecked(self) -> AsciiChar { 227 | unsafe { self.units[0].to_ascii_char_unchecked() } 228 | } 229 | } 230 | 231 | 232 | ///////////////////////////////////////////////////////// 233 | //Genaral traits that cannot be derived to emulate char// 234 | ///////////////////////////////////////////////////////// 235 | impl hash::Hash for Utf16Char { 236 | fn hash(&self, state: &mut H) { 237 | self.to_char().hash(state); 238 | } 239 | } 240 | impl fmt::Debug for Utf16Char { 241 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 242 | fmt::Debug::fmt(&self.to_char(), fmtr) 243 | } 244 | } 245 | impl fmt::Display for Utf16Char { 246 | fn fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result { 247 | fmt::Display::fmt(&Utf8Char::from(*self), fmtr) 248 | } 249 | } 250 | // Cannot derive these impls because two-unit characters must always compare 251 | // greater than one-unit ones. 
252 | impl PartialOrd for Utf16Char { 253 | #[inline] 254 | fn partial_cmp(&self, rhs: &Self) -> Option { 255 | Some(self.cmp(rhs)) 256 | } 257 | } 258 | impl Ord for Utf16Char { 259 | #[inline] 260 | fn cmp(&self, rhs: &Self) -> Ordering { 261 | // Shift the first unit by 0xd if surrogate, and 0 otherwise. 262 | // This ensures surrogates are always greater than 0xffff, and 263 | // that the second unit only affect the result when the first are equal. 264 | // Multiplying by a constant factor isn't enough because that factor 265 | // would have to be greater than 1023 and smaller than 5.5. 266 | // This transformation is less complicated than combine_surrogates(). 267 | let lhs = (self.units[0] as u32, self.units[1] as u32); 268 | let rhs = (rhs.units[0] as u32, rhs.units[1] as u32); 269 | let lhs = (lhs.0 << (lhs.1 >> 12)) + lhs.1; 270 | let rhs = (rhs.0 << (rhs.1 >> 12)) + rhs.1; 271 | lhs.cmp(&rhs) 272 | } 273 | } 274 | 275 | 276 | //////////////////////////////// 277 | //Comparisons with other types// 278 | //////////////////////////////// 279 | impl PartialEq for Utf16Char { 280 | fn eq(&self, u32c: &char) -> bool { 281 | *self == Utf16Char::from(*u32c) 282 | } 283 | } 284 | impl PartialEq for char { 285 | fn eq(&self, u16c: &Utf16Char) -> bool { 286 | Utf16Char::from(*self) == *u16c 287 | } 288 | } 289 | impl PartialOrd for Utf16Char { 290 | fn partial_cmp(&self, u32c: &char) -> Option { 291 | self.partial_cmp(&Utf16Char::from(*u32c)) 292 | } 293 | } 294 | impl PartialOrd for char { 295 | fn partial_cmp(&self, u16c: &Utf16Char) -> Option { 296 | Utf16Char::from(*self).partial_cmp(u16c) 297 | } 298 | } 299 | 300 | impl PartialEq for Utf16Char { 301 | fn eq(&self, u8c: &Utf8Char) -> bool { 302 | *self == Utf16Char::from(*u8c) 303 | } 304 | } 305 | impl PartialOrd for Utf16Char { 306 | fn partial_cmp(&self, u8c: &Utf8Char) -> Option { 307 | self.partial_cmp(&Utf16Char::from(*u8c)) 308 | } 309 | } 310 | // The other direction is implemented in utf8_char.rs 311 | 
312 | /// Only considers the unit equal if the codepoint of the `Utf16Char` is not 313 | /// made up of a surrogate pair. 314 | /// 315 | /// There is no impl in the opposite direction, as this should only be used to 316 | /// compare `Utf16Char`s against constants. 317 | /// 318 | /// # Examples 319 | /// 320 | /// ``` 321 | /// # use encode_unicode::Utf16Char; 322 | /// assert!(Utf16Char::from('6') == b'6' as u16); 323 | /// assert!(Utf16Char::from('\u{FFFF}') == 0xffff_u16); 324 | /// assert!(Utf16Char::from_tuple((0xd876, Some(0xdef9))).unwrap() != 0xd876_u16); 325 | /// ``` 326 | impl PartialEq for Utf16Char { 327 | fn eq(&self, unit: &u16) -> bool { 328 | self.units[0] == *unit && self.units[1] == 0 329 | } 330 | } 331 | /// Only considers the byte equal if the codepoint of the `Utf16Char` is <= U+FF. 332 | /// 333 | /// # Examples 334 | /// 335 | /// ``` 336 | /// # use encode_unicode::Utf16Char; 337 | /// assert!(Utf16Char::from('6') == b'6'); 338 | /// assert!(Utf16Char::from('\u{00FF}') == b'\xff'); 339 | /// assert!(Utf16Char::from('\u{0100}') != b'\0'); 340 | /// ``` 341 | impl PartialEq for Utf16Char { 342 | fn eq(&self, byte: &u8) -> bool { 343 | self.units[0] == *byte as u16 344 | } 345 | } 346 | #[cfg(feature = "ascii")] 347 | /// `Utf16Char`s that are not ASCII never compare equal. 348 | impl PartialEq for Utf16Char { 349 | #[inline] 350 | fn eq(&self, ascii: &AsciiChar) -> bool { 351 | self.units[0] == *ascii as u16 352 | } 353 | } 354 | #[cfg(feature = "ascii")] 355 | /// `Utf16Char`s that are not ASCII never compare equal. 356 | impl PartialEq for AsciiChar { 357 | #[inline] 358 | fn eq(&self, u16c: &Utf16Char) -> bool { 359 | *self as u16 == u16c.units[0] 360 | } 361 | } 362 | #[cfg(feature = "ascii")] 363 | /// `Utf16Char`s that are not ASCII always compare greater. 
364 | impl PartialOrd for Utf16Char { 365 | #[inline] 366 | fn partial_cmp(&self, ascii: &AsciiChar) -> Option { 367 | self.units[0].partial_cmp(&(*ascii as u16)) 368 | } 369 | } 370 | #[cfg(feature = "ascii")] 371 | /// `Utf16Char`s that are not ASCII always compare greater. 372 | impl PartialOrd for AsciiChar { 373 | #[inline] 374 | fn partial_cmp(&self, u16c: &Utf16Char) -> Option { 375 | (*self as u16).partial_cmp(&u16c.units[0]) 376 | } 377 | } 378 | 379 | 380 | /////////////////////////////////////////////////////// 381 | //pub impls that should be together for nicer rustdoc// 382 | /////////////////////////////////////////////////////// 383 | impl Utf16Char { 384 | /// A `const fn` alternative to the trait-based `Utf16Char::from(char)`. 385 | /// 386 | /// # Examples 387 | /// 388 | /// ``` 389 | /// # use encode_unicode::Utf16Char; 390 | /// const REPLACEMENT_CHARACTER: Utf16Char = Utf16Char::new('\u{fffd}'); 391 | /// ``` 392 | pub const fn new(c: char) -> Self { 393 | if c <= '\u{ffff}' { 394 | Utf16Char{ units: [c as u16, 0] } 395 | } else { 396 | let c = (c as u32).wrapping_sub(0x01_00_00); 397 | let first = 0xd8_00 | (c >> 10) as u16; 398 | let second = 0xdc_00 | (c & 0x0_03_ff) as u16; 399 | Utf16Char{ units: [first, second] } 400 | } 401 | } 402 | /// Create an `Utf16Char` from the first codepoint in a string slice, 403 | /// converting from UTF-8 to UTF-16. 404 | /// 405 | /// The returned `usize` is the number of UTF-8 bytes used from the str, 406 | /// and not the number of UTF-16 units. 407 | /// 408 | /// Returns an error if the `str` is empty. 
409 | /// 410 | /// # Examples 411 | /// 412 | /// ``` 413 | /// use encode_unicode::Utf16Char; 414 | /// 415 | /// assert_eq!(Utf16Char::from_str_start("a"), Ok((Utf16Char::from('a'),1))); 416 | /// assert_eq!(Utf16Char::from_str_start("ab"), Ok((Utf16Char::from('a'),1))); 417 | /// assert_eq!(Utf16Char::from_str_start("🂠 "), Ok((Utf16Char::from('🂠'),4))); 418 | /// assert_eq!(Utf16Char::from_str_start("é"), Ok((Utf16Char::from('e'),1)));// 'e'+u301 combining mark 419 | /// assert!(Utf16Char::from_str_start("").is_err()); 420 | /// ``` 421 | pub const fn from_str_start(s: &str) -> Result<(Self,usize), EmptyStrError> { 422 | if s.is_empty() { 423 | return Err(EmptyStrError); 424 | } 425 | let b = s.as_bytes(); 426 | // Read the last byte first to reduce the number of unnecesary length checks. 427 | match b[0] { 428 | 0..=127 => {// 1 byte => 1 unit 429 | let unit = b[0] as u16;// 0b0000_0000_0xxx_xxxx 430 | Ok((Utf16Char{ units: [unit, 0] }, 1)) 431 | }, 432 | 0b1000_0000..=0b1101_1111 => {// 2 bytes => 1 unit 433 | let unit = (((b[1] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx 434 | | (((b[0] & 0x1f) as u16) << 6);// 0b0000_0xxx_xx00_0000 435 | Ok((Utf16Char{ units: [unit, 0] }, 2)) 436 | }, 437 | 0b1110_0000..=0b1110_1111 => {// 3 bytes => 1 unit 438 | let unit = (((b[2] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx 439 | | (((b[1] & 0x3f) as u16) << 6) // 0b0000_xxxx_xx00_0000 440 | | (((b[0] & 0x0f) as u16) << 12);// 0bxxxx_0000_0000_0000 441 | Ok((Utf16Char{ units: [unit, 0] }, 3)) 442 | }, 443 | _ => {// 4 bytes => 2 units 444 | let second = 0xdc00 // 0b1101_1100_0000_0000 445 | | (((b[3] & 0x3f) as u16) << 0) // 0b0000_0000_00xx_xxxx 446 | | (((b[2] & 0x0f) as u16) << 6);// 0b0000_00xx_xx00_0000 447 | let first = 0xd800-(0x01_00_00u32>>10) as u16// 0b1101_0111_1100_0000 448 | + (((b[2] & 0x30) as u16) >> 4) // 0b0000_0000_0000_00xx 449 | + (((b[1] & 0x3f) as u16) << 2) // 0b0000_0000_xxxx_xx00 450 | + (((b[0] & 0x07) as u16) << 8); // 
0b0000_0xxx_0000_0000 451 | Ok((Utf16Char{ units: [first, second] }, 4)) 452 | } 453 | } 454 | } 455 | /// Validate and store the first UTF-16 codepoint in the slice. 456 | /// Also return how many units were needed. 457 | pub fn from_slice_start(src: &[u16]) -> Result<(Self,usize), Utf16SliceError> { 458 | char::from_utf16_slice_start(src).map(|(_,len)| { 459 | let second = if len==2 {src[1]} else {0}; 460 | (Utf16Char{ units: [src[0], second] }, len) 461 | }) 462 | } 463 | /// Store the first UTF-16 codepoint of the slice. 464 | /// 465 | /// # Safety 466 | /// 467 | /// The slice must be non-empty and start with a valid UTF-16 codepoint. 468 | /// The length of the slice is never checked. 469 | pub unsafe fn from_slice_start_unchecked(src: &[u16]) -> (Self,usize) { 470 | unsafe { 471 | let first = *src.get_unchecked(0); 472 | if first.is_utf16_leading_surrogate() { 473 | (Utf16Char{ units: [first, *src.get_unchecked(1)] }, 2) 474 | } else { 475 | (Utf16Char{ units: [first, 0] }, 1) 476 | } 477 | } 478 | } 479 | /// Validate and store an UTF-16 array as returned from `char.to_utf16_array()`. 
480 | /// 481 | /// # Examples 482 | /// 483 | /// ``` 484 | /// use encode_unicode::Utf16Char; 485 | /// use encode_unicode::error::Utf16ArrayError; 486 | /// 487 | /// assert_eq!(Utf16Char::from_array(['x' as u16, 'y' as u16]), Ok(Utf16Char::from('x'))); 488 | /// assert_eq!(Utf16Char::from_array(['睷' as u16, 0]), Ok(Utf16Char::from('睷'))); 489 | /// assert_eq!(Utf16Char::from_array([0xda6f, 0xdcde]), Ok(Utf16Char::from('\u{abcde}'))); 490 | /// assert_eq!(Utf16Char::from_array([0xf111, 0xdbad]), Ok(Utf16Char::from('\u{f111}'))); 491 | /// assert_eq!(Utf16Char::from_array([0xdaaf, 0xdaaf]), Err(Utf16ArrayError::SecondIsNotTrailingSurrogate)); 492 | /// assert_eq!(Utf16Char::from_array([0xdcac, 0x9000]), Err(Utf16ArrayError::FirstIsTrailingSurrogate)); 493 | /// ``` 494 | pub const fn from_array(units: [u16; 2]) -> Result { 495 | if (units[0] & 0xf8_00) != 0xd8_00 { 496 | Ok(Utf16Char { units: [units[0], 0] }) 497 | } else if units[0] < 0xdc_00 && (units[1] & 0xfc_00) == 0xdc_00 { 498 | Ok(Utf16Char { units }) 499 | } else if units[0] < 0xdc_00 { 500 | Err(Utf16ArrayError::SecondIsNotTrailingSurrogate) 501 | } else { 502 | Err(Utf16ArrayError::FirstIsTrailingSurrogate) 503 | } 504 | } 505 | /// Create an `Utf16Char` from an array as returned from `char.to_utf16_array()`. 506 | /// 507 | /// # Safety 508 | /// 509 | /// The units must form a valid codepoint, and the second unit must be 0 510 | /// when a surrogate pair is not required. 511 | /// Violating this can easily lead to undefined behavior, although unlike 512 | /// `char` bad `Utf16Char`s simply existing is not immediately UB. 
513 | pub const unsafe fn from_array_unchecked(units: [u16; 2]) -> Self { 514 | Utf16Char { units } 515 | } 516 | pub(crate) const fn validate_tuple(utf16: (u16,Option)) -> Result<(),Utf16TupleError> { 517 | match utf16 { 518 | (0x00_00..=0xd7_ff, None) | // single 519 | (0xe0_00..=0xff_ff, None) | // single 520 | (0xd8_00..=0xdb_ff, Some(0xdc_00..=0xdf_ff)) // correct surrogate 521 | => Ok(()), 522 | (0xd8_00..=0xdb_ff, Some(_)) => Err(Utf16TupleError::SecondIsNotTrailingSurrogate), 523 | (0xd8_00..=0xdb_ff, None ) => Err(Utf16TupleError::MissingSecond), 524 | (0xdc_00..=0xdf_ff, _ ) => Err(Utf16TupleError::FirstIsTrailingSurrogate), 525 | ( _ , Some(_)) => Err(Utf16TupleError::SuperfluousSecond), 526 | } 527 | } 528 | /// Validate and store a UTF-16 pair as returned from `char.to_utf16_tuple()`. 529 | pub const fn from_tuple(utf16: (u16,Option)) -> Result { 530 | unsafe { 531 | match Self::validate_tuple(utf16) { 532 | Ok(()) => Ok(Self::from_tuple_unchecked(utf16)), 533 | Err(e) => Err(e), 534 | } 535 | } 536 | } 537 | /// Create an `Utf16Char` from a tuple as returned from `char.to_utf16_tuple()`. 538 | /// 539 | /// # Safety 540 | /// 541 | /// The units must form a valid codepoint with the second being 0 when a 542 | /// surrogate pair is not required. 543 | /// Violating this can easily lead to undefined behavior. 544 | pub const unsafe fn from_tuple_unchecked(utf16: (u16,Option)) -> Self { 545 | let second = match utf16.1 { 546 | Some(extra) => extra, 547 | None => 0, 548 | }; 549 | Utf16Char { units: [utf16.0, second] } 550 | } 551 | /// Create an `Utf16Char` from a single unit. 552 | /// 553 | /// Codepoints less than `'\u{1_00_00}'` (which fit in an `u16`) 554 | /// are part of the basic multilingual plane 555 | /// unless they are reserved for surrogate pairs. 
556 | /// 557 | /// # Errors 558 | /// 559 | /// Returns `NonBmpError` if the unit is in the range `0xd800..0xe000` 560 | /// (which means that it's part of a surrogat pair) 561 | /// 562 | /// # Examples 563 | /// 564 | /// ``` 565 | /// # use encode_unicode::Utf16Char; 566 | /// assert_eq!(Utf16Char::from_bmp(0x40).unwrap(), '@'); 567 | /// assert_eq!(Utf16Char::from_bmp('ø' as u16).unwrap(), 'ø'); 568 | /// assert!(Utf16Char::from_bmp(0xdddd).is_err()); 569 | /// ``` 570 | pub const fn from_bmp(bmp_codepoint: u16) -> Result { 571 | let is_not_bmp = bmp_codepoint & 0xf800 == 0xd800; 572 | let if_good = Utf16Char{ units: [bmp_codepoint, 0] }; 573 | [Ok(if_good), Err(NonBmpError)][is_not_bmp as usize] 574 | } 575 | /// Create an `Utf16Char` from a single unit without checking that it's a 576 | /// valid codepoint on its own. 577 | /// 578 | /// # Safety 579 | /// 580 | /// The unit must be less than 0xd800 or greater than 0xdfff. 581 | /// In other words, not part of a surrogate pair. 582 | /// Violating this can easily lead to undefined behavior. 583 | #[inline] 584 | pub const unsafe fn from_bmp_unchecked(bmp_codepoint: u16) -> Self { 585 | Utf16Char{ units: [bmp_codepoint, 0] } 586 | } 587 | /// Checks that the codepoint is in the basic multilingual plane. 588 | /// 589 | /// # Examples 590 | /// ``` 591 | /// # use encode_unicode::Utf16Char; 592 | /// assert_eq!(Utf16Char::from('e').is_bmp(), true); 593 | /// assert_eq!(Utf16Char::from('€').is_bmp(), true); 594 | /// assert_eq!(Utf16Char::from('𝔼').is_bmp(), false); 595 | /// ``` 596 | #[inline] 597 | pub const fn is_bmp(self) -> bool { 598 | self.units[1] == 0 599 | } 600 | 601 | /// The number of units this character is made up of. 602 | /// 603 | /// Is either 1 or 2 and identical to `.as_char().len_utf16()` 604 | /// or `.as_ref().len()`. 
605 | #[inline] 606 | pub const fn len(self) -> usize { 607 | 1 + (self.units[1] as usize >> 15) 608 | } 609 | // There is no `.is_emty()` because it would always return false. 610 | 611 | /// Checks that the codepoint is an ASCII character. 612 | #[inline] 613 | pub const fn is_ascii(self) -> bool { 614 | self.units[0] <= 127 615 | } 616 | /// Checks that two characters are an ASCII case-insensitive match. 617 | /// 618 | /// Is equivalent to `a.to_ascii_lowercase() == b.to_ascii_lowercase()`. 619 | pub const fn eq_ignore_ascii_case(&self, other: &Self) -> bool { 620 | if self.is_ascii() && other.is_ascii() { 621 | (self.units[0] as u8).eq_ignore_ascii_case(&(other.units[0] as u8)) 622 | } else { 623 | self.units[0] == other.units[0] && self.units[1] == other.units[1] 624 | } 625 | } 626 | /// Converts the character to its ASCII upper case equivalent. 627 | /// 628 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', 629 | /// but non-ASCII letters are unchanged. 630 | pub const fn to_ascii_uppercase(self) -> Self { 631 | let n = self.units[0].wrapping_sub(b'a' as u16); 632 | if n < 26 {Utf16Char{ units: [n+b'A' as u16, 0] }} 633 | else {self} 634 | } 635 | /// Converts the character to its ASCII lower case equivalent. 636 | /// 637 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', 638 | /// but non-ASCII letters are unchanged. 639 | pub const fn to_ascii_lowercase(self) -> Self { 640 | let n = self.units[0].wrapping_sub(b'A' as u16); 641 | if n < 26 {Utf16Char{ units: [n+b'a' as u16, 0] }} 642 | else {self} 643 | } 644 | /// Converts the character to its ASCII upper case equivalent in-place. 645 | /// 646 | /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', 647 | /// but non-ASCII letters are unchanged. 648 | pub fn make_ascii_uppercase(&mut self) { 649 | *self = self.to_ascii_uppercase() 650 | } 651 | /// Converts the character to its ASCII lower case equivalent in-place. 
652 | /// 653 | /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', 654 | /// but non-ASCII letters are unchanged. 655 | pub fn make_ascii_lowercase(&mut self) { 656 | *self = self.to_ascii_lowercase(); 657 | } 658 | 659 | /// Convert from UTF-16 to UTF-32 660 | pub fn to_char(self) -> char { 661 | self.into() 662 | } 663 | /// Write the internal representation to a slice, 664 | /// and then returns the number of `u16`s written. 665 | /// 666 | /// # Panics 667 | /// Will panic the buffer is too small; 668 | /// You can get the required length from `.len()`, 669 | /// but a buffer of length two is always large enough. 670 | pub fn to_slice(self, dst: &mut[u16]) -> usize { 671 | // Write the last unit first to avoid repeated length checks. 672 | let extra = self.units[1] as usize >> 15; 673 | match dst.get_mut(extra) { 674 | Some(first) => *first = self.units[extra], 675 | None => panic!("The provided buffer is too small.") 676 | } 677 | if extra != 0 {dst[0] = self.units[0];} 678 | extra+1 679 | } 680 | /// Get the character represented as an array of two units. 681 | /// 682 | /// The second `u16` is zero for codepoints that fit in one unit. 683 | #[inline] 684 | pub const fn to_array(self) -> [u16;2] { 685 | self.units 686 | } 687 | /// The second `u16` is used for surrogate pairs. 688 | #[inline] 689 | pub const fn to_tuple(self) -> (u16,Option) { 690 | (self.units[0], [None, Some(self.units[1])][self.units[1] as usize >> 15]) 691 | } 692 | } 693 | --------------------------------------------------------------------------------