├── .github
    ├── FUNDING.yml
    └── workflows
    │   └── ci.yml
├── .gitignore
├── COPYING
├── Cargo.toml
├── LICENSE-APACHE
├── LICENSE-MIT
├── README.md
├── ci
    └── script.sh
├── rustfmt.toml
└── src
    ├── lib.rs
    └── util.rs


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [BurntSushi]
2 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | on:
 3 |   pull_request:
 4 |   push:
 5 |     branches:
 6 |     - master
 7 |   schedule:
 8 |   - cron: '00 01 * * *'
 9 | jobs:
10 |   test:
11 |     name: test
12 |     runs-on: ${{ matrix.os }}
13 |     strategy:
14 |       matrix:
15 |         build:
16 |         - stable
17 |         - beta
18 |         - nightly
19 |         - macos
20 |         - win-msvc
21 |         - win-gnu
22 |         include:
23 |         - build: stable
24 |           os: ubuntu-18.04
25 |           rust: stable
26 |         - build: beta
27 |           os: ubuntu-18.04
28 |           rust: beta
29 |         - build: nightly
30 |           os: ubuntu-18.04
31 |           rust: nightly
32 |         - build: macos
33 |           os: macos-latest
34 |           rust: stable
35 |         - build: win-msvc
36 |           os: windows-2019
37 |           rust: stable
38 |         - build: win-gnu
39 |           os: windows-2019
40 |           rust: stable-x86_64-gnu
41 |     steps:
42 |     - name: Checkout repository
43 |       uses: actions/checkout@v1
44 |       with:
45 |         fetch-depth: 1
46 |     - name: Install Rust
47 |       uses: actions-rs/toolchain@v1
48 |       with:
49 |         toolchain: ${{ matrix.rust }}
50 |         override: true
51 |         profile: minimal
52 |     - run: cargo build --verbose
53 |     - run: cargo doc --verbose
54 |     - run: cargo test --verbose
55 | 
56 |   rustfmt:
57 |     name: rustfmt
58 |     runs-on: ubuntu-18.04
59 |     steps:
60 |     - name: Checkout repository
61 |       uses: actions/checkout@v1
62 |       with:
63 |         fetch-depth: 1
64 |     - name: Install Rust
65 |       uses: actions-rs/toolchain@v1
66 |       with:
67 |         toolchain: stable
68 |         override: true
69 |         profile: minimal
70 |         components: rustfmt
71 |     - name: Check formatting
72 |       run: |
73 |         cargo fmt -- --check
74 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .*.swp
2 | tags
3 | target
4 | Cargo.lock
5 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
1 | This project is licensed under either of
2 | 
3 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
4 |    http://www.apache.org/licenses/LICENSE-2.0)
5 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
6 |    http://opensource.org/licenses/MIT)
7 | 
8 | at your option.
9 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "encoding_rs_io"
 3 | version = "0.1.7"  #:version
 4 | authors = ["Andrew Gallant <jamslam@gmail.com>"]
 5 | description = "Streaming transcoding for encoding_rs"
 6 | documentation = "https://docs.rs/encoding_rs_io"
 7 | repository = "https://github.com/BurntSushi/encoding_rs_io"
 8 | readme = "README.md"
 9 | keywords = ["encoding", "transcoding", "stream", "io", "read"]
10 | license = "MIT OR Apache-2.0"
11 | categories = ["text-processing", "encoding", "web-programming", "email"]
12 | exclude = ["/ci/*", "/.travis.yml", "/appveyor.yml"]
13 | 
14 | [badges]
15 | travis-ci = { repository = "BurntSushi/encoding_rs_io" }
16 | appveyor = { repository = "BurntSushi/encoding_rs_io" }
17 | 
18 | [lib]
19 | bench = false
20 | 
21 | [dependencies]
22 | encoding_rs = "0.8"
23 | 
24 | [profile.release]
25 | debug = true
26 | 


--------------------------------------------------------------------------------
/LICENSE-APACHE:
--------------------------------------------------------------------------------
  1 |                               Apache License
  2 |                         Version 2.0, January 2004
  3 |                      http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 |    "License" shall mean the terms and conditions for use, reproduction,
 10 |    and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |    "Licensor" shall mean the copyright owner or entity authorized by
 13 |    the copyright owner that is granting the License.
 14 | 
 15 |    "Legal Entity" shall mean the union of the acting entity and all
 16 |    other entities that control, are controlled by, or are under common
 17 |    control with that entity. For the purposes of this definition,
 18 |    "control" means (i) the power, direct or indirect, to cause the
 19 |    direction or management of such entity, whether by contract or
 20 |    otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |    outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |    "You" (or "Your") shall mean an individual or Legal Entity
 24 |    exercising permissions granted by this License.
 25 | 
 26 |    "Source" form shall mean the preferred form for making modifications,
 27 |    including but not limited to software source code, documentation
 28 |    source, and configuration files.
 29 | 
 30 |    "Object" form shall mean any form resulting from mechanical
 31 |    transformation or translation of a Source form, including but
 32 |    not limited to compiled object code, generated documentation,
 33 |    and conversions to other media types.
 34 | 
 35 |    "Work" shall mean the work of authorship, whether in Source or
 36 |    Object form, made available under the License, as indicated by a
 37 |    copyright notice that is included in or attached to the work
 38 |    (an example is provided in the Appendix below).
 39 | 
 40 |    "Derivative Works" shall mean any work, whether in Source or Object
 41 |    form, that is based on (or derived from) the Work and for which the
 42 |    editorial revisions, annotations, elaborations, or other modifications
 43 |    represent, as a whole, an original work of authorship. For the purposes
 44 |    of this License, Derivative Works shall not include works that remain
 45 |    separable from, or merely link (or bind by name) to the interfaces of,
 46 |    the Work and Derivative Works thereof.
 47 | 
 48 |    "Contribution" shall mean any work of authorship, including
 49 |    the original version of the Work and any modifications or additions
 50 |    to that Work or Derivative Works thereof, that is intentionally
 51 |    submitted to Licensor for inclusion in the Work by the copyright owner
 52 |    or by an individual or Legal Entity authorized to submit on behalf of
 53 |    the copyright owner. For the purposes of this definition, "submitted"
 54 |    means any form of electronic, verbal, or written communication sent
 55 |    to the Licensor or its representatives, including but not limited to
 56 |    communication on electronic mailing lists, source code control systems,
 57 |    and issue tracking systems that are managed by, or on behalf of, the
 58 |    Licensor for the purpose of discussing and improving the Work, but
 59 |    excluding communication that is conspicuously marked or otherwise
 60 |    designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |    "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |    on behalf of whom a Contribution has been received by Licensor and
 64 |    subsequently incorporated within the Work.
 65 | 
 66 | 2. Grant of Copyright License. Subject to the terms and conditions of
 67 |    this License, each Contributor hereby grants to You a perpetual,
 68 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |    copyright license to reproduce, prepare Derivative Works of,
 70 |    publicly display, publicly perform, sublicense, and distribute the
 71 |    Work and such Derivative Works in Source or Object form.
 72 | 
 73 | 3. Grant of Patent License. Subject to the terms and conditions of
 74 |    this License, each Contributor hereby grants to You a perpetual,
 75 |    worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |    (except as stated in this section) patent license to make, have made,
 77 |    use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |    where such license applies only to those patent claims licensable
 79 |    by such Contributor that are necessarily infringed by their
 80 |    Contribution(s) alone or by combination of their Contribution(s)
 81 |    with the Work to which such Contribution(s) was submitted. If You
 82 |    institute patent litigation against any entity (including a
 83 |    cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |    or a Contribution incorporated within the Work constitutes direct
 85 |    or contributory patent infringement, then any patent licenses
 86 |    granted to You under this License for that Work shall terminate
 87 |    as of the date such litigation is filed.
 88 | 
 89 | 4. Redistribution. You may reproduce and distribute copies of the
 90 |    Work or Derivative Works thereof in any medium, with or without
 91 |    modifications, and in Source or Object form, provided that You
 92 |    meet the following conditions:
 93 | 
 94 |    (a) You must give any other recipients of the Work or
 95 |        Derivative Works a copy of this License; and
 96 | 
 97 |    (b) You must cause any modified files to carry prominent notices
 98 |        stating that You changed the files; and
 99 | 
100 |    (c) You must retain, in the Source form of any Derivative Works
101 |        that You distribute, all copyright, patent, trademark, and
102 |        attribution notices from the Source form of the Work,
103 |        excluding those notices that do not pertain to any part of
104 |        the Derivative Works; and
105 | 
106 |    (d) If the Work includes a "NOTICE" text file as part of its
107 |        distribution, then any Derivative Works that You distribute must
108 |        include a readable copy of the attribution notices contained
109 |        within such NOTICE file, excluding those notices that do not
110 |        pertain to any part of the Derivative Works, in at least one
111 |        of the following places: within a NOTICE text file distributed
112 |        as part of the Derivative Works; within the Source form or
113 |        documentation, if provided along with the Derivative Works; or,
114 |        within a display generated by the Derivative Works, if and
115 |        wherever such third-party notices normally appear. The contents
116 |        of the NOTICE file are for informational purposes only and
117 |        do not modify the License. You may add Your own attribution
118 |        notices within Derivative Works that You distribute, alongside
119 |        or as an addendum to the NOTICE text from the Work, provided
120 |        that such additional attribution notices cannot be construed
121 |        as modifying the License.
122 | 
123 |    You may add Your own copyright statement to Your modifications and
124 |    may provide additional or different license terms and conditions
125 |    for use, reproduction, or distribution of Your modifications, or
126 |    for any such Derivative Works as a whole, provided Your use,
127 |    reproduction, and distribution of the Work otherwise complies with
128 |    the conditions stated in this License.
129 | 
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 |    any Contribution intentionally submitted for inclusion in the Work
132 |    by You to the Licensor shall be under the terms and conditions of
133 |    this License, without any additional terms or conditions.
134 |    Notwithstanding the above, nothing herein shall supersede or modify
135 |    the terms of any separate license agreement you may have executed
136 |    with Licensor regarding such Contributions.
137 | 
138 | 6. Trademarks. This License does not grant permission to use the trade
139 |    names, trademarks, service marks, or product names of the Licensor,
140 |    except as required for reasonable and customary use in describing the
141 |    origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 |    agreed to in writing, Licensor provides the Work (and each
145 |    Contributor provides its Contributions) on an "AS IS" BASIS,
146 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |    implied, including, without limitation, any warranties or conditions
148 |    of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |    PARTICULAR PURPOSE. You are solely responsible for determining the
150 |    appropriateness of using or redistributing the Work and assume any
151 |    risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 |    whether in tort (including negligence), contract, or otherwise,
155 |    unless required by applicable law (such as deliberate and grossly
156 |    negligent acts) or agreed to in writing, shall any Contributor be
157 |    liable to You for damages, including any direct, indirect, special,
158 |    incidental, or consequential damages of any character arising as a
159 |    result of this License or out of the use or inability to use the
160 |    Work (including but not limited to damages for loss of goodwill,
161 |    work stoppage, computer failure or malfunction, or any and all
162 |    other commercial damages or losses), even if such Contributor
163 |    has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 |    the Work or Derivative Works thereof, You may choose to offer,
167 |    and charge a fee for, acceptance of support, warranty, indemnity,
168 |    or other liability obligations and/or rights consistent with this
169 |    License. However, in accepting such obligations, You may act only
170 |    on Your own behalf and on Your sole responsibility, not on behalf
171 |    of any other Contributor, and only if You agree to indemnify,
172 |    defend, and hold each Contributor harmless for any liability
173 |    incurred by, or claims asserted against, such Contributor by reason
174 |    of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!)  The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | 	http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 


--------------------------------------------------------------------------------
/LICENSE-MIT:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Andrew Gallant
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | encoding_rs_io
  2 | ==============
  3 | This crate provides streaming adapters for the
  4 | [`encoding_rs`](https://github.com/hsivonen/encoding_rs)
  5 | crate. Adapters implement the standard library I/O traits and provide streaming
  6 | transcoding support.
  7 | 
  8 | [![Build status](https://github.com/BurntSushi/encoding_rs_io/workflows/ci/badge.svg)](https://github.com/BurntSushi/encoding_rs_io/actions)
  9 | [![](http://meritbadge.herokuapp.com/encoding_rs_io)](https://crates.io/crates/encoding_rs_io)
 10 | 
 11 | 
 12 | ### Documentation
 13 | 
 14 | https://docs.rs/encoding_rs_io
 15 | 
 16 | 
 17 | ### Usage
 18 | 
 19 | Add this to your `Cargo.toml`:
 20 | 
 21 | ```toml
 22 | [dependencies]
 23 | encoding_rs_io = "0.1"
 24 | ```
 25 | 
 26 | and this to your crate root:
 27 | 
 28 | ```rust
 29 | extern crate encoding_rs_io;
 30 | ```
 31 | 
 32 | 
 33 | ### Example
 34 | 
 35 | This example shows how to create a decoder that transcodes UTF-16LE (the
 36 | source, indicated by a BOM) to UTF-8 (the destination).
 37 | 
 38 | ```rust
 39 | extern crate encoding_rs;
 40 | extern crate encoding_rs_io;
 41 | 
 42 | use std::error::Error;
 43 | use std::io::Read;
 44 | 
 45 | use encoding_rs_io::DecodeReaderBytes;
 46 | 
 47 | fn main() {
 48 |     example().unwrap();
 49 | }
 50 | 
 51 | fn example() -> Result<(), Box<Error>> {
 52 |     let source_data = &b"\xFF\xFEf\x00o\x00o\x00b\x00a\x00r\x00"[..];
 53 |     // N.B. `source_data` can be any arbitrary io::Read implementation.
 54 |     let mut decoder = DecodeReaderBytes::new(source_data);
 55 | 
 56 |     let mut dest = String::new();
 57 |     // decoder implements the io::Read trait, so it can easily be plugged
 58 |     // into any consumer expecting an arbitrary reader.
 59 |     decoder.read_to_string(&mut dest)?;
 60 |     assert_eq!(dest, "foobar");
 61 |     Ok(())
 62 | }
 63 | ```
 64 | 
 65 | 
 66 | ### Future work
 67 | 
 68 | Currently, this crate only provides a way to get _possibly valid_ UTF-8 from
 69 | some source encoding. There are other transformations that may be useful that
 70 | we could include in this crate. Namely:
 71 | 
 72 | * An encoder that accepts an arbitrary `std::io::Write` implementation and
 73 |   takes valid UTF-8 and transcodes it to a selected destination encoding. This
 74 |   encoder would implement `std::fmt::Write`.
 75 | * A decoder that accepts an arbitrary `std::fmt::Write` implementation and
 76 |   takes arbitrary bytes and transcodes them from a selected source
 77 |   encoding to valid UTF-8. This decoder would implement `std::io::Write`.
 78 | * An encoder that accepts an arbitrary `UnicodeRead` implementation and
 79 |   takes valid UTF-8 and transcodes it to a selected destination encoding.
 80 |   This encoder would implement `std::io::Read`.
 81 | * A decoder that accepts an arbitrary `std::io::Read` implementation and
 82 |   takes arbitrary bytes and transcodes them from a selected source encoding
 83 |   to valid UTF-8. This decoder would implement the `UnicodeRead` trait.
 84 | 
 85 | Where `UnicodeRead` is a hypothetical trait that does not yet exist. Its
 86 | definition might look something like this:
 87 | 
 88 | ```ignore
 89 | trait UnicodeRead {
 90 |     fn read(&mut self, buf: &mut str) -> Result<usize>;
 91 | }
 92 | ```
 93 | 
 94 | Interestingly, of the above transformations, none of them correspond to
 95 | `DecodeReaderBytes`. Namely, `DecodeReaderBytes` most closely corresponds to
 96 | the last option, but instead of guaranteeing valid UTF-8 by implementing a
 97 | trait like `UnicodeRead`, it instead implements `std::io::Read`, which pushes
 98 | UTF-8 handling on to the caller. However, it turns out that this particular
 99 | use case is important for operations like search, which can often be written
100 | in a way that don't assume UTF-8 validity but still benefit from it.
101 | 
102 | It's not clear which of the above transformations is actually useful, but all
103 | of them could theoretically exist. There is more discussion on this topic
104 | here (and in particular, the above formulation was taken almost verbatim from
105 | Simon Sapin's comments): https://github.com/hsivonen/encoding_rs/issues/8
106 | 
107 | It is also perhaps worth stating that this crate very much intends on
108 | remaining coupled to `encoding_rs`, which helps restrict the scope, but may be
109 | too biased toward Web oriented encoding to solve grander encoding challenges.
110 | As such, it may very well be that this crate is actually a stepping stone to
111 | something with a larger scope. But first, we must learn.
112 | 
113 | 
114 | ### License
115 | 
116 | This project is licensed under either of
117 | 
118 |  * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
119 |    http://www.apache.org/licenses/LICENSE-2.0)
120 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or
121 |    http://opensource.org/licenses/MIT)
122 | 
123 | at your option.
124 | 


--------------------------------------------------------------------------------
/ci/script.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | set -ex
4 | 
5 | cargo build --verbose
6 | cargo doc --verbose
7 | cargo test --verbose
8 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 79
2 | use_small_heuristics = "max"
3 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | /*!
  2 | This crate provides streaming transcoding by implementing Rust's I/O traits
  3 | and delegating transcoding to the
  4 | [`encoding_rs`](https://crates.io/crates/encoding_rs)
  5 | crate.
  6 | 
  7 | Currently, this crate only provides a means of transcoding from a source
  8 | encoding (that is among the encodings supported by `encoding_rs`) to UTF-8 via
  9 | an implementation of `std::io::Read`, where errors are handled by replacing
 10 | invalid sequences with the Unicode replacement character. Future work may
 11 | provide additional implementations for `std::io::Write` and/or implementations
 12 | that make stronger guarantees about UTF-8 validity.
 13 | 
 14 | # Example
 15 | 
 16 | This example shows how to create a decoder that transcodes UTF-16LE (the
 17 | source) to UTF-8 (the destination).
 18 | 
 19 | ```
 20 | extern crate encoding_rs;
 21 | extern crate encoding_rs_io;
 22 | 
 23 | use std::error::Error;
 24 | use std::io::Read;
 25 | 
 26 | use encoding_rs_io::DecodeReaderBytes;
 27 | 
 28 | # fn main() { example().unwrap(); }
 29 | fn example() -> Result<(), Box<Error>> {
 30 |     let source_data = &b"\xFF\xFEf\x00o\x00o\x00b\x00a\x00r\x00"[..];
 31 |     // N.B. `source_data` can be any arbitrary io::Read implementation.
 32 |     let mut decoder = DecodeReaderBytes::new(source_data);
 33 | 
 34 |     let mut dest = String::new();
 35 |     // decoder implements the io::Read trait, so it can easily be plugged
 36 |     // into any consumer expecting an arbitrary reader.
 37 |     decoder.read_to_string(&mut dest)?;
 38 |     assert_eq!(dest, "foobar");
 39 |     Ok(())
 40 | }
 41 | ```
 42 | 
 43 | # Future work
 44 | 
 45 | Currently, this crate only provides a way to get _possibly valid_ UTF-8 from
 46 | some source encoding. There are other transformations that may be useful that
 47 | we could include in this crate. Namely:
 48 | 
 49 | * An encoder that accepts an arbitrary `std::io::Write` implementation and
 50 |   takes valid UTF-8 and transcodes it to a selected destination encoding. This
 51 |   encoder would implement `std::fmt::Write`.
 52 | * A decoder that accepts an arbitrary `std::fmt::Write` implementation and
 53 |   takes arbitrary bytes and transcodes them from a selected source
 54 |   encoding to valid UTF-8. This decoder would implement `std::io::Write`.
 55 | * An encoder that accepts an arbitrary `UnicodeRead` implementation and
 56 |   takes valid UTF-8 and transcodes it to a selected destination encoding.
 57 |   This encoder would implement `std::io::Read`.
 58 | * A decoder that accepts an arbitrary `std::io::Read` implementation and
 59 |   takes arbitrary bytes and transcodes them from a selected source encoding
 60 |   to valid UTF-8. This decoder would implement the `UnicodeRead` trait.
 61 | 
 62 | Where `UnicodeRead` is a hypothetical trait that does not yet exist. Its
 63 | definition might look something like this:
 64 | 
 65 | ```ignore
 66 | trait UnicodeRead {
 67 |     fn read(&mut self, buf: &mut str) -> Result<usize>;
 68 | }
 69 | ```
 70 | 
 71 | Interestingly, of the above transformations, none of them correspond to
 72 | `DecodeReaderBytes`. Namely, `DecodeReaderBytes` most closely corresponds to
 73 | the last option, but instead of guaranteeing valid UTF-8 by implementing a
 74 | trait like `UnicodeRead`, it instead implements `std::io::Read`, which pushes
 75 | UTF-8 handling on to the caller. However, it turns out that this particular
 76 | use case is important for operations like search, which can often be written
 77 | in a way that don't assume UTF-8 validity but still benefit from it.
 78 | 
 79 | It's not clear which of the above transformations is actually useful, but all
 80 | of them could theoretically exist. There is more discussion on this topic
 81 | here (and in particular, the above formulation was taken almost verbatim from
 82 | Simon Sapin's comments): https://github.com/hsivonen/encoding_rs/issues/8
 83 | 
 84 | It is also perhaps worth stating that this crate very much intends on
 85 | remaining coupled to `encoding_rs`, which helps restrict the scope, but may be
 86 | too biased toward Web oriented encoding to solve grander encoding challenges.
 87 | As such, it may very well be that this crate is actually a stepping stone to
 88 | something with a larger scope. But first, we must learn.
 89 | */
 90 | 
 91 | extern crate encoding_rs;
 92 | 
 93 | use std::fmt;
 94 | use std::io::{self, Read};
 95 | 
 96 | use encoding_rs::{Decoder, Encoding, UTF_8};
 97 | 
 98 | use util::{BomPeeker, TinyTranscoder};
 99 | 
100 | mod util;
101 | 
102 | /// A builder for constructing a byte oriented transcoder to UTF-8.
103 | #[derive(Clone, Debug)]
104 | pub struct DecodeReaderBytesBuilder {
105 |     encoding: Option<&'static Encoding>,
106 |     utf8_passthru: bool,
107 |     bom_override: bool,
108 |     strip_bom: bool,
109 |     bom_sniffing: bool,
110 | }
111 | 
112 | impl Default for DecodeReaderBytesBuilder {
113 |     fn default() -> DecodeReaderBytesBuilder {
114 |         DecodeReaderBytesBuilder::new()
115 |     }
116 | }
117 | 
118 | impl DecodeReaderBytesBuilder {
119 |     /// Create a new decoder builder with a default configuration.
120 |     ///
121 |     /// By default, no explicit encoding is used, but if a UTF-8 or UTF-16
122 |     /// BOM is detected, then an appropriate encoding is automatically
123 |     /// detected and transcoding is performed (where invalid sequences map to
124 |     /// the Unicode replacement codepoint).
125 |     pub fn new() -> DecodeReaderBytesBuilder {
126 |         DecodeReaderBytesBuilder {
127 |             encoding: None,
128 |             utf8_passthru: false,
129 |             bom_override: false,
130 |             strip_bom: false,
131 |             bom_sniffing: true,
132 |         }
133 |     }
134 | 
135 |     /// Build a new decoder that wraps the given reader.
136 |     pub fn build<R: io::Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
137 |         self.build_with_buffer(rdr, vec![0; 8 * (1 << 10)]).unwrap()
138 |     }
139 | 
140 |     /// Build a new decoder that wraps the given reader and uses the given
141 |     /// buffer internally for transcoding.
142 |     ///
143 |     /// This is useful for cases where it is advantageuous to amortize
144 |     /// allocation. Namely, this method permits reusing a buffer for
145 |     /// subsequent decoders.
146 |     ///
147 |     /// This returns an error if the buffer is smaller than 4 bytes (which is
148 |     /// too small to hold maximum size of a single UTF-8 encoded codepoint).
149 |     pub fn build_with_buffer<R: io::Read, B: AsMut<[u8]>>(
150 |         &self,
151 |         rdr: R,
152 |         mut buffer: B,
153 |     ) -> io::Result<DecodeReaderBytes<R, B>> {
154 |         if buffer.as_mut().len() < 4 {
155 |             let msg = format!(
156 |                 "DecodeReaderBytesBuilder: buffer of size {} is too small",
157 |                 buffer.as_mut().len(),
158 |             );
159 |             return Err(io::Error::new(io::ErrorKind::Other, msg));
160 |         }
161 |         let encoding =
162 |             self.encoding.map(|enc| enc.new_decoder_with_bom_removal());
163 | 
164 |         // No need to do BOM detection if we opt out of it or have an explicit
165 |         // encoding.
166 |         let has_detected =
167 |             !self.bom_sniffing || (!self.bom_override && encoding.is_some());
168 | 
169 |         let peeker = if self.strip_bom {
170 |             BomPeeker::without_bom(rdr)
171 |         } else {
172 |             BomPeeker::with_bom(rdr)
173 |         };
174 |         Ok(DecodeReaderBytes {
175 |             rdr: peeker,
176 |             decoder: encoding,
177 |             tiny: TinyTranscoder::new(),
178 |             utf8_passthru: self.utf8_passthru,
179 |             buf: buffer,
180 |             buflen: 0,
181 |             pos: 0,
182 |             has_detected: has_detected,
183 |             exhausted: false,
184 |         })
185 |     }
186 | 
187 |     /// Set an explicit encoding to be used by this decoder.
188 |     ///
189 |     /// When an explicit encoding is set, BOM sniffing is disabled and the
190 |     /// encoding provided will be used unconditionally. Errors in the encoded
191 |     /// bytes are replaced by the Unicode replacement codepoint.
192 |     ///
193 |     /// By default, no explicit encoding is set.
194 |     pub fn encoding(
195 |         &mut self,
196 |         encoding: Option<&'static Encoding>,
197 |     ) -> &mut DecodeReaderBytesBuilder {
198 |         self.encoding = encoding;
199 |         self
200 |     }
201 | 
202 |     /// Enable UTF-8 passthru, even when a UTF-8 BOM is observed.
203 |     ///
204 |     /// When an explicit encoding is not set (thereby invoking automatic
205 |     /// encoding detection via BOM sniffing), then a UTF-8 BOM will cause
206 |     /// UTF-8 transcoding to occur. In particular, if the source contains
207 |     /// invalid UTF-8 sequences, then they are replaced with the Unicode
208 |     /// replacement codepoint.
209 |     ///
210 |     /// This transcoding may not be desirable. For example, the caller may
211 |     /// already have its own UTF-8 handling where invalid UTF-8 is
212 |     /// appropriately handled, in which case, doing an extra transcoding
213 |     /// step is extra and unnecessary work. Enabling this option will prevent
214 |     /// that extra transcoding step from occurring. In this case, the bytes
215 |     /// emitted by the reader are passed through unchanged (including the BOM)
216 |     /// and the caller will be responsible for handling any invalid UTF-8.
217 |     ///
218 |     /// # Example
219 |     ///
220 |     /// This example demonstrates the effect of enabling this option on data
221 |     /// that includes a UTF-8 BOM but also, interestingly enough, subsequently
222 |     /// includes invalid UTF-8.
223 |     ///
224 |     /// ```
225 |     /// extern crate encoding_rs;
226 |     /// extern crate encoding_rs_io;
227 |     ///
228 |     /// use std::error::Error;
229 |     /// use std::io::Read;
230 |     ///
231 |     /// use encoding_rs_io::DecodeReaderBytesBuilder;
232 |     ///
233 |     /// # fn main() { example().unwrap(); }
234 |     /// fn example() -> Result<(), Box<Error>> {
235 |     ///     let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..];
236 |     ///     let mut decoder = DecodeReaderBytesBuilder::new()
237 |     ///         .utf8_passthru(true)
238 |     ///         .build(source_data);
239 |     ///
240 |     ///     let mut dest = vec![];
241 |     ///     decoder.read_to_end(&mut dest)?;
242 |     ///     // Without the passthru option, you'd get "foo\u{FFFD}bar".
243 |     ///     assert_eq!(dest, b"\xEF\xBB\xBFfoo\xFFbar");
244 |     ///     Ok(())
245 |     /// }
246 |     /// ```
247 |     pub fn utf8_passthru(
248 |         &mut self,
249 |         yes: bool,
250 |     ) -> &mut DecodeReaderBytesBuilder {
251 |         self.utf8_passthru = yes;
252 |         self
253 |     }
254 | 
255 |     /// Whether or not to always strip a BOM if one is found.
256 |     ///
257 |     /// When this is enabled, if a BOM is found at the beginning of a stream,
258 |     /// then it is ignored. This applies even when `utf8_passthru` is enabled
259 |     /// or if `bom_sniffing` is disabled.
260 |     ///
261 |     /// This is disabled by default.
262 |     ///
263 |     /// # Example
264 |     ///
265 |     /// This example shows how to remove the BOM if it's present even when
266 |     /// `utf8_passthru` is enabled.
267 |     ///
268 |     /// ```
269 |     /// extern crate encoding_rs;
270 |     /// extern crate encoding_rs_io;
271 |     ///
272 |     /// use std::error::Error;
273 |     /// use std::io::Read;
274 |     ///
275 |     /// use encoding_rs_io::DecodeReaderBytesBuilder;
276 |     ///
277 |     /// # fn main() { example().unwrap(); }
278 |     /// fn example() -> Result<(), Box<Error>> {
279 |     ///     let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..];
280 |     ///     let mut decoder = DecodeReaderBytesBuilder::new()
281 |     ///         .utf8_passthru(true)
282 |     ///         .strip_bom(true)
283 |     ///         .build(source_data);
284 |     ///
285 |     ///     let mut dest = vec![];
286 |     ///     decoder.read_to_end(&mut dest)?;
287 |     ///     // If `strip_bom` wasn't enabled, then this would include the BOM.
288 |     ///     assert_eq!(dest, b"foo\xFFbar");
289 |     ///     Ok(())
290 |     /// }
291 |     /// ```
292 |     pub fn strip_bom(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder {
293 |         self.strip_bom = yes;
294 |         self
295 |     }
296 | 
297 |     /// Give the highest precedent to the BOM, if one is found.
298 |     ///
299 |     /// When this is enabled, and if a BOM is found, then the encoding
300 |     /// indicated by that BOM is used even if an explicit encoding has been
301 |     /// set via the `encoding` method.
302 |     ///
303 |     /// This does not override `utf8_passthru`.
304 |     ///
305 |     /// This is disabled by default.
306 |     pub fn bom_override(
307 |         &mut self,
308 |         yes: bool,
309 |     ) -> &mut DecodeReaderBytesBuilder {
310 |         self.bom_override = yes;
311 |         self
312 |     }
313 | 
314 |     /// Enable BOM sniffing
315 |     ///
316 |     /// When this is enabled and an explicit encoding is not set, the decoder
317 |     /// will try to detect the encoding with BOM.
318 |     ///
319 |     /// When this is disabled and an explicit encoding is not set, the decoder
320 |     /// will treat the input as raw bytes. The bytes will be passed through
321 |     /// unchanged, including any BOM that may be present.
322 |     ///
323 |     /// This is enabled by default.
324 |     pub fn bom_sniffing(
325 |         &mut self,
326 |         yes: bool,
327 |     ) -> &mut DecodeReaderBytesBuilder {
328 |         self.bom_sniffing = yes;
329 |         self
330 |     }
331 | }
332 | 
333 | /// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
334 | /// fashion.
335 | ///
336 | /// The high level goal of this decoder is to provide access to byte streams
337 | /// that are assumed to be UTF-8 unless an encoding is otherwise specified
338 | /// (either via a BOM or via an explicit designation of an encoding).
339 | ///
340 | /// When no explicit source encoding is specified (via
341 | /// `DecodeReaderBytesBuilder`), the source encoding is determined by
342 | /// inspecting the BOM from the stream read from `R`, if one exists. If a
343 | /// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
344 | /// invalid UTF-16 sequences translated to the Unicode replacement character.
345 | /// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the
346 | /// underlying reader is passed through unchanged _as if_ it were UTF-8.
347 | ///
348 | /// Since this particular reader does not guarantee providing valid UTF-8 to
349 | /// the caller, the caller must be prepared to handle invalid UTF-8 itself.
350 | ///
351 | /// `R` is the type of the underlying reader and `B` is the type of an internal
352 | /// buffer used to store the results of transcoding. Callers may elect to reuse
353 | /// the internal buffer via the `DecodeReaderBytesBuilder::build_with_buffer`
354 | /// constructor.
355 | pub struct DecodeReaderBytes<R, B> {
356 |     /// The underlying reader, wrapped in a peeker for reading a BOM if one
357 |     /// exists.
358 |     rdr: BomPeeker<R>,
359 |     /// The underlying text decoder derived from the BOM or an explicitly
360 |     /// specified encoding, if one exists.
361 |     decoder: Option<Decoder>,
362 |     /// A "tiny transcoder" for use when a caller provides a buffer that is
363 |     /// too small to write at least one UTF-8 encoded codepoint to.
364 |     tiny: TinyTranscoder,
365 |     /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed
366 |     /// through from the underlying reader as-is instead of passing through
367 |     /// the UTF-8 transcoder (which will replace invalid sequences with the
368 |     /// REPLACEMENT CHARACTER).
369 |     utf8_passthru: bool,
370 |     /// The internal buffer to store transcoded bytes before they are read by
371 |     /// callers.
372 |     buf: B,
373 |     /// The current position in `buf`. Subsequent reads start here.
374 |     pos: usize,
375 |     /// The number of transcoded bytes in `buf`. Subsequent reads end here.
376 |     buflen: usize,
377 |     /// Whether BOM detection has been performed yet or not.
378 |     has_detected: bool,
379 |     /// Whether the underlying reader has been exhausted or not.
380 |     exhausted: bool,
381 | }
382 | 
383 | impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReaderBytes<R, B> {
384 |     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
385 |         self.detect()?;
386 |         if self.decoder.is_none() {
387 |             self.rdr.read(buf)
388 |         } else {
389 |             self.transcode(buf)
390 |         }
391 |     }
392 | }
393 | 
394 | impl<R: io::Read> DecodeReaderBytes<R, Vec<u8>> {
395 |     /// Create a new transcoder that converts a source stream to valid UTF-8
396 |     /// via BOM sniffing.
397 |     ///
398 |     /// To explicitly control the encoding, UTF-8 passthru or amortize
399 |     /// allocation, use the
400 |     /// [`DecodeReaderBytesBuilder`](struct.DecodeReaderBytesBuilder.html)
401 |     /// constructor.
402 |     ///
403 |     /// When a BOM is found (which must correspond to UTF-8, UTF-16LE or
404 |     /// UTF-16BE), then transcoding to UTF-8 is performed and any invalid
405 |     /// sequences in the source data are seamlessly replaced by the Unicode
406 |     /// replacement character.
407 |     ///
408 |     /// When no BOM is found (and no other encoding is specified via the
409 |     /// builder), the underlying bytes are passed through as-is.
410 |     pub fn new(rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
411 |         DecodeReaderBytesBuilder::new().build(rdr)
412 |     }
413 | }
414 | 
415 | impl<R: io::Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
416 |     /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there
417 |     /// is a decoder capable of transcoding the inner stream to UTF-8. This
418 |     /// returns the number of bytes written to `buf`.
419 |     ///
420 |     /// When this function returns, exactly one of the following things will
421 |     /// be true:
422 |     ///
423 |     /// 1. A non-zero number of bytes were written to `buf`.
424 |     /// 2. The underlying reader reached EOF (or `buf` is empty).
425 |     /// 3. An error is returned: the internal buffer ran out of room.
426 |     /// 4. An I/O error occurred.
427 |     fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
428 |         if self.exhausted || buf.is_empty() {
429 |             return Ok(0);
430 |         }
431 |         let nwrite = self.tiny.read(buf)?;
432 |         if nwrite > 0 {
433 |             // We could technically mush on if the caller provided buffer is
434 |             // big enough, but to keep things we simple, we satisfy the
435 |             // contract and quit.
436 |             return Ok(nwrite);
437 |         }
438 |         if self.pos >= self.buflen {
439 |             self.fill()?;
440 |         }
441 |         if buf.len() < 4 {
442 |             return self.tiny_transcode(buf);
443 |         }
444 |         loop {
445 |             let (_, nin, nout, _) =
446 |                 self.decoder.as_mut().unwrap().decode_to_utf8(
447 |                     &self.buf.as_mut()[self.pos..self.buflen],
448 |                     buf,
449 |                     false,
450 |                 );
451 |             self.pos += nin;
452 |             // If we've written at least one byte to the caller-provided
453 |             // buffer, then our mission is complete.
454 |             if nout > 0 {
455 |                 return Ok(nout);
456 |             }
457 |             // Otherwise, we know that our internal buffer has insufficient
458 |             // data to transcode at least one char, so we attempt to refill it.
459 |             self.fill()?;
460 |             // ... but quit on EOF.
461 |             if self.buflen == 0 {
462 |                 let (_, _, nout, _) = self
463 |                     .decoder
464 |                     .as_mut()
465 |                     .unwrap()
466 |                     .decode_to_utf8(&[], buf, true);
467 |                 return Ok(nout);
468 |             }
469 |         }
470 |     }
471 | 
472 |     /// Like transcode, but deals with the case where the caller provided
473 |     /// buffer is less than 4.
474 |     fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
475 |         assert!(buf.len() < 4, "have a small caller buffer");
476 |         loop {
477 |             let (nin, nout) = self.tiny.transcode(
478 |                 self.decoder.as_mut().unwrap(),
479 |                 &self.buf.as_mut()[self.pos..self.buflen],
480 |                 false,
481 |             );
482 |             self.pos += nin;
483 |             if nout > 0 {
484 |                 // We've satisfied the contract of writing at least one byte,
485 |                 // so we're done. The tiny transcoder is guaranteed to yield
486 |                 // a non-zero number of bytes.
487 |                 return self.tiny.read(buf);
488 |             }
489 |             // Otherwise, we know that our internal buffer has insufficient
490 |             // data to transcode at least one char, so we attempt to refill it.
491 |             self.fill()?;
492 |             // ... but quit on EOF.
493 |             if self.buflen == 0 {
494 |                 self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
495 |                 return self.tiny.read(buf);
496 |             }
497 |         }
498 |     }
499 | 
500 |     /// Peeks at the underlying reader to look for a BOM. If one exists, then
501 |     /// an appropriate decoder is created corresponding to the detected BOM.
502 |     fn detect(&mut self) -> io::Result<()> {
503 |         if self.has_detected {
504 |             return Ok(());
505 |         }
506 |         self.has_detected = true;
507 |         let bom = self.rdr.peek_bom()?;
508 |         if let Some(encoding) = bom.encoding() {
509 |             // If we got a UTF-8 BOM, and the decoder was configured for
510 |             // passing through UTF-8, then don't build a decoder at all.
511 |             if encoding == UTF_8 && self.utf8_passthru {
512 |                 return Ok(());
513 |             }
514 |             self.decoder = Some(encoding.new_decoder_with_bom_removal());
515 |         }
516 |         Ok(())
517 |     }
518 | 
519 |     /// Fill the internal buffer from the underlying reader.
520 |     ///
521 |     /// If there are unread bytes in the internal buffer, then we move them
522 |     /// to the beginning of the internal buffer and fill the remainder.
523 |     ///
524 |     /// If the internal buffer is too small to read additional bytes, then an
525 |     /// error is returned.
526 |     fn fill(&mut self) -> io::Result<()> {
527 |         if self.pos < self.buflen {
528 |             // Despite my best efforts, I could not seem to actually exercise
529 |             // this code path in tests. Namely, this code path occurs when the
530 |             // decoder can't make any progress and also doesn't consume all of
531 |             // the input. Since I'm not sure how to trigger that case, this
532 |             // code path is actually untested!
533 | 
534 |             // We can assert this because we require that the caller provided
535 |             // buffer be at least 4 bytes big.
536 |             assert!(
537 |                 self.buflen < self.buf.as_mut().len(),
538 |                 "internal buffer should never be exhausted"
539 |             );
540 |             let buf = self.buf.as_mut();
541 |             for (dst, src) in (self.pos..self.buflen).enumerate() {
542 |                 buf[dst] = buf[src];
543 |             }
544 |             self.buflen -= self.pos;
545 |         } else {
546 |             self.buflen = 0;
547 |         }
548 |         self.pos = 0;
549 |         self.buflen += self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
550 |         if self.buflen == 0 {
551 |             self.exhausted = true;
552 |         }
553 |         Ok(())
554 |     }
555 | }
556 | 
557 | impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
558 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
559 |         let mut fmter = f.debug_struct("DecodeReaderBytes");
560 |         fmter
561 |             .field("rdr", &self.rdr)
562 |             .field("tiny", &self.tiny)
563 |             .field("utf8_passthru", &self.utf8_passthru)
564 |             .field("buf", &self.buf)
565 |             .field("pos", &self.pos)
566 |             .field("buflen", &self.buflen)
567 |             .field("has_detected", &self.has_detected)
568 |             .field("exhausted", &self.exhausted);
569 |         // Because `encoding_rs::Decoder` doesn't impl `fmt::Debug`.
570 |         if let Some(ref d) = self.decoder {
571 |             let msg = format!("Some(<Decoder for {}>)", d.encoding().name());
572 |             fmter.field("decoder", &msg);
573 |         } else {
574 |             fmter.field("decoder", &"None");
575 |         }
576 |         fmter.finish()
577 |     }
578 | }
579 | 
580 | #[cfg(test)]
581 | mod tests {
582 |     use std::io::Read;
583 | 
584 |     use encoding_rs::{self, Encoding};
585 | 
586 |     use super::{DecodeReaderBytes, DecodeReaderBytesBuilder};
587 | 
588 |     fn read_to_string<R: Read>(mut rdr: R) -> String {
589 |         let mut s = String::new();
590 |         rdr.read_to_string(&mut s).unwrap();
591 |         s
592 |     }
593 | 
594 |     // In cases where all we have is a bom, we expect the bytes to be
595 |     // passed through unchanged.
596 |     #[test]
597 |     fn trans_utf16_bom() {
598 |         let srcbuf = vec![0xFF, 0xFE];
599 |         let mut dstbuf = vec![0; 8 * (1 << 10)];
600 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
601 |         let n = rdr.read(&mut dstbuf).unwrap();
602 |         assert_eq!(&*srcbuf, &dstbuf[..n]);
603 | 
604 |         let srcbuf = vec![0xFE, 0xFF];
605 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
606 |         let n = rdr.read(&mut dstbuf).unwrap();
607 |         assert_eq!(&*srcbuf, &dstbuf[..n]);
608 | 
609 |         let srcbuf = vec![0xEF, 0xBB, 0xBF];
610 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
611 |         let n = rdr.read(&mut dstbuf).unwrap();
612 |         assert_eq!(n, 0);
613 | 
614 |         let srcbuf = vec![0xEF, 0xBB, 0xBF];
615 |         let mut rdr = DecodeReaderBytesBuilder::new()
616 |             .utf8_passthru(true)
617 |             .build(&*srcbuf);
618 |         let n = rdr.read(&mut dstbuf).unwrap();
619 |         assert_eq!(&*srcbuf, &dstbuf[..n]);
620 |     }
621 | 
622 |     // Test basic UTF-16 decoding.
623 |     #[test]
624 |     fn trans_utf16_basic() {
625 |         let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
626 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
627 |         assert_eq!("a", read_to_string(&mut rdr));
628 | 
629 |         let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
630 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
631 |         assert_eq!("a", read_to_string(&mut rdr));
632 |     }
633 | 
634 |     #[test]
635 |     fn trans_utf16_basic_without_bom() {
636 |         let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
637 |         let mut rdr =
638 |             DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
639 |         assert_eq!("a", read_to_string(&mut rdr));
640 | 
641 |         let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
642 |         let mut rdr =
643 |             DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
644 |         assert_eq!("a", read_to_string(&mut rdr));
645 |     }
646 | 
647 |     // Test the BOM override.
648 |     #[test]
649 |     fn trans_utf16_bom_override() {
650 |         let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
651 |         let mut rdr = DecodeReaderBytesBuilder::new()
652 |             .bom_override(true)
653 |             .encoding(Some(encoding_rs::UTF_8))
654 |             .build(&*srcbuf);
655 |         assert_eq!("a", read_to_string(&mut rdr));
656 |     }
657 | 
658 |     // Test basic UTF-16 decoding with a small  buffer.
659 |     #[test]
660 |     fn trans_utf16_smallbuf() {
661 |         let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
662 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
663 |         let mut tmp = [0u8; 1];
664 | 
665 |         let nread = rdr.read(&mut tmp).unwrap();
666 |         assert_eq!(nread, 1);
667 |         assert_eq!(tmp, [b'a'; 1]);
668 | 
669 |         let nread = rdr.read(&mut tmp).unwrap();
670 |         assert_eq!(nread, 1);
671 |         assert_eq!(tmp, [b'b'; 1]);
672 | 
673 |         let nread = rdr.read(&mut tmp).unwrap();
674 |         assert_eq!(nread, 1);
675 |         assert_eq!(tmp, [b'c'; 1]);
676 | 
677 |         let nread = rdr.read(&mut tmp).unwrap();
678 |         assert_eq!(nread, 0);
679 |     }
680 | 
681 |     // Test incomplete UTF-16 decoding. This ensures we see a replacement char
682 |     // if the stream ends with an unpaired code unit.
683 |     #[test]
684 |     fn trans_utf16_incomplete() {
685 |         let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
686 |         let mut rdr = DecodeReaderBytes::new(&*srcbuf);
687 |         assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
688 |     }
689 | 
690 |     // Test transcoding with a minimal buffer but a large caller buffer.
691 |     #[test]
692 |     fn trans_utf16_minimal_buffer_normal_caller_buffer() {
693 |         #[rustfmt::skip]
694 |         let srcbuf = vec![
695 |             0xFF, 0xFE,
696 |             0x61, 0x00,
697 |             0x62, 0x00,
698 |             0x63, 0x00,
699 |             0x64, 0x00,
700 |             0x65, 0x00,
701 |             0x66, 0x00,
702 |             0x67, 0x00,
703 |             0x68, 0x00,
704 |         ];
705 |         let mut rdr = DecodeReaderBytesBuilder::new()
706 |             .build_with_buffer(&*srcbuf, vec![0; 4])
707 |             .unwrap();
708 |         let got = read_to_string(&mut rdr);
709 |         assert_eq!(got, "abcdefgh");
710 |     }
711 | 
712 |     // Test transcoding with a minimal buffer and a minimal caller buffer.
713 |     #[test]
714 |     fn trans_utf16_minimal_buffers() {
715 |         let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
716 |         let mut rdr = DecodeReaderBytesBuilder::new()
717 |             .build_with_buffer(&*srcbuf, vec![0; 4])
718 |             .unwrap();
719 |         let mut tmp = [0u8; 1];
720 | 
721 |         let nread = rdr.read(&mut tmp).unwrap();
722 |         assert_eq!(nread, 1);
723 |         assert_eq!(tmp, [b'a'; 1]);
724 | 
725 |         let nread = rdr.read(&mut tmp).unwrap();
726 |         assert_eq!(nread, 1);
727 |         assert_eq!(tmp, [b'b'; 1]);
728 | 
729 |         let nread = rdr.read(&mut tmp).unwrap();
730 |         assert_eq!(nread, 1);
731 |         assert_eq!(tmp, [b'c'; 1]);
732 | 
733 |         let nread = rdr.read(&mut tmp).unwrap();
734 |         assert_eq!(nread, 0);
735 |     }
736 | 
737 |     // Test transcoding with using byte oriented APIs.
738 |     #[test]
739 |     fn trans_utf16_byte_api() {
740 |         #[rustfmt::skip]
741 |         let srcbuf = vec![
742 |             0xFF, 0xFE,
743 |             0x61, 0x00,
744 |             0x62, 0x00,
745 |             0x63, 0x00,
746 |             0x64, 0x00,
747 |             0x65, 0x00,
748 |             0x66, 0x00,
749 |             0x67, 0x00,
750 |             0x68, 0x00,
751 |         ];
752 |         let rdr = DecodeReaderBytes::new(&*srcbuf);
753 |         let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
754 |         assert_eq!(got, b"abcdefgh");
755 |     }
756 | 
757 |     #[test]
758 |     fn trans_utf16_no_sniffing() {
759 |         #[rustfmt::skip]
760 |         let srcbuf = vec![
761 |             0xFF, 0xFE,
762 |             0x61, 0x00,
763 |         ];
764 |         let rdr = DecodeReaderBytesBuilder::new()
765 |             .bom_sniffing(false)
766 |             .build(&*srcbuf);
767 |         let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
768 |         assert_eq!(got, srcbuf);
769 |     }
770 | 
771 |     #[test]
772 |     fn trans_utf16_no_sniffing_strip_bom() {
773 |         #[rustfmt::skip]
774 |         let srcbuf = vec![
775 |             0xFF, 0xFE,
776 |             0x61, 0x00,
777 |         ];
778 |         let rdr = DecodeReaderBytesBuilder::new()
779 |             .bom_sniffing(false)
780 |             .strip_bom(true)
781 |             .build(&*srcbuf);
782 |         let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
783 |         assert_eq!(got, &[0x61, 0x00]);
784 |     }
785 | 
786 |     #[test]
787 |     fn trans_utf16_no_sniffing_encoding_override() {
788 |         #[rustfmt::skip]
789 |         let srcbuf = vec![
790 |             0xFF, 0xFE,
791 |             0x61, 0x00,
792 |         ];
793 |         let rdr = DecodeReaderBytesBuilder::new()
794 |             .bom_sniffing(false)
795 |             .encoding(Some(encoding_rs::UTF_16LE))
796 |             .build(&*srcbuf);
797 |         let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
798 |         assert_eq!(got, b"a");
799 |     }
800 | 
801 |     #[test]
802 |     fn trans_utf16_no_sniffing_encoding_override_strip_bom() {
803 |         #[rustfmt::skip]
804 |         let srcbuf = vec![
805 |             0xFF, 0xFE,
806 |             0x61, 0x00,
807 |         ];
808 |         let rdr = DecodeReaderBytesBuilder::new()
809 |             .bom_sniffing(false)
810 |             .strip_bom(true)
811 |             .encoding(Some(encoding_rs::UTF_16LE))
812 |             .build(&*srcbuf);
813 |         let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
814 |         assert_eq!(got, b"a");
815 |     }
816 | 
817 |     // Test transcoding with a minimal buffer using byte oriented APIs.
818 |     #[test]
819 |     fn trans_utf16_minimal_buffer_byte_api() {
820 |         #[rustfmt::skip]
821 |         let srcbuf = vec![
822 |             0xFF, 0xFE,
823 |             0x61, 0x00,
824 |             0x62, 0x00,
825 |             0x63, 0x00,
826 |             0x64, 0x00,
827 |             0x65, 0x00,
828 |             0x66, 0x00,
829 |             0x67, 0x00,
830 |             0x68, 0x00,
831 |         ];
832 |         let rdr = DecodeReaderBytesBuilder::new()
833 |             .build_with_buffer(&*srcbuf, vec![0; 4])
834 |             .unwrap();
835 |         let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
836 |         assert_eq!(got, b"abcdefgh");
837 |     }
838 | 
839 |     // Test a buffer that is too small.
840 |     #[test]
841 |     fn buffer_too_small() {
842 |         let res = DecodeReaderBytesBuilder::new()
843 |             .build_with_buffer(&[][..], vec![0; 3]);
844 |         assert!(res.is_err());
845 |     }
846 | 
847 |     macro_rules! test_trans_simple {
848 |         ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => {
849 |             #[test]
850 |             fn $name() {
851 |                 let srcbuf = &$srcbytes[..];
852 |                 let enc = Encoding::for_label($enc.as_bytes());
853 |                 let mut rdr = DecodeReaderBytesBuilder::new()
854 |                     .encoding(enc)
855 |                     .build(&*srcbuf);
856 |                 assert_eq!($dst, read_to_string(&mut rdr));
857 |             }
858 |         };
859 |     }
860 | 
861 |     // This isn't exhaustive obviously, but it lets us test base level support.
862 |     test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж");
863 |     test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж");
864 |     test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж");
865 |     test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж");
866 |     test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж");
867 |     test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж");
868 |     test_trans_simple!(
869 |         trans_simple_big5_hkscs,
870 |         "big5-hkscs",
871 |         b"\xC7\xFA",
872 |         "Ж"
873 |     );
874 |     test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж");
875 |     test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж");
876 |     test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж");
877 |     test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©");
878 | }
879 | 


--------------------------------------------------------------------------------
/src/util.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp;
  2 | use std::io;
  3 | 
  4 | use encoding_rs::{CoderResult, Decoder, Encoding};
  5 | 
  6 | /// This is the minimum amount of space that a decoder-to-utf8-with-replacement
  7 | /// will use for any state and any input.
  8 | const TINY_BUFFER_SIZE: usize = 7;
  9 | 
 10 | /// A tiny transcoder performs transcoding incrementally even when a caller
 11 | /// provided buffer is not large enough.
 12 | ///
 13 | /// This use case comes up when implementing streaming transcoding in cases
 14 | /// where it is permissible to provide incomplete UTF-8 sequences to the
 15 | /// caller (e.g., when decoding into a `&[u8]` where the caller must be capable
 16 | /// of handling invalid UTF-8). In particular, this type specifically handles
 17 | /// cases where a caller provided buffer is too small to store a full UTF-8
 18 | /// sequence. Thus, this type should be used in cases where the caller provided
 19 | /// buffer has length 3 or fewer.
 20 | ///
 21 | /// This could likely be done with better performance by allocating a larger
 22 | /// buffer for these cases, but we instead opt to handle this without
 23 | /// allocation under the assumption that tiny caller provided buffers are
 24 | /// probably a pathological case.
 25 | #[derive(Clone, Debug)]
 26 | pub struct TinyTranscoder {
 27 |     /// This is where we store the results of a transcoding. Since we are
 28 |     /// always decoding to UTF-8, 7 bytes is sufficient to represent any
 29 |     /// codepoint.
 30 |     partial: [u8; TINY_BUFFER_SIZE],
 31 |     /// The number of bytes written in `partial`.
 32 |     len: usize,
 33 |     /// The position in `partial` at which the next byte should be read.
 34 |     pos: usize,
 35 | }
 36 | 
 37 | impl TinyTranscoder {
 38 |     /// Create a new tiny transcoder that is ready for use.
 39 |     pub fn new() -> TinyTranscoder {
 40 |         TinyTranscoder { partial: [0; TINY_BUFFER_SIZE], len: 0, pos: 0 }
 41 |     }
 42 | 
 43 |     /// Transcode the contents of `src` into this buffer using the provided
 44 |     /// decoder, and return the number of bytes consumed in `src` and the
 45 |     /// number of bytes written to this transcoder.
 46 |     ///
 47 |     /// The results of transcoding can be read using the TinyTranscoder's
 48 |     /// `io::Read` implementation.
 49 |     ///
 50 |     /// If `last` is true, then this signals to the decoder that we've reached
 51 |     /// EOF and `src` must be empty. Otherwise, if `last` is false, then
 52 |     /// `src` must be non-empty. Violating either of these constraits will
 53 |     /// cause a panic.
 54 |     ///
 55 |     /// Finally, if this transcoder still has unconsumed bytes from a previous
 56 |     /// transcode, then this panics. Callers must consume all bytes from a
 57 |     /// previous transcoding before performing another one.
 58 |     pub fn transcode(
 59 |         &mut self,
 60 |         decoder: &mut Decoder,
 61 |         src: &[u8],
 62 |         last: bool,
 63 |     ) -> (usize, usize) {
 64 |         assert!(self.as_slice().is_empty(), "transcoder has unconsumed bytes");
 65 |         if last {
 66 |             assert!(src.is_empty(), "src must be empty when last==true");
 67 |         }
 68 |         let (res, nin, nout, _) =
 69 |             decoder.decode_to_utf8(src, &mut self.partial[..], last);
 70 |         if last {
 71 |             assert_eq!(
 72 |                 res,
 73 |                 CoderResult::InputEmpty,
 74 |                 "input should be exhausted",
 75 |             );
 76 |         }
 77 |         self.pos = 0;
 78 |         self.len = nout;
 79 |         (nin, nout)
 80 |     }
 81 | 
 82 |     /// Return the the bytes remaining to be read as a slice.
 83 |     fn as_slice(&self) -> &[u8] {
 84 |         &self.partial[self.pos..self.len]
 85 |     }
 86 | }
 87 | 
 88 | impl io::Read for TinyTranscoder {
 89 |     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
 90 |         if self.pos >= self.len {
 91 |             return Ok(0);
 92 |         }
 93 |         let mut count = 0;
 94 |         for (src, dst) in self.as_slice().iter().zip(buf) {
 95 |             *dst = *src;
 96 |             count += 1;
 97 |         }
 98 |         self.pos += count;
 99 |         Ok(count)
100 |     }
101 | }
102 | 
103 | /// `BomPeeker` wraps `R` and satisfies the `io::Read` interface while also
104 | /// providing a peek at the BOM if one exists. Peeking at the BOM does not
105 | /// advance the reader.
106 | #[derive(Debug)]
107 | pub struct BomPeeker<R> {
108 |     rdr: R,
109 |     strip: bool,
110 |     bom: Option<PossibleBom>,
111 |     nread: usize,
112 | }
113 | 
114 | impl<R: io::Read> BomPeeker<R> {
115 |     /// Create a new BomPeeker that includes the BOM in calls to `read`.
116 |     ///
117 |     /// The first three bytes can be read using the `peek_bom` method, but
118 |     /// will not advance the reader.
119 |     pub fn with_bom(rdr: R) -> BomPeeker<R> {
120 |         BomPeeker { rdr: rdr, strip: false, bom: None, nread: 0 }
121 |     }
122 | 
123 |     /// Create a new BomPeeker that never includes the BOM in calls to `read`.
124 |     pub fn without_bom(rdr: R) -> BomPeeker<R> {
125 |         BomPeeker { rdr: rdr, strip: true, bom: None, nread: 0 }
126 |     }
127 | 
128 |     /// Peek at the first three bytes of the underlying reader.
129 |     ///
130 |     /// This does not advance the reader provided by `BomPeeker`.
131 |     ///
132 |     /// If the underlying reader does not have at least two bytes available,
133 |     /// then `None` is returned.
134 |     pub fn peek_bom(&mut self) -> io::Result<PossibleBom> {
135 |         if let Some(bom) = self.bom {
136 |             return Ok(bom);
137 |         }
138 |         // If the underlying reader fails or panics, make sure we set at least
139 |         // an empty BOM so that we don't end up here again..
140 |         self.bom = Some(PossibleBom::new());
141 | 
142 |         // OK, try to read the BOM.
143 |         let mut buf = [0u8; 3];
144 |         let bom_len = read_full(&mut self.rdr, &mut buf)?;
145 |         self.bom = Some(PossibleBom { bytes: buf, len: bom_len });
146 |         Ok(self.bom.unwrap())
147 |     }
148 | }
149 | 
150 | impl<R: io::Read> io::Read for BomPeeker<R> {
151 |     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
152 |         if self.nread < 3 {
153 |             let bom = self.peek_bom()?;
154 | 
155 |             // If we don't have a valid BOM (e.g., no encoding for it), then
156 |             // we always pass through the first 3 bytes. Otherwise, if we have
157 |             // a valid BOM, we only pass it thru if we don't want to strip it.
158 |             let bom = bom.as_slice(!self.strip);
159 |             if self.nread < bom.len() {
160 |                 let rest = &bom[self.nread..];
161 |                 let len = cmp::min(buf.len(), rest.len());
162 |                 buf[..len].copy_from_slice(&rest[..len]);
163 |                 self.nread += len;
164 |                 return Ok(len);
165 |             }
166 |         }
167 |         let nread = self.rdr.read(buf)?;
168 |         self.nread += nread;
169 |         Ok(nread)
170 |     }
171 | }
172 | 
173 | /// A PossibleBom is a sequence of bytes at the beginning of a stream that
174 | /// may represent an actual BOM. To detect the BOM, this must contain at
175 | /// least 3 bytes.
176 | ///
177 | /// If this is a valid UTF-8 or UTF-16 BOM, then an encoding_rs decoder can
178 | /// be built from the BOM.
179 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
180 | pub struct PossibleBom {
181 |     bytes: [u8; 3],
182 |     len: usize,
183 | }
184 | 
185 | impl PossibleBom {
186 |     /// Build a new empty BOM.
187 |     fn new() -> PossibleBom {
188 |         PossibleBom { bytes: [0; 3], len: 0 }
189 |     }
190 | 
191 |     /// Return the BOM as a normal slice.
192 |     ///
193 |     /// If `bom` is true, then this includes any leading BOM bytes. Otherwise,
194 |     /// this only includes non-BOM bytes.
195 |     fn as_slice(&self, bom: bool) -> &[u8] {
196 |         let slice = &self.bytes[0..self.len];
197 |         if bom || slice.len() <= 1 {
198 |             slice
199 |         } else if &slice[0..2] == b"\xFF\xFE" || &slice[0..2] == b"\xFE\xFF" {
200 |             &slice[2..]
201 |         } else if slice == b"\xEF\xBB\xBF" {
202 |             &[]
203 |         } else {
204 |             slice
205 |         }
206 |     }
207 | 
208 |     /// If this is a valid UTF-8 or UTF-16 BOM, return its corresponding
209 |     /// encoding. Otherwise, return `None`.
210 |     pub fn encoding(&self) -> Option<&'static Encoding> {
211 |         let bom = self.as_slice(true);
212 |         if bom.len() < 3 {
213 |             return None;
214 |         }
215 |         if let Some((enc, _)) = Encoding::for_bom(bom) {
216 |             return Some(enc);
217 |         }
218 |         None
219 |     }
220 | }
221 | 
222 | /// Like `io::Read::read_exact`, except it never returns `UnexpectedEof` and
223 | /// instead returns the number of bytes read if EOF is seen before filling
224 | /// `buf`.
225 | pub fn read_full<R: io::Read>(
226 |     mut rdr: R,
227 |     mut buf: &mut [u8],
228 | ) -> io::Result<usize> {
229 |     let mut nread = 0;
230 |     while !buf.is_empty() {
231 |         match rdr.read(buf) {
232 |             Ok(0) => break,
233 |             Ok(n) => {
234 |                 nread += n;
235 |                 let tmp = buf;
236 |                 buf = &mut tmp[n..];
237 |             }
238 |             Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
239 |             Err(e) => return Err(e),
240 |         }
241 |     }
242 |     Ok(nread)
243 | }
244 | 
245 | #[cfg(test)]
246 | mod tests {
247 |     use super::{BomPeeker, PossibleBom, TinyTranscoder};
248 |     use encoding_rs::Encoding;
249 |     use std::io::Read;
250 | 
251 |     #[test]
252 |     fn tiny_utf16_normal() {
253 |         let enc = Encoding::for_label(b"utf-16le").unwrap();
254 |         let mut dec = enc.new_decoder_with_bom_removal();
255 |         let mut bytes = &b"f\x00o\x00o\x00b\x00a\x00r\x00b\x00a\x00z\x00"[..];
256 |         let mut tiny = TinyTranscoder::new();
257 |         let mut tmp = [0u8; 1];
258 | 
259 |         let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
260 |         assert_eq!(nin, 14);
261 |         assert_eq!(nout, 7);
262 |         bytes = &bytes[nin..];
263 | 
264 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
265 |         assert_eq!(tmp, [b'f'; 1]);
266 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
267 |         assert_eq!(tmp, [b'o'; 1]);
268 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
269 |         assert_eq!(tmp, [b'o'; 1]);
270 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
271 |         assert_eq!(tmp, [b'b'; 1]);
272 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
273 |         assert_eq!(tmp, [b'a'; 1]);
274 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
275 |         assert_eq!(tmp, [b'r'; 1]);
276 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
277 |         assert_eq!(tmp, [b'b'; 1]);
278 | 
279 |         let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
280 |         assert_eq!(nin, 4);
281 |         assert_eq!(nout, 2);
282 |         bytes = &bytes[nin..];
283 | 
284 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
285 |         assert_eq!(tmp, [b'a'; 1]);
286 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
287 |         assert_eq!(tmp, [b'z'; 1]);
288 | 
289 |         let (nin, nout) = tiny.transcode(&mut dec, bytes, true);
290 |         assert_eq!(nin, 0);
291 |         assert_eq!(nout, 0);
292 | 
293 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
294 |     }
295 | 
296 |     #[test]
297 |     fn tiny_utf16_invalid() {
298 |         let enc = Encoding::for_label(b"utf-16le").unwrap();
299 |         let mut dec = enc.new_decoder_with_bom_removal();
300 |         let mut bytes = &b"\x00"[..];
301 |         let mut tiny = TinyTranscoder::new();
302 |         let mut tmp = [0u8; 1];
303 | 
304 |         let (nin, nout) = tiny.transcode(&mut dec, bytes, false);
305 |         assert_eq!(nin, 1);
306 |         assert_eq!(nout, 0);
307 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
308 |         bytes = &bytes[nin..];
309 | 
310 |         let (nin, nout) = tiny.transcode(&mut dec, bytes, true);
311 |         assert_eq!(nin, 0);
312 |         assert_eq!(nout, 3);
313 | 
314 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
315 |         assert_eq!(tmp, [b'\xEF'; 1]);
316 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
317 |         assert_eq!(tmp, [b'\xBF'; 1]);
318 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 1);
319 |         assert_eq!(tmp, [b'\xBD'; 1]);
320 |         assert_eq!(tiny.read(&mut tmp).unwrap(), 0);
321 |     }
322 | 
323 |     #[test]
324 |     fn peeker_empty() {
325 |         let buf = [];
326 |         let mut peeker = BomPeeker::with_bom(&buf[..]);
327 |         assert_eq!(PossibleBom::new(), peeker.peek_bom().unwrap());
328 | 
329 |         let mut tmp = [0; 100];
330 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
331 |     }
332 | 
333 |     #[test]
334 |     fn peeker_one() {
335 |         let buf = [1];
336 |         let mut peeker = BomPeeker::with_bom(&buf[..]);
337 |         assert_eq!(
338 |             PossibleBom { bytes: [1, 0, 0], len: 1 },
339 |             peeker.peek_bom().unwrap()
340 |         );
341 | 
342 |         let mut tmp = [0; 100];
343 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
344 |         assert_eq!(1, tmp[0]);
345 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
346 |     }
347 | 
348 |     #[test]
349 |     fn peeker_two() {
350 |         let buf = [1, 2];
351 |         let mut peeker = BomPeeker::with_bom(&buf[..]);
352 |         assert_eq!(
353 |             PossibleBom { bytes: [1, 2, 0], len: 2 },
354 |             peeker.peek_bom().unwrap()
355 |         );
356 | 
357 |         let mut tmp = [0; 100];
358 |         assert_eq!(2, peeker.read(&mut tmp).unwrap());
359 |         assert_eq!(1, tmp[0]);
360 |         assert_eq!(2, tmp[1]);
361 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
362 |     }
363 | 
364 |     #[test]
365 |     fn peeker_three() {
366 |         let buf = [1, 2, 3];
367 |         let mut peeker = BomPeeker::with_bom(&buf[..]);
368 |         assert_eq!(
369 |             PossibleBom { bytes: [1, 2, 3], len: 3 },
370 |             peeker.peek_bom().unwrap()
371 |         );
372 | 
373 |         let mut tmp = [0; 100];
374 |         assert_eq!(3, peeker.read(&mut tmp).unwrap());
375 |         assert_eq!(1, tmp[0]);
376 |         assert_eq!(2, tmp[1]);
377 |         assert_eq!(3, tmp[2]);
378 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
379 |     }
380 | 
381 |     #[test]
382 |     fn peeker_four() {
383 |         let buf = [1, 2, 3, 4];
384 |         let mut peeker = BomPeeker::with_bom(&buf[..]);
385 |         assert_eq!(
386 |             PossibleBom { bytes: [1, 2, 3], len: 3 },
387 |             peeker.peek_bom().unwrap()
388 |         );
389 | 
390 |         let mut tmp = [0; 100];
391 |         assert_eq!(3, peeker.read(&mut tmp).unwrap());
392 |         assert_eq!(1, tmp[0]);
393 |         assert_eq!(2, tmp[1]);
394 |         assert_eq!(3, tmp[2]);
395 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
396 |         assert_eq!(4, tmp[0]);
397 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
398 |     }
399 | 
400 |     #[test]
401 |     fn peeker_one_at_a_time() {
402 |         let buf = [1, 2, 3, 4];
403 |         let mut peeker = BomPeeker::with_bom(&buf[..]);
404 | 
405 |         let mut tmp = [0; 1];
406 |         assert_eq!(0, peeker.read(&mut tmp[..0]).unwrap());
407 |         assert_eq!(0, tmp[0]);
408 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
409 |         assert_eq!(1, tmp[0]);
410 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
411 |         assert_eq!(2, tmp[0]);
412 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
413 |         assert_eq!(3, tmp[0]);
414 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
415 |         assert_eq!(4, tmp[0]);
416 |     }
417 | 
418 |     #[test]
419 |     fn peeker_without_bom() {
420 |         let buf = [b'\xEF', b'\xBB', b'\xBF', b'a'];
421 |         let mut peeker = BomPeeker::without_bom(&buf[..]);
422 |         assert_eq!(
423 |             PossibleBom { bytes: [b'\xEF', b'\xBB', b'\xBF'], len: 3 },
424 |             peeker.peek_bom().unwrap()
425 |         );
426 | 
427 |         let mut tmp = [0; 100];
428 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
429 |         assert_eq!(b'a', tmp[0]);
430 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
431 |     }
432 | 
433 |     #[test]
434 |     fn peeker_without_bom_nobom() {
435 |         let buf = [1, 2, 3, 4];
436 |         let mut peeker = BomPeeker::without_bom(&buf[..]);
437 |         assert_eq!(
438 |             PossibleBom { bytes: [1, 2, 3], len: 3 },
439 |             peeker.peek_bom().unwrap()
440 |         );
441 | 
442 |         let mut tmp = [0; 100];
443 |         assert_eq!(3, peeker.read(&mut tmp).unwrap());
444 |         assert_eq!(1, tmp[0]);
445 |         assert_eq!(2, tmp[1]);
446 |         assert_eq!(3, tmp[2]);
447 |         assert_eq!(1, peeker.read(&mut tmp).unwrap());
448 |         assert_eq!(4, tmp[0]);
449 |         assert_eq!(0, peeker.read(&mut tmp).unwrap());
450 |     }
451 | }
452 | 


--------------------------------------------------------------------------------