├── .gitignore ├── CONTRIBUTING.md ├── COPYRIGHT ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | If you send a pull request / patch, please observe the following. 2 | 3 | ## Licensing 4 | 5 | Since this crate is dual-licensed, 6 | [section 5 of the Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0#contributions) 7 | is considered to apply in the sense of Contributions being automatically 8 | under the Apache License 2.0 or MIT dual license (see the `COPYRIGHT` file). 9 | That is, by the act of offering a Contribution, you place your Contribution 10 | under the Apache License 2.0 or MIT dual license stated in the `COPYRIGHT` 11 | file. Please do not contribute if you aren't willing or allowed to license your 12 | contributions in this manner. 13 | 14 | You are encouraged to dedicate test code that you contribute to the Public 15 | Domain using the CC0 dedication. If you contribute test code that is not 16 | dedicated to the Public Domain, please be sure not to put it in a part of 17 | source code that the comments designate as being dedicated to the Public 18 | Domain. 19 | 20 | ## Copyright Notices 21 | 22 | If you require the addition of your copyright notice, it's up to you to edit in 23 | your notice as part of your Contribution. Not adding a copyright notice is 24 | taken as a waiver of copyright notice. 25 | 26 | ## Compatibility with Stable Rust 27 | 28 | Please ensure that your Contribution compiles with the latest stable-channel 29 | rustc. 30 | 31 | ## rustfmt 32 | 33 | The `rustfmt` version used for this code is `rustfmt-nightly`. 
Please either 34 | use that version or avoid using `rustfmt` (so as not to reformat all the code). 35 | 36 | ## Unit tests 37 | 38 | Please ensure that `cargo test` succeeds. 39 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | charset is copyright 2013-2016 Mozilla Foundation. 2 | 3 | Licensed under the Apache License, Version 2.0 4 | or the MIT 6 | license , 7 | at your option. All files in the project carrying such 8 | notice may not be copied, modified, or distributed except 9 | according to those terms. 10 | 11 | Test code within encoding_rs is dedicated to the Public Domain when so 12 | designated (see the individual files for PD/CC0-dedicated sections). 13 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "charset" 3 | description = "Character encoding decoding for email" 4 | version = "0.1.5" 5 | authors = ["Henri Sivonen "] 6 | edition = "2018" 7 | license = "Apache-2.0 OR MIT" 8 | readme = "README.md" 9 | documentation = "https://docs.rs/charset/" 10 | homepage = "https://docs.rs/charset/" 11 | repository = "https://github.com/hsivonen/charset" 12 | keywords = ["encoding", "email", "unicode", "charset", "utf-7"] 13 | categories = ["text-processing", "encoding", "email"] 14 | rust-version = "1.47.0" 15 | 16 | [dependencies] 17 | encoding_rs = "0.8.34" 18 | base64 = { version = "0.22.1", default-features = false } 19 | serde = { version = "1.0", optional = true } 20 | 21 | [dev-dependencies] 22 | serde_derive = "1.0" 23 | bincode = "1.3.3" 24 | serde_json = "1.0" 25 | 26 | [badges.maintenance] 27 | status = "passively-maintained" 28 | -------------------------------------------------------------------------------- /LICENSE-APACHE: 
-------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright Mozilla Foundation 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # charset 2 | 3 | [![crates.io](https://img.shields.io/crates/v/charset.svg)](https://crates.io/crates/charset) 4 | [![docs.rs](https://docs.rs/charset/badge.svg)](https://docs.rs/charset/) 5 | [![Apache-2.0 OR MIT dual-licensed](https://img.shields.io/badge/license-Apache%202%20%2F%20MIT-blue.svg)](https://github.com/hsivonen/charset/blob/master/COPYRIGHT) 6 | 7 | `charset` is a wrapper around [`encoding_rs`][1] that provides 8 | (non-streaming) decoding for character encodings that occur in _email_ by 9 | providing decoding for [UTF-7][2] in addition to the encodings defined by 10 | the [Encoding Standard][3] (and provided by `encoding_rs`). 11 | 12 | _Note:_ Do _not_ use this crate for consuming _Web_ content. For security 13 | reasons, consumers of Web content are [_prohibited_][4] from supporting 14 | UTF-7. Use `encoding_rs` directly when consuming Web content. 15 | 16 | The set of encodings consisting of UTF-7 and the encodings defined in the 17 | Encoding Standard is believed to be appropriate for consuming email, 18 | because that's the set of encodings supported by [Thunderbird][5]. 19 | Furthermore, UTF-7 support is believed to be necessary based on the 20 | experience of the Firefox OS email client. In fact, while the UTF-7 21 | implementation in this crate is independent of Thunderbird's UTF-7 22 | implementation, Thunderbird uses `encoding_rs` to decode the other 23 | encodings. 
In addition to the labels defined in the Encoding Standard, 24 | this crate recognizes additional `java.io` and `java.nio` names for 25 | compatibility with JavaMail. For UTF-7, IANA and Netscape 4.0 labels 26 | are recognized. 27 | 28 | Known compatibility limitations (known from Thunderbird bug reports): 29 | 30 | * Some ancient Usenet postings in Chinese may not be decodable, because 31 | this crate does not support HZ. 32 | * Some emails sent in Chinese by Sun's email client for CDE on Solaris 33 | around the turn of the millennium may not be decodable, because this 34 | crate does not support ISO-2022-CN. 35 | * Some emails sent in Korean by IBM/Lotus Notes may not be decodable, 36 | because this crate does not support ISO-2022-KR. 37 | 38 | This crate intentionally does not support encoding content into legacy 39 | encodings. When sending email, _always_ use UTF-8. That is, just call 40 | `.as_bytes()` on `&str` and label the content as `UTF-8`. 41 | 42 | [1]: https://crates.io/crates/encoding_rs/ 43 | [2]: https://tools.ietf.org/html/rfc2152 44 | [3]: https://encoding.spec.whatwg.org/ 45 | [4]: https://html.spec.whatwg.org/#character-encodings 46 | [5]: https://thunderbird.net/ 47 | 48 | ## Version 1.0 49 | 50 | Logically this crate should be at version 1.0, but it's not worth the hassle 51 | to do a version number semver break when there's no actual API break. The 52 | expectation is to do 1.0 when `encoding_rs` 1.0 comes along. 53 | 54 | ## Licensing 55 | 56 | Apache-2.0 OR MIT; please see the file named 57 | [COPYRIGHT](https://github.com/hsivonen/charset/blob/master/COPYRIGHT). 58 | 59 | ## API Documentation 60 | 61 | Generated [API documentation](https://docs.rs/charset/) is available 62 | online. 63 | 64 | ## Security Considerations 65 | 66 | Again, this crate is for _email_. Please do _NOT_ use it for _Web_ 67 | content. 
68 | 69 | Never try to perform any security analysis on the undecoded data in 70 | ASCII-incompatible encodings and in UTF-7 in particular. Always decode 71 | first and analyze after. UTF-7 allows even characters that don't have to 72 | be represented as base64 to be represented as base64. Also, for consistency 73 | with Thunderbird, the UTF-7 decoder in this crate allows e.g. ASCII 74 | controls to be represented without base64 encoding even when the spec 75 | says they should be base64-encoded. 76 | 77 | This implementation is non-constant-time by design. An attacker who 78 | can observe input length and the time it takes to decode it can make 79 | guesses about relative proportions of characters from different ranges. 80 | Guessing the proportion of ASCII vs. non-ASCII should be particularly 81 | feasible. 82 | 83 | ## Serde support 84 | 85 | The cargo feature `serde` enables Serde support for `Charset`. 86 | 87 | ## Minimum Rust Version 88 | 89 | The MSRV depends on the `encoding_rs` and `base64` dependencies; not on this 90 | crate. The current MSRV appears to be 1.47.0. This crate does not undergo 91 | semver bumps for `base64` semver bumps. 92 | 93 | ## Disclaimer 94 | 95 | This is a personal project. It has a Mozilla copyright notice, because 96 | I copied and pasted from encoding_rs. You should not try to read anything 97 | more into Mozilla's name appearing. 98 | 99 | ## Release Notes 100 | 101 | ### 0.1.5 102 | 103 | * Update `bincode` (dev dependency only) to 1.3.3. 104 | 105 | ### 0.1.4 106 | 107 | * Update `base64` to 0.22.1. 108 | * Update `encoding_rs` to 0.8.34. 109 | * This crate is now a `no_std` + `alloc` crate. 110 | * Added support for java.io and java.nio names to accommodate JavaMail: 111 | - ISO-8859-N series in the form iso8859_N, except 10, 11, 14 and 16 (no evidence of existing in JavaMail) and 8 (unclear if visual or logical in JavaMail if even actually sent by JavaMail). 
112 | - CJK and Thai Windows code page numbers prefixed with ms (and 950 also suffixed with _hkscs). 113 | - EUC variants (including CN, i.e. GBK) and KOI with underscore: euc_jp, euc_kr, euc_cn, koi8_r, and koi8_u. 114 | - Windows code page numbers 874, 949, 950 prefixed with x-windows-. 115 | - tis620 and iso2022jp without hyphens. 116 | * Added IANA and Netscape 4.0 aliases for UTF-7. 117 | 118 | ### 0.1.3 119 | 120 | * Update `base64` to 0.13.0. 121 | 122 | ### 0.1.2 123 | 124 | * Implemented `From<&'static Encoding>` for `Charset`. 125 | * Added optional Serde support. 126 | 127 | ### 0.1.1 128 | 129 | * Added `decode_ascii()`. 130 | * Added `decode_latin1()`. 131 | 132 | ### 0.1.0 133 | 134 | Initial release. -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | // Copyright Mozilla Foundation. See the COPYRIGHT 2 | // file at the top-level directory of this distribution. 3 | // 4 | // Licensed under the Apache License, Version 2.0 or the MIT license 6 | // , at your 7 | // option. This file may not be copied, modified, or distributed 8 | // except according to those terms. 9 | 10 | //! `charset` is a wrapper around [`encoding_rs`][1] that provides 11 | //! (non-streaming) decoding for character encodings that occur in _email_ by 12 | //! providing decoding for [UTF-7][2] in addition to the encodings defined by 13 | //! the [Encoding Standard][3] (and provided by `encoding_rs`). 14 | //! 15 | //! _Note:_ Do _not_ use this crate for consuming _Web_ content. For security 16 | //! reasons, consumers of Web content are [_prohibited_][4] from supporting 17 | //! UTF-7. Use `encoding_rs` directly when consuming Web content. 18 | //! 19 | //! The set of encodings consisting of UTF-7 and the encodings defined in the 20 | //! Encoding Standard is believed to be appropriate for consuming email, 21 | //! 
because that's the set of encodings supported by [Thunderbird][5]. 22 | //! Furthermore, UTF-7 support is believed to be necessary based on the 23 | //! experience of the Firefox OS email client. In fact, while the UTF-7 24 | //! implementation in this crate is independent of Thunderbird's UTF-7 25 | //! implementation, Thunderbird uses `encoding_rs` to decode the other 26 | //! encodings. In addition to the labels defined in the Encoding Standard, 27 | //! this crate recognizes additional `java.io` and `java.nio` names for 28 | //! compatibility with JavaMail. For UTF-7, IANA and Netscape 4.0 labels 29 | //! are recognized. 30 | //! 31 | //! Known compatibility limitations (known from Thunderbird bug reports): 32 | //! 33 | //! * Some ancient Usenet postings in Chinese may not be decodable, because 34 | //! this crate does not support HZ. 35 | //! * Some emails sent in Chinese by Sun's email client for CDE on Solaris 36 | //! around the turn of the millennium may not be decodable, because this 37 | //! crate does not support ISO-2022-CN. 38 | //! * Some emails sent in Korean by IBM/Lotus Notes may not be decodable, 39 | //! because this crate does not support ISO-2022-KR. 40 | //! 41 | //! This crate intentionally does not support encoding content into legacy 42 | //! encodings. When sending email, _always_ use UTF-8. That is, just call 43 | //! `.as_bytes()` on `&str` and label the content as `UTF-8`. 44 | //! 45 | //! [1]: https://crates.io/crates/encoding_rs/ 46 | //! [2]: https://tools.ietf.org/html/rfc2152 47 | //! [3]: https://encoding.spec.whatwg.org/ 48 | //! [4]: https://html.spec.whatwg.org/#character-encodings 49 | //! [5]: https://thunderbird.net/ 50 | //! 51 | //! # Security considerations 52 | //! 53 | //! Again, this crate is for _email_. Please do _NOT_ use it for _Web_ 54 | //! content. 55 | //! 56 | //! Never try to perform any security analysis on the undecoded data in 57 | //! ASCII-incompatible encodings and in UTF-7 in particular. 
Always decode 58 | //! first and analyze after. UTF-7 allows even characters that don't have to 59 | //! be represented as base64 to be represented as base64. Also, for consistency 60 | //! with Thunderbird, the UTF-7 decoder in this crate allows e.g. ASCII 61 | //! controls to be represented without base64 encoding even when the spec 62 | //! says they should be base64-encoded. 63 | //! 64 | //! This implementation is non-constant-time by design. An attacker who 65 | //! can observe input length and the time it takes to decode it can make 66 | //! guesses about relative proportions of characters from different ranges. 67 | //! Guessing the proportion of ASCII vs. non-ASCII should be particularly 68 | //! feasible. 69 | 70 | #![no_std] 71 | 72 | #[cfg_attr(feature = "serde", macro_use)] 73 | extern crate alloc; 74 | extern crate base64; 75 | extern crate encoding_rs; 76 | 77 | #[cfg(feature = "serde")] 78 | extern crate serde; 79 | 80 | #[cfg(all(test, feature = "serde"))] 81 | extern crate bincode; 82 | #[cfg(all(test, feature = "serde"))] 83 | #[macro_use] 84 | extern crate serde_derive; 85 | #[cfg(all(test, feature = "serde"))] 86 | extern crate serde_json; 87 | 88 | use base64::engine::general_purpose::STANDARD_NO_PAD; 89 | use base64::Engine; 90 | use encoding_rs::CoderResult; 91 | use encoding_rs::Encoding; 92 | use encoding_rs::GB18030; 93 | use encoding_rs::GBK; 94 | use encoding_rs::UTF_16BE; 95 | 96 | use alloc::borrow::Cow; 97 | use alloc::string::String; 98 | use alloc::vec::Vec; 99 | 100 | use core::cmp::Ordering; 101 | 102 | #[cfg(feature = "serde")] 103 | use serde::de::Visitor; 104 | #[cfg(feature = "serde")] 105 | use serde::{Deserialize, Deserializer, Serialize, Serializer}; 106 | 107 | /// The UTF-7 encoding. 108 | pub const UTF_7: Charset = Charset { 109 | variant: VariantCharset::Utf7, 110 | }; 111 | 112 | /// Converts bytes whose unsigned value is interpreted as Unicode code point 113 | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. 
114 | /// 115 | /// This is useful for decoding non-conforming header names such that the 116 | /// names stay unique and the decoding cannot fail (except for allocation 117 | /// failure). 118 | /// 119 | /// Borrows if input is ASCII-only. Performs a single heap allocation 120 | /// otherwise. 121 | pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { 122 | encoding_rs::mem::decode_latin1(bytes) 123 | } 124 | 125 | /// Converts ASCII to UTF-8 with non-ASCII bytes replaced with the 126 | /// REPLACEMENT CHARACTER. 127 | /// 128 | /// This can be used for strict MIME compliance when there is no declared 129 | /// encoding. 130 | /// 131 | /// Borrows if input is ASCII-only. Performs a single heap allocation 132 | /// otherwise. 133 | pub fn decode_ascii<'a>(bytes: &'a [u8]) -> Cow<'a, str> { 134 | let up_to = Encoding::ascii_valid_up_to(bytes); 135 | // >= makes later things optimize better than == 136 | if up_to >= bytes.len() { 137 | debug_assert_eq!(up_to, bytes.len()); 138 | let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) }; 139 | return Cow::Borrowed(s); 140 | } 141 | let (head, tail) = bytes.split_at(up_to); 142 | let capacity = head.len() + tail.len() * 3; 143 | let mut vec = Vec::with_capacity(capacity); 144 | vec.extend_from_slice(head); 145 | for &b in tail.iter() { 146 | if b < 0x80 { 147 | vec.push(b); 148 | } else { 149 | vec.extend_from_slice("\u{FFFD}".as_bytes()); 150 | } 151 | } 152 | Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) 153 | } 154 | 155 | /// A character encoding suitable for decoding _email_. 156 | /// 157 | /// This is either an encoding as defined in the [Encoding Standard][1] 158 | /// or UTF-7 as defined in [RFC 2152][2]. 159 | /// 160 | /// [1]: https://encoding.spec.whatwg.org/ 161 | /// [2]: https://tools.ietf.org/html/rfc2152 162 | /// 163 | /// Each `Charset` has one or more _labels_ that are used to identify 164 | /// the `Charset` in protocol text. 
In MIME/IANA terminology, these are 165 | /// called _names_ and _aliases_, but for consistency with the Encoding 166 | /// Standard and the encoding_rs crate, they are called labels in this 167 | /// crate. What this crate calls the _name_ (again, for consistency 168 | /// with the Encoding Standard and the encoding_rs crate) is known as 169 | /// _preferred name_ in MIME/IANA terminology. 170 | /// 171 | /// Instances of `Charset` can be compared with `==`. `Charset` is 172 | /// `Copy` and is meant to be passed by value. 173 | /// 174 | /// _Note:_ It is wrong to use this for decoding Web content. Use 175 | /// `encoding_rs::Encoding` instead! 176 | #[derive(PartialEq, Debug, Copy, Clone, Hash)] 177 | pub struct Charset { 178 | variant: VariantCharset, 179 | } 180 | 181 | impl Charset { 182 | /// Implements the 183 | /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get) 184 | /// algorithm with the label "UTF-7" added to the set of labels recognized. 185 | /// GBK is unified with gb18030, since they decode the same and `Charset` 186 | /// only supports decoding. 187 | /// 188 | /// If, after ASCII-lowercasing and removing leading and trailing 189 | /// whitespace, the argument matches a label defined in the Encoding 190 | /// Standard or "utf-7", `Some(Charset)` representing the corresponding 191 | /// encoding is returned. If there is no match, `None` is returned. 192 | /// 193 | /// This is the right method to use if the action upon the method returning 194 | /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead. 195 | /// When the action upon the method returning `None` is not to proceed with 196 | /// a fallback but to refuse processing, `for_label_no_replacement()` is more 197 | /// appropriate. 198 | /// 199 | /// The argument is of type `&[u8]` instead of `&str` to save callers 200 | /// that are extracting the label from a non-UTF-8 protocol the trouble 201 | /// of conversion to UTF-8. 
(If you have a `&str`, just call `.as_bytes()` 202 | /// on it.) 203 | #[inline] 204 | pub fn for_label(label: &[u8]) -> Option { 205 | if let Some(encoding) = Encoding::for_label(label) { 206 | Some(Charset::for_encoding(encoding)) 207 | } else if let Some(variant_charset) = for_label_extended(label) { 208 | Some(Charset { 209 | variant: variant_charset, 210 | }) 211 | } else { 212 | None 213 | } 214 | } 215 | 216 | /// This method behaves the same as `for_label()`, except when `for_label()` 217 | /// would return `Some(Charset::for_encoding(encoding_rs::REPLACEMENT))`, 218 | /// this method returns `None` instead. 219 | /// 220 | /// This method is useful in scenarios where a fatal error is required 221 | /// upon invalid label, because in those cases the caller typically wishes 222 | /// to treat the labels that map to the replacement encoding as fatal 223 | /// errors, too. 224 | /// 225 | /// It is not OK to use this method when the action upon the method returning 226 | /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) with `text/html` 227 | /// email. In such a case, the `for_label()` method should be used instead in 228 | /// order to avoid unsafe fallback for labels that `for_label()` maps to 229 | /// `Some(REPLACEMENT)`. Such fallback might be safe, though not particularly 230 | /// useful for `text/plain` email, though. 231 | #[inline] 232 | pub fn for_label_no_replacement(label: &[u8]) -> Option { 233 | if let Some(encoding) = Encoding::for_label_no_replacement(label) { 234 | Some(Charset::for_encoding(encoding)) 235 | } else if let Some(variant_charset) = for_label_extended(label) { 236 | Some(Charset { 237 | variant: variant_charset, 238 | }) 239 | } else { 240 | None 241 | } 242 | } 243 | 244 | /// Returns the `Charset` corresponding to an `&'static Encoding`. 245 | /// 246 | /// `GBK` is unified with `GB18030`, since those two decode the same 247 | /// and `Charset` only supports decoding. 
248 | #[inline] 249 | pub fn for_encoding(encoding: &'static Encoding) -> Charset { 250 | let enc = if encoding == GBK { GB18030 } else { encoding }; 251 | Charset { 252 | variant: VariantCharset::Encoding(enc), 253 | } 254 | } 255 | 256 | /// Performs non-incremental BOM sniffing. 257 | /// 258 | /// The argument must either be a buffer representing the entire input 259 | /// stream (non-streaming case) or a buffer representing at least the first 260 | /// three bytes of the input stream (streaming case). 261 | /// 262 | /// Returns `Some((Charset::for_encoding(encoding_rs::UTF_8), 3))`, 263 | /// `Some((Charset::for_encoding(encoding_rs::UTF_16LE), 2))` or 264 | /// `Some((Charset::for_encoding(encoding_rs::UTF_16BE), 2))` if the 265 | /// argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or `None` 266 | /// otherwise. 267 | #[inline] 268 | pub fn for_bom(buffer: &[u8]) -> Option<(Charset, usize)> { 269 | if let Some((encoding, length)) = Encoding::for_bom(buffer) { 270 | Some((Charset::for_encoding(encoding), length)) 271 | } else { 272 | None 273 | } 274 | } 275 | 276 | /// Returns the name of this encoding. 277 | /// 278 | /// Mostly useful for debugging 279 | pub fn name(self) -> &'static str { 280 | match self.variant { 281 | VariantCharset::Encoding(encoding) => encoding.name(), 282 | VariantCharset::Utf7 => "UTF-7", 283 | } 284 | } 285 | 286 | /// Checks whether the bytes 0x00...0x7F map exclusively to the characters 287 | /// U+0000...U+007F and vice versa. 288 | #[inline] 289 | pub fn is_ascii_compatible(self) -> bool { 290 | match self.variant { 291 | VariantCharset::Encoding(encoding) => encoding.is_ascii_compatible(), 292 | VariantCharset::Utf7 => false, 293 | } 294 | } 295 | 296 | /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with 297 | /// malformed sequences replaced with the REPLACEMENT CHARACTER when the 298 | /// entire input is available as a single buffer (i.e. 
the end of the 299 | /// buffer marks the end of the stream). 300 | /// 301 | /// This method implements the (non-streaming version of) the 302 | /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept. 303 | /// 304 | /// The second item in the returned tuple is the encoding that was actually 305 | /// used (which may differ from this encoding thanks to BOM sniffing). 306 | /// 307 | /// The third item in the returned tuple indicates whether there were 308 | /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 309 | /// 310 | /// _Note:_ It is wrong to use this when the input buffer represents only 311 | /// a segment of the input instead of the whole input. 312 | /// 313 | /// # Panics 314 | /// 315 | /// If the size calculation for a heap-allocated backing buffer overflows 316 | /// `usize`. 317 | #[inline] 318 | pub fn decode<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, Charset, bool) { 319 | let (charset, without_bom) = match Charset::for_bom(bytes) { 320 | Some((charset, bom_length)) => (charset, &bytes[bom_length..]), 321 | None => (self, bytes), 322 | }; 323 | let (cow, had_errors) = charset.decode_without_bom_handling(without_bom); 324 | (cow, charset, had_errors) 325 | } 326 | 327 | /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with 328 | /// malformed sequences replaced with the REPLACEMENT CHARACTER when the 329 | /// entire input is available as a single buffer (i.e. the end of the 330 | /// buffer marks the end of the stream). 331 | /// 332 | /// When invoked on `UTF_8`, this method implements the (non-streaming 333 | /// version of) the 334 | /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec 335 | /// concept. 336 | /// 337 | /// The second item in the returned pair indicates whether there were 338 | /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 
339 | /// 340 | /// _Note:_ It is wrong to use this when the input buffer represents only 341 | /// a segment of the input instead of the whole input. 342 | /// 343 | /// # Panics 344 | /// 345 | /// If the size calculation for a heap-allocated backing buffer overflows 346 | /// `usize`. 347 | #[inline] 348 | pub fn decode_with_bom_removal<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) { 349 | match self.variant { 350 | VariantCharset::Encoding(encoding) => encoding.decode_with_bom_removal(bytes), 351 | VariantCharset::Utf7 => decode_utf7(bytes), 352 | } 353 | } 354 | 355 | /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and 356 | /// with malformed sequences replaced with the REPLACEMENT CHARACTER when 357 | /// the entire input is available as a single buffer (i.e. the end of the 358 | /// buffer marks the end of the stream). 359 | /// 360 | /// When invoked on `UTF_8`, this method implements the (non-streaming 361 | /// version of) the 362 | /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) 363 | /// spec concept. 364 | /// 365 | /// The second item in the returned pair indicates whether there were 366 | /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER). 367 | /// 368 | /// _Note:_ It is wrong to use this when the input buffer represents only 369 | /// a segment of the input instead of the whole input. 370 | /// 371 | /// # Panics 372 | /// 373 | /// If the size calculation for a heap-allocated backing buffer overflows 374 | /// `usize`. 
375 | #[inline] 376 | pub fn decode_without_bom_handling<'a>(self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) { 377 | match self.variant { 378 | VariantCharset::Encoding(encoding) => encoding.decode_without_bom_handling(bytes), 379 | VariantCharset::Utf7 => decode_utf7(bytes), 380 | } 381 | } 382 | } 383 | 384 | impl From<&'static Encoding> for Charset { 385 | fn from(encoding: &'static Encoding) -> Self { 386 | Charset::for_encoding(encoding) 387 | } 388 | } 389 | 390 | #[cfg(feature = "serde")] 391 | impl Serialize for Charset { 392 | #[inline] 393 | fn serialize(&self, serializer: S) -> Result 394 | where 395 | S: Serializer, 396 | { 397 | serializer.serialize_str(self.name()) 398 | } 399 | } 400 | 401 | #[cfg(feature = "serde")] 402 | struct CharsetVisitor; 403 | 404 | #[cfg(feature = "serde")] 405 | impl<'de> Visitor<'de> for CharsetVisitor { 406 | type Value = Charset; 407 | 408 | fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result { 409 | formatter.write_str("a valid charset label") 410 | } 411 | 412 | fn visit_str(self, value: &str) -> Result 413 | where 414 | E: serde::de::Error, 415 | { 416 | if let Some(charset) = Charset::for_label(value.as_bytes()) { 417 | Ok(charset) 418 | } else { 419 | Err(E::custom(format!("invalid charset label: {}", value))) 420 | } 421 | } 422 | } 423 | 424 | #[cfg(feature = "serde")] 425 | impl<'de> Deserialize<'de> for Charset { 426 | fn deserialize(deserializer: D) -> Result 427 | where 428 | D: Deserializer<'de>, 429 | { 430 | deserializer.deserialize_str(CharsetVisitor) 431 | } 432 | } 433 | 434 | static LABELS_SORTED: [&'static str; 29] = [ 435 | "ms950", 436 | "ms874", 437 | "ms936", 438 | "utf-7", 439 | "ms949", 440 | "tis620", 441 | "euc_cn", 442 | "euc_jp", 443 | "koi8_r", 444 | "euc_kr", 445 | "koi8_u", 446 | "iso8859_1", 447 | "iso8859_2", 448 | "iso8859_3", 449 | "iso8859_4", 450 | "iso8859_5", 451 | "iso8859_6", 452 | "iso8859_7", 453 | "iso8859_9", 454 | "iso2022jp", 455 | "iso8859_13", 
456 | "iso8859_15", 457 | "ms950_hkscs", 458 | "x-windows-950", 459 | "x-windows-874", 460 | "x-windows-949", 461 | "csunicode11utf7", 462 | "unicode-1-1-utf-7", 463 | "x-unicode-2-0-utf-7", 464 | ]; 465 | 466 | static ENCODINGS_IN_LABEL_SORT: [VariantCharset; 29] = [ 467 | VariantCharset::Encoding(&encoding_rs::BIG5_INIT), 468 | VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), 469 | VariantCharset::Encoding(&encoding_rs::GB18030_INIT), 470 | VariantCharset::Utf7, 471 | VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), 472 | VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), 473 | VariantCharset::Encoding(&encoding_rs::GB18030_INIT), 474 | VariantCharset::Encoding(&encoding_rs::EUC_JP_INIT), 475 | VariantCharset::Encoding(&encoding_rs::KOI8_R_INIT), 476 | VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), 477 | VariantCharset::Encoding(&encoding_rs::KOI8_U_INIT), 478 | VariantCharset::Encoding(&encoding_rs::WINDOWS_1252_INIT), 479 | VariantCharset::Encoding(&encoding_rs::ISO_8859_2_INIT), 480 | VariantCharset::Encoding(&encoding_rs::ISO_8859_3_INIT), 481 | VariantCharset::Encoding(&encoding_rs::ISO_8859_4_INIT), 482 | VariantCharset::Encoding(&encoding_rs::ISO_8859_5_INIT), 483 | VariantCharset::Encoding(&encoding_rs::ISO_8859_6_INIT), 484 | VariantCharset::Encoding(&encoding_rs::ISO_8859_7_INIT), 485 | VariantCharset::Encoding(&encoding_rs::WINDOWS_1254_INIT), 486 | VariantCharset::Encoding(&encoding_rs::ISO_2022_JP_INIT), 487 | VariantCharset::Encoding(&encoding_rs::ISO_8859_13_INIT), 488 | VariantCharset::Encoding(&encoding_rs::ISO_8859_15_INIT), 489 | VariantCharset::Encoding(&encoding_rs::BIG5_INIT), 490 | VariantCharset::Encoding(&encoding_rs::BIG5_INIT), 491 | VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), 492 | VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), 493 | VariantCharset::Utf7, 494 | VariantCharset::Utf7, 495 | VariantCharset::Utf7, 496 | ]; 497 | 498 | const LONGEST_LABEL_LENGTH: usize = 19; // 
x-unicode-2-0-utf-7 499 | 500 | /// Copypaste from encoding_rs to search over the labels known to this 501 | /// crate but not encoding_rs. 502 | #[inline(never)] 503 | fn for_label_extended(label: &[u8]) -> Option { 504 | let mut trimmed = [0u8; LONGEST_LABEL_LENGTH]; 505 | let mut trimmed_pos = 0usize; 506 | let mut iter = label.into_iter(); 507 | // before 508 | loop { 509 | match iter.next() { 510 | None => { 511 | return None; 512 | } 513 | Some(byte) => { 514 | // The characters used in labels are: 515 | // a-z (except q, but excluding it below seems excessive) 516 | // 0-9 517 | // . _ - : 518 | match *byte { 519 | 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => { 520 | continue; 521 | } 522 | b'A'..=b'Z' => { 523 | trimmed[trimmed_pos] = *byte + 0x20u8; 524 | trimmed_pos = 1usize; 525 | break; 526 | } 527 | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => { 528 | trimmed[trimmed_pos] = *byte; 529 | trimmed_pos = 1usize; 530 | break; 531 | } 532 | _ => { 533 | return None; 534 | } 535 | } 536 | } 537 | } 538 | } 539 | // inside 540 | loop { 541 | match iter.next() { 542 | None => { 543 | break; 544 | } 545 | Some(byte) => { 546 | match *byte { 547 | 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => { 548 | break; 549 | } 550 | b'A'..=b'Z' => { 551 | if trimmed_pos == LONGEST_LABEL_LENGTH { 552 | // There's no encoding with a label this long 553 | return None; 554 | } 555 | trimmed[trimmed_pos] = *byte + 0x20u8; 556 | trimmed_pos += 1usize; 557 | continue; 558 | } 559 | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' 
=> { 560 | if trimmed_pos == LONGEST_LABEL_LENGTH { 561 | // There's no encoding with a label this long 562 | return None; 563 | } 564 | trimmed[trimmed_pos] = *byte; 565 | trimmed_pos += 1usize; 566 | continue; 567 | } 568 | _ => { 569 | return None; 570 | } 571 | } 572 | } 573 | } 574 | } 575 | // after 576 | loop { 577 | match iter.next() { 578 | None => { 579 | break; 580 | } 581 | Some(byte) => { 582 | match *byte { 583 | 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => { 584 | continue; 585 | } 586 | _ => { 587 | // There's no label with space in the middle 588 | return None; 589 | } 590 | } 591 | } 592 | } 593 | } 594 | let candidate = &trimmed[..trimmed_pos]; 595 | match LABELS_SORTED.binary_search_by(|probe| { 596 | let bytes = probe.as_bytes(); 597 | let c = bytes.len().cmp(&candidate.len()); 598 | if c != Ordering::Equal { 599 | return c; 600 | } 601 | let probe_iter = bytes.iter().rev(); 602 | let candidate_iter = candidate.iter().rev(); 603 | probe_iter.cmp(candidate_iter) 604 | }) { 605 | Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]), 606 | Err(_) => None, 607 | } 608 | } 609 | 610 | #[inline] 611 | fn utf7_ascii_up_to(bytes: &[u8]) -> usize { 612 | for (i, &byte) in bytes.into_iter().enumerate() { 613 | if byte == b'+' || byte >= 0x80 { 614 | return i; 615 | } 616 | } 617 | bytes.len() 618 | } 619 | 620 | #[inline] 621 | fn utf7_base64_up_to(bytes: &[u8]) -> usize { 622 | for (i, &byte) in bytes.into_iter().enumerate() { 623 | match byte { 624 | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'+' | b'/' => {} 625 | _ => { 626 | return i; 627 | } 628 | } 629 | } 630 | bytes.len() 631 | } 632 | 633 | #[inline] 634 | fn utf7_base64_decode(bytes: &[u8], string: &mut String) -> bool { 635 | // The intermediate buffer should be long enough to fit a line 636 | // of 80 base64 bytes and should also be a multiple of 3. 
This 637 | // way, normal email lines will be handled in one go, but 638 | // longer sequences won't get split between base64 groups of 639 | // 4 input / 3 output bytes. 640 | let mut decoder = UTF_16BE.new_decoder_without_bom_handling(); 641 | let mut buf = [0u8; 60]; 642 | let mut tail = bytes; 643 | let mut had_errors = false; 644 | let mut trailing_error = false; 645 | loop { 646 | let (last, mut cap) = if tail.len() <= 80 { 647 | (true, tail.len()) 648 | } else { 649 | (false, 80) 650 | }; 651 | let len; 652 | loop { 653 | match STANDARD_NO_PAD.decode_slice(&tail[..cap], &mut buf[..]) { 654 | Ok(l) => { 655 | len = l; 656 | break; 657 | } 658 | Err(_) => { 659 | assert!(last); 660 | had_errors = true; 661 | trailing_error = true; 662 | tail = &tail[..tail.len() - 1]; 663 | cap -= 1; 664 | } 665 | } 666 | } 667 | let mut total_read = 0; 668 | loop { 669 | let (result, read, err) = decoder.decode_to_string(&buf[total_read..len], string, last); 670 | total_read += read; 671 | had_errors |= err; 672 | match result { 673 | CoderResult::InputEmpty => { 674 | if last { 675 | if trailing_error { 676 | string.push_str("\u{FFFD}"); 677 | } 678 | return had_errors; 679 | } 680 | break; 681 | } 682 | CoderResult::OutputFull => { 683 | let left = len - total_read; 684 | let needed = decoder.max_utf8_buffer_length(left).unwrap(); 685 | string.reserve(needed); 686 | } 687 | } 688 | } 689 | tail = &tail[80..]; 690 | } 691 | } 692 | 693 | #[inline(never)] 694 | fn decode_utf7<'a>(bytes: &'a [u8]) -> (Cow<'a, str>, bool) { 695 | let up_to = utf7_ascii_up_to(bytes); 696 | if up_to == bytes.len() { 697 | let s: &str = unsafe { core::str::from_utf8_unchecked(bytes) }; 698 | return (Cow::Borrowed(s), false); 699 | } 700 | let mut had_errors = false; 701 | let mut out = String::with_capacity(bytes.len()); 702 | out.push_str(unsafe { core::str::from_utf8_unchecked(&bytes[..up_to]) }); 703 | 704 | let mut tail = &bytes[up_to..]; 705 | loop { 706 | // `tail[0]` is now either a plus 
sign or non-ASCII 707 | let first = tail[0]; 708 | tail = &tail[1..]; 709 | if first == b'+' { 710 | let up_to = utf7_base64_up_to(tail); 711 | had_errors |= utf7_base64_decode(&tail[..up_to], &mut out); 712 | if up_to == tail.len() { 713 | if up_to == 0 { 714 | // Plus sign didn't start a base64 run and also 715 | // wasn't followed by a minus. 716 | had_errors = true; 717 | out.push_str("\u{FFFD}"); 718 | } 719 | return (Cow::Owned(out), had_errors); 720 | } 721 | if up_to == 0 { 722 | if tail[up_to] == b'-' { 723 | // There was no base64 data between 724 | // plus and minus, so we had the sequence 725 | // meaning the plus sign itself. 726 | out.push_str("+"); 727 | tail = &tail[up_to + 1..]; 728 | } else { 729 | // Plus sign didn't start a base64 run and also 730 | // wasn't followed by a minus. 731 | had_errors = true; 732 | out.push_str("\u{FFFD}"); 733 | } 734 | } else if tail[up_to] == b'-' { 735 | tail = &tail[up_to + 1..]; 736 | } else { 737 | tail = &tail[up_to..]; 738 | } 739 | } else { 740 | had_errors = true; 741 | out.push_str("\u{FFFD}"); 742 | } 743 | let up_to = utf7_ascii_up_to(tail); 744 | out.push_str(unsafe { core::str::from_utf8_unchecked(&tail[..up_to]) }); 745 | if up_to == tail.len() { 746 | return (Cow::Owned(out), had_errors); 747 | } 748 | tail = &tail[up_to..]; 749 | } 750 | } 751 | 752 | #[derive(PartialEq, Debug, Copy, Clone, Hash)] 753 | enum VariantCharset { 754 | Utf7, 755 | Encoding(&'static Encoding), 756 | } 757 | 758 | #[cfg(all(test, feature = "serde"))] 759 | #[derive(Serialize, Deserialize, Debug, PartialEq)] 760 | struct Demo { 761 | num: u32, 762 | name: String, 763 | charset: Charset, 764 | } 765 | 766 | #[cfg(test)] 767 | mod tests { 768 | use super::*; 769 | 770 | fn utf7_no_err(bytes: &[u8]) -> String { 771 | let (cow, had_errors) = UTF_7.decode_without_bom_handling(bytes); 772 | assert!(!had_errors); 773 | cow.into() 774 | } 775 | 776 | fn utf7_err(bytes: &[u8]) -> String { 777 | let (cow, had_errors) = 
UTF_7.decode_without_bom_handling(bytes); 778 | assert!(had_errors); 779 | cow.into() 780 | } 781 | 782 | // Any copyright to the test code below this comment is dedicated to the 783 | // Public Domain. https://creativecommons.org/publicdomain/zero/1.0/ 784 | 785 | #[test] 786 | fn test_for_label() { 787 | assert_eq!(Charset::for_label(b" uTf-7\t "), Some(UTF_7)); 788 | assert_eq!( 789 | Charset::for_label(b" uTf-8\t "), 790 | Some(Charset::for_encoding(encoding_rs::UTF_8)) 791 | ); 792 | assert_eq!( 793 | Charset::for_label(b" iSo-8859-1\t "), 794 | Some(Charset::for_encoding(encoding_rs::WINDOWS_1252)) 795 | ); 796 | assert_eq!( 797 | Charset::for_label(b" gb2312\t "), 798 | Some(Charset::for_encoding(encoding_rs::GB18030)) 799 | ); 800 | assert_eq!( 801 | Charset::for_label(b" ISO-2022-KR\t "), 802 | Some(Charset::for_encoding(encoding_rs::REPLACEMENT)) 803 | ); 804 | 805 | assert_eq!(Charset::for_label(b"u"), None); 806 | assert_eq!(Charset::for_label(b"ut"), None); 807 | assert_eq!(Charset::for_label(b"utf"), None); 808 | assert_eq!(Charset::for_label(b"utf-"), None); 809 | } 810 | 811 | #[test] 812 | fn test_for_label_no_replacement() { 813 | assert_eq!( 814 | Charset::for_label_no_replacement(b" uTf-7\t "), 815 | Some(UTF_7) 816 | ); 817 | assert_eq!( 818 | Charset::for_label_no_replacement(b" uTf-8\t "), 819 | Some(Charset::for_encoding(encoding_rs::UTF_8)) 820 | ); 821 | assert_eq!( 822 | Charset::for_label_no_replacement(b" iSo-8859-1\t "), 823 | Some(Charset::for_encoding(encoding_rs::WINDOWS_1252)) 824 | ); 825 | assert_eq!( 826 | Charset::for_label_no_replacement(b" Gb2312\t "), 827 | Some(Charset::for_encoding(encoding_rs::GB18030)) 828 | ); 829 | assert_eq!(Charset::for_label_no_replacement(b" ISO-2022-KR\t "), None); 830 | 831 | assert_eq!(Charset::for_label_no_replacement(b"u"), None); 832 | assert_eq!(Charset::for_label_no_replacement(b"ut"), None); 833 | assert_eq!(Charset::for_label_no_replacement(b"utf"), None); 834 | 
assert_eq!(Charset::for_label_no_replacement(b"utf-"), None); 835 | } 836 | 837 | #[test] 838 | fn test_for_label_and_name() { 839 | assert_eq!(Charset::for_label(b" uTf-7\t ").unwrap().name(), "UTF-7"); 840 | assert_eq!(Charset::for_label(b" uTf-8\t ").unwrap().name(), "UTF-8"); 841 | assert_eq!( 842 | Charset::for_label(b" Gb2312\t ").unwrap().name(), 843 | "gb18030" 844 | ); 845 | } 846 | 847 | #[test] 848 | fn test_extended_labels() { 849 | let cases: [(&'static str, VariantCharset); 29] = [ 850 | ( 851 | "iso8859_1", 852 | VariantCharset::Encoding(&encoding_rs::WINDOWS_1252_INIT), 853 | ), 854 | ( 855 | "iso8859_2", 856 | VariantCharset::Encoding(&encoding_rs::ISO_8859_2_INIT), 857 | ), 858 | ( 859 | "iso8859_3", 860 | VariantCharset::Encoding(&encoding_rs::ISO_8859_3_INIT), 861 | ), 862 | ( 863 | "iso8859_4", 864 | VariantCharset::Encoding(&encoding_rs::ISO_8859_4_INIT), 865 | ), 866 | ( 867 | "iso8859_5", 868 | VariantCharset::Encoding(&encoding_rs::ISO_8859_5_INIT), 869 | ), 870 | ( 871 | "iso8859_6", 872 | VariantCharset::Encoding(&encoding_rs::ISO_8859_6_INIT), 873 | ), 874 | ( 875 | "iso8859_7", 876 | VariantCharset::Encoding(&encoding_rs::ISO_8859_7_INIT), 877 | ), 878 | ( 879 | "iso8859_9", 880 | VariantCharset::Encoding(&encoding_rs::WINDOWS_1254_INIT), 881 | ), 882 | ( 883 | "iso8859_13", 884 | VariantCharset::Encoding(&encoding_rs::ISO_8859_13_INIT), 885 | ), 886 | ( 887 | "iso8859_15", 888 | VariantCharset::Encoding(&encoding_rs::ISO_8859_15_INIT), 889 | ), 890 | ( 891 | "ms936", 892 | VariantCharset::Encoding(&encoding_rs::GB18030_INIT), 893 | ), 894 | ("ms949", VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT)), 895 | ("ms950", VariantCharset::Encoding(&encoding_rs::BIG5_INIT)), 896 | ( 897 | "ms950_hkscs", 898 | VariantCharset::Encoding(&encoding_rs::BIG5_INIT), 899 | ), 900 | ( 901 | "ms874", 902 | VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), 903 | ), 904 | ( 905 | "euc_jp", 906 | 
VariantCharset::Encoding(&encoding_rs::EUC_JP_INIT), 907 | ), 908 | ( 909 | "euc_kr", 910 | VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), 911 | ), 912 | ( 913 | "euc_cn", 914 | VariantCharset::Encoding(&encoding_rs::GB18030_INIT), 915 | ), 916 | ( 917 | "koi8_r", 918 | VariantCharset::Encoding(&encoding_rs::KOI8_R_INIT), 919 | ), 920 | ( 921 | "koi8_u", 922 | VariantCharset::Encoding(&encoding_rs::KOI8_U_INIT), 923 | ), 924 | ( 925 | "x-windows-874", 926 | VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), 927 | ), 928 | ( 929 | "x-windows-949", 930 | VariantCharset::Encoding(&encoding_rs::EUC_KR_INIT), 931 | ), 932 | ( 933 | "x-windows-950", 934 | VariantCharset::Encoding(&encoding_rs::BIG5_INIT), 935 | ), 936 | ( 937 | "tis620", 938 | VariantCharset::Encoding(&encoding_rs::WINDOWS_874_INIT), 939 | ), 940 | ( 941 | "iso2022jp", 942 | VariantCharset::Encoding(&encoding_rs::ISO_2022_JP_INIT), 943 | ), 944 | ("x-unicode-2-0-utf-7", VariantCharset::Utf7), // Netscape 4.0 per https://jkorpela.fi/chars.html 945 | ("unicode-1-1-utf-7", VariantCharset::Utf7), // https://www.iana.org/assignments/character-sets/character-sets.xhtml 946 | ("csunicode11utf7", VariantCharset::Utf7), // https://www.iana.org/assignments/character-sets/character-sets.xhtml 947 | ("utf-7", VariantCharset::Utf7), 948 | ]; 949 | for (label, expected) in cases.iter() { 950 | assert_eq!( 951 | Charset::for_label(label.as_bytes()), 952 | Some(Charset { variant: *expected }) 953 | ); 954 | } 955 | } 956 | 957 | #[test] 958 | fn test_utf7_decode() { 959 | assert_eq!(utf7_no_err(b""), ""); 960 | assert_eq!(utf7_no_err(b"ab"), "ab"); 961 | assert_eq!(utf7_no_err(b"+-"), "+"); 962 | assert_eq!(utf7_no_err(b"a+-b"), "a+b"); 963 | 964 | assert_eq!(utf7_no_err(b"+ACs-"), "+"); 965 | assert_eq!(utf7_no_err(b"+AGEAKwBi-"), "a+b"); 966 | 967 | assert_eq!(utf7_no_err(b"+JgM-"), "\u{2603}"); 968 | assert_eq!(utf7_no_err(b"+JgM."), "\u{2603}."); 969 | assert_eq!(utf7_no_err(b"+JgM "), "\u{2603} "); 970 
| assert_eq!(utf7_no_err(b"+JgM--"), "\u{2603}-"); 971 | assert_eq!(utf7_no_err(b"+JgM"), "\u{2603}"); 972 | 973 | assert_eq!(utf7_no_err(b"+JgMmAw-"), "\u{2603}\u{2603}"); 974 | assert_eq!(utf7_no_err(b"+JgMmAw."), "\u{2603}\u{2603}."); 975 | assert_eq!(utf7_no_err(b"+JgMmAw "), "\u{2603}\u{2603} "); 976 | assert_eq!(utf7_no_err(b"+JgMmAw--"), "\u{2603}\u{2603}-"); 977 | assert_eq!(utf7_no_err(b"+JgMmAw"), "\u{2603}\u{2603}"); 978 | 979 | assert_eq!(utf7_no_err(b"+2D3cqQ-"), "\u{1F4A9}"); 980 | assert_eq!(utf7_no_err(b"+2D3cqQ."), "\u{1F4A9}."); 981 | assert_eq!(utf7_no_err(b"+2D3cqQ "), "\u{1F4A9} "); 982 | assert_eq!(utf7_no_err(b"+2D3cqQ--"), "\u{1F4A9}-"); 983 | assert_eq!(utf7_no_err(b"+2D3cqQ"), "\u{1F4A9}"); 984 | 985 | assert_eq!(utf7_no_err(b"+JgPYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp2D3cqdg93KnYPdyp"), "\u{2603}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}\u{1F4A9}"); 986 | 987 | assert_eq!(utf7_err(b"+"), "\u{FFFD}"); 988 | 989 | assert_eq!(utf7_err(b"+J-"), "\u{FFFD}"); 990 | assert_eq!(utf7_err(b"+Jg-"), "\u{FFFD}"); 991 | assert_eq!(utf7_err(b"+J"), "\u{FFFD}"); 992 | assert_eq!(utf7_err(b"+Jg"), "\u{FFFD}"); 993 | assert_eq!(utf7_err(b"+."), "\u{FFFD}."); 994 | assert_eq!(utf7_err(b"+J."), "\u{FFFD}."); 995 | assert_eq!(utf7_err(b"+Jg."), "\u{FFFD}."); 996 | assert_eq!(utf7_err(b"+ "), "\u{FFFD} "); 997 | assert_eq!(utf7_err(b"+J "), "\u{FFFD} "); 998 | assert_eq!(utf7_err(b"+Jg "), "\u{FFFD} "); 999 | 1000 | assert_eq!(utf7_err(b"+JgMmA-"), "\u{2603}\u{FFFD}\u{FFFD}"); 1001 | 
assert_eq!(utf7_err(b"+JgMmA"), "\u{2603}\u{FFFD}\u{FFFD}"); 1002 | assert_eq!(utf7_err(b"+JgMmA."), "\u{2603}\u{FFFD}\u{FFFD}."); 1003 | assert_eq!(utf7_err(b"+JgMmA "), "\u{2603}\u{FFFD}\u{FFFD} "); 1004 | 1005 | assert_eq!(utf7_err(b"+JgMm-"), "\u{2603}\u{FFFD}"); 1006 | assert_eq!(utf7_err(b"+JgMm"), "\u{2603}\u{FFFD}"); 1007 | assert_eq!(utf7_err(b"+JgMm."), "\u{2603}\u{FFFD}."); 1008 | assert_eq!(utf7_err(b"+JgMm "), "\u{2603}\u{FFFD} "); 1009 | 1010 | assert_eq!(utf7_err(b"+2D3cq-"), "\u{FFFD}\u{FFFD}"); 1011 | assert_eq!(utf7_err(b"+2D3cq"), "\u{FFFD}\u{FFFD}"); 1012 | assert_eq!(utf7_err(b"+2D3cq."), "\u{FFFD}\u{FFFD}."); 1013 | assert_eq!(utf7_err(b"+2D3cq "), "\u{FFFD}\u{FFFD} "); 1014 | 1015 | assert_eq!(utf7_err(b"+2D3c-"), "\u{FFFD}"); 1016 | assert_eq!(utf7_err(b"+2D3c"), "\u{FFFD}"); 1017 | assert_eq!(utf7_err(b"+2D3c."), "\u{FFFD}."); 1018 | assert_eq!(utf7_err(b"+2D3c "), "\u{FFFD} "); 1019 | 1020 | assert_eq!(utf7_err(b"+2D3-"), "\u{FFFD}"); 1021 | assert_eq!(utf7_err(b"+2D3"), "\u{FFFD}"); 1022 | assert_eq!(utf7_err(b"+2D3."), "\u{FFFD}."); 1023 | assert_eq!(utf7_err(b"+2D3 "), "\u{FFFD} "); 1024 | 1025 | assert_eq!(utf7_err(b"+2D-"), "\u{FFFD}"); 1026 | assert_eq!(utf7_err(b"+2D"), "\u{FFFD}"); 1027 | assert_eq!(utf7_err(b"+2D."), "\u{FFFD}."); 1028 | assert_eq!(utf7_err(b"+2D "), "\u{FFFD} "); 1029 | 1030 | assert_eq!(utf7_err(b"+2-"), "\u{FFFD}"); 1031 | assert_eq!(utf7_err(b"+2"), "\u{FFFD}"); 1032 | assert_eq!(utf7_err(b"+2."), "\u{FFFD}."); 1033 | assert_eq!(utf7_err(b"+2 "), "\u{FFFD} "); 1034 | 1035 | // Lone high surrogate 1036 | assert_eq!(utf7_err(b"+2D0-"), "\u{FFFD}"); 1037 | assert_eq!(utf7_err(b"+2D0"), "\u{FFFD}"); 1038 | assert_eq!(utf7_err(b"+2D0."), "\u{FFFD}."); 1039 | assert_eq!(utf7_err(b"+2D0 "), "\u{FFFD} "); 1040 | 1041 | assert_eq!(utf7_err(b"+2D0AYQ-"), "\u{FFFD}a"); 1042 | assert_eq!(utf7_err(b"+2D0AYQ"), "\u{FFFD}a"); 1043 | assert_eq!(utf7_err(b"+2D0AYQ."), "\u{FFFD}a."); 1044 | assert_eq!(utf7_err(b"+2D0AYQ "), 
"\u{FFFD}a "); 1045 | 1046 | assert_eq!(utf7_err(b"+2D3/QQ-"), "\u{FFFD}\u{FF41}"); 1047 | assert_eq!(utf7_err(b"+2D3/QQ"), "\u{FFFD}\u{FF41}"); 1048 | assert_eq!(utf7_err(b"+2D3/QQ."), "\u{FFFD}\u{FF41}."); 1049 | assert_eq!(utf7_err(b"+2D3/QQ "), "\u{FFFD}\u{FF41} "); 1050 | 1051 | // Lone low surrogate 1052 | assert_eq!(utf7_err(b"+AGHcqQ-"), "a\u{FFFD}"); 1053 | assert_eq!(utf7_err(b"+AGHcqQ"), "a\u{FFFD}"); 1054 | assert_eq!(utf7_err(b"+AGHcqQ."), "a\u{FFFD}."); 1055 | assert_eq!(utf7_err(b"+AGHcqQ "), "a\u{FFFD} "); 1056 | } 1057 | 1058 | #[test] 1059 | fn test_decode_ascii() { 1060 | assert_eq!(decode_ascii(b"aa\x80bb\xFFcc"), "aa\u{FFFD}bb\u{FFFD}cc"); 1061 | } 1062 | 1063 | #[test] 1064 | fn test_from() { 1065 | let _: Charset = encoding_rs::UTF_8.into(); 1066 | } 1067 | 1068 | #[cfg(feature = "serde")] 1069 | #[test] 1070 | fn test_serde_utf7() { 1071 | let demo = Demo { 1072 | num: 42, 1073 | name: "foo".into(), 1074 | charset: UTF_7, 1075 | }; 1076 | 1077 | let serialized = serde_json::to_string(&demo).unwrap(); 1078 | 1079 | let deserialized: Demo = serde_json::from_str(&serialized).unwrap(); 1080 | assert_eq!(deserialized, demo); 1081 | 1082 | let bincoded = bincode::serialize(&demo).unwrap(); 1083 | let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap(); 1084 | assert_eq!(debincoded, demo); 1085 | } 1086 | 1087 | #[cfg(feature = "serde")] 1088 | #[test] 1089 | fn test_serde_utf8() { 1090 | let demo = Demo { 1091 | num: 42, 1092 | name: "foo".into(), 1093 | charset: encoding_rs::UTF_8.into(), 1094 | }; 1095 | 1096 | let serialized = serde_json::to_string(&demo).unwrap(); 1097 | 1098 | let deserialized: Demo = serde_json::from_str(&serialized).unwrap(); 1099 | assert_eq!(deserialized, demo); 1100 | 1101 | let bincoded = bincode::serialize(&demo).unwrap(); 1102 | let debincoded: Demo = bincode::deserialize(&bincoded[..]).unwrap(); 1103 | assert_eq!(debincoded, demo); 1104 | } 1105 | } 1106 | 
--------------------------------------------------------------------------------