├── .gitignore ├── README.tpl ├── README.md ├── Cargo.toml ├── .github └── workflows │ └── rust.yaml ├── src └── lib.rs └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /README.tpl: -------------------------------------------------------------------------------- 1 | ![](https://github.com/Narsil/spm_precompiled/workflows/build/badge.svg) 2 | [![Crate](https://img.shields.io/crates/v/spm_precompiled.svg)](https://crates.io/crates/spm_precompiled) 3 | [![API](https://docs.rs/spm_precompiled/badge.svg)](https://docs.rs/spm_precompiled) 4 | 5 | # {{crate}} 6 | 7 | {{readme}} 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](https://github.com/Narsil/spm_precompiled/workflows/build/badge.svg) 2 | [![Crate](https://img.shields.io/crates/v/spm_precompiled.svg)](https://crates.io/crates/spm_precompiled) 3 | [![API](https://docs.rs/spm_precompiled/badge.svg)](https://docs.rs/spm_precompiled) 4 | 5 | # spm_precompiled 6 | 7 | This crate aims to emulate https://github.com/google/sentencepiece Dart::DoubleArray 8 | struct and it's Normalizer. It's main intent is to be used with tokenizers 9 | that is a Rust library that aims to provide facilities to tokenize string 10 | for use with HuggingFace's transformers library 11 | 12 | This crate is highly specialized and not intended for general use. 13 | 14 | The core of the algorithm is to read spm's binary `precompiled_charsmap`. 
15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "spm_precompiled" 3 | version = "0.1.4" 4 | authors = ["Nicolas Patry "] 5 | edition = "2018" 6 | homepage = "https://github.com/huggingface/spm_precompiled" 7 | repository = "https://github.com/huggingface/spm_precompiled" 8 | documentation = "https://docs.rs/spm_precompiled/" 9 | license = "Apache-2.0" 10 | keywords = ["SentencePiece", "precompiled_charsmap", "Darts", "DoubleArray"] 11 | readme = "./README.md" 12 | description = """ 13 | This crate aims to emulate https://github.com/google/sentencepiece Darts::DoubleArray 14 | struct and its Normalizer. 15 | 16 | This crate is highly specialized and not intended for general use. 17 | """ 18 | 19 | 20 | [dependencies] 21 | serde = { version = "1.0", features = [ "derive" ] } 22 | nom = "7.1.1" 23 | unicode-segmentation = "1.9" 24 | base64 = "0.13" 25 | 26 | [dev-dependencies] 27 | serde_json = "1.0" 28 | -------------------------------------------------------------------------------- /.github/workflows/rust.yaml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | build: 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest, windows-latest, macOS-latest] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | 19 | - name: Install Rust Stable 20 | uses: actions-rs/toolchain@v1 21 | with: 22 | toolchain: stable 23 | components: rustfmt, clippy 24 | override: true 25 | 26 | # Necessary for now for the cargo cache: https://github.com/actions/cache/issues/133#issuecomment-599102035 27 | - if: matrix.os == 'ubuntu-latest' 28 | run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/ 29 | 30 | - name: Install cargo-readme for Ubuntu 31 | if: matrix.os == 'ubuntu-latest' 32 
| uses: actions-rs/cargo@v1 33 | with: 34 | command: install 35 | args: cargo-readme 36 | 37 | - name: Build 38 | uses: actions-rs/cargo@v1 39 | with: 40 | command: build 41 | args: --all-targets --verbose 42 | 43 | - name: Lint with RustFmt 44 | uses: actions-rs/cargo@v1 45 | with: 46 | command: fmt 47 | args: -- --check 48 | 49 | - name: Lint with Clippy 50 | uses: actions-rs/cargo@v1 51 | with: 52 | command: clippy 53 | args: --all-targets --all-features -- -D warnings 54 | 55 | - name: Run tests 56 | uses: actions-rs/cargo@v1 57 | with: 58 | command: test 59 | args: --verbose 60 | 61 | - name: Make sure, Readme generated from lib.rs matches actual Readme 62 | if: matrix.os == 'ubuntu-latest' 63 | shell: bash 64 | run: cargo readme > must_match_readme.md && diff must_match_readme.md README.md 65 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! This crate aims to emulate https://github.com/google/sentencepiece Dart::DoubleArray 2 | //! struct and it's Normalizer. It's main intent is to be used with tokenizers 3 | //! that is a Rust library that aims to provide facilities to tokenize string 4 | //! for use with HuggingFace's transformers library 5 | //! 6 | //! This crate is highly specialized and not intended for general use. 7 | //! 8 | //! The core of the algorithm is to read spm's binary `precompiled_charsmap`. 9 | use nom::{number::complete::le_u32, IResult, ToUsize}; 10 | use serde::{de::Error, Deserialize, Deserializer, Serialize, Serializer}; 11 | use std::convert::TryFrom; 12 | use unicode_segmentation::UnicodeSegmentation; 13 | 14 | /// This struct is specifically done to be compatible with SentencePiece 15 | /// SentencePiece models embed their Normalizer within a `precompiled_charsmap` 16 | /// that both represents a Trie, and embedded rewrite rules. 
17 | /// In order to be 100% compliant we need to interpret that binary format too. 18 | /// The format is [u32 (length of trie), trie: [u32], normalized: String] 19 | /// The trie has u8 as entries, and u32 as values, those u32 values 20 | /// point to offsets withing the String that correspond to the real replace value 21 | /// The normalized string contains '\0' that should indicate the end of an entry. 22 | /// 23 | /// Hence, normalized could be "abc\0", some entry in the trie could be 0 meaning 24 | /// the value is "abc" and another one be 1 meaning the actual entry was "bc". 25 | #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] 26 | #[serde(tag = "type", try_from = "PrecompiledDeserializer")] 27 | pub struct Precompiled { 28 | #[serde(serialize_with = "as_base64", deserialize_with = "from_base64")] 29 | precompiled_charsmap: Vec, 30 | #[serde(skip)] 31 | normalized: String, 32 | #[serde(skip)] 33 | trie: DoubleArray, 34 | } 35 | 36 | #[doc(hidden)] 37 | #[derive(Deserialize)] 38 | #[serde(tag = "type")] 39 | struct PrecompiledDeserializer { 40 | #[serde(deserialize_with = "from_base64")] 41 | precompiled_charsmap: Vec, 42 | } 43 | 44 | fn as_base64(key: &T, serializer: S) -> Result 45 | where 46 | T: AsRef<[u8]>, 47 | S: Serializer, 48 | { 49 | serializer.serialize_str(&base64::encode(key.as_ref())) 50 | } 51 | 52 | fn from_base64<'de, D>(deserializer: D) -> Result, D::Error> 53 | where 54 | D: Deserializer<'de>, 55 | { 56 | let s: &str = Deserialize::deserialize(deserializer)?; 57 | let precompiled_charsmap = base64::decode(s).map_err(|err| Error::custom(err.to_string()))?; 58 | Ok(precompiled_charsmap) 59 | } 60 | 61 | impl TryFrom for Precompiled { 62 | type Error = PrecompiledError; 63 | 64 | fn try_from(t: PrecompiledDeserializer) -> Result { 65 | Self::from(&t.precompiled_charsmap) 66 | } 67 | } 68 | 69 | pub type ArrayUnit = usize; 70 | 71 | trait ArrayUnitTrait { 72 | fn has_leaf(&self) -> bool; 73 | fn value(&self) -> isize; 74 | 
fn label(&self) -> usize; 75 | fn offset(&self) -> usize; 76 | } 77 | 78 | impl ArrayUnitTrait for ArrayUnit { 79 | fn has_leaf(&self) -> bool { 80 | (self >> 8) & 1 == 1 81 | } 82 | 83 | fn value(&self) -> isize { 84 | (self & ((1usize << 31) - 1)) as isize 85 | } 86 | 87 | fn label(&self) -> usize { 88 | self & ((1usize << 31) | 0xFF) 89 | } 90 | 91 | fn offset(&self) -> usize { 92 | (self >> 10) << ((self & (1usize << 9)) >> 6) 93 | } 94 | } 95 | 96 | type Array = Vec; 97 | 98 | #[derive(Default, Clone, Debug, Serialize, Deserialize, PartialEq)] 99 | pub struct DoubleArray { 100 | array: Array, 101 | } 102 | 103 | impl DoubleArray { 104 | fn from(array: Array) -> Self { 105 | Self { array } 106 | } 107 | 108 | pub fn common_prefix_search(&self, key: &[u8]) -> Vec { 109 | let mut node_pos = 0; 110 | let mut results = vec![]; 111 | 112 | let mut unit = self.array[node_pos]; 113 | node_pos ^= unit.offset(); 114 | for c in key { 115 | if *c == 0u8 { 116 | break; 117 | } 118 | node_pos ^= *c as usize; 119 | unit = self.array[node_pos]; 120 | if unit.label() != *c as usize { 121 | return results; 122 | } 123 | node_pos ^= unit.offset(); 124 | if unit.has_leaf() { 125 | results.push(self.array[node_pos].value()); 126 | } 127 | } 128 | results 129 | } 130 | } 131 | 132 | fn parse(precompiled_charsmap: &[u8]) -> IResult<&[u8], Array> { 133 | let (mut rest, trie_size) = le_u32(precompiled_charsmap)?; 134 | // u8 to u32. 
135 | let trie_char_size = trie_size / 4; 136 | let mut trie_blob = Vec::with_capacity(trie_char_size as usize); 137 | for _ in 0..trie_char_size { 138 | let (rest2, n) = le_u32(rest)?; 139 | rest = rest2; 140 | trie_blob.push(n.to_usize()); 141 | } 142 | let normalized_blob = rest; 143 | Ok((normalized_blob, trie_blob)) 144 | } 145 | 146 | #[derive(Debug)] 147 | pub enum PrecompiledError { 148 | ParseError, 149 | NormalizedInvalidUtf8, 150 | } 151 | 152 | impl std::fmt::Display for PrecompiledError { 153 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { 154 | write!(f, "Cannot parse precompiled_charsmap") 155 | } 156 | } 157 | 158 | impl std::error::Error for PrecompiledError {} 159 | 160 | impl Precompiled { 161 | pub fn from(precompiled_charsmap: &[u8]) -> Result { 162 | let (normalized_blob, trie_blob) = 163 | parse(precompiled_charsmap).map_err(|_| PrecompiledError::ParseError)?; 164 | let normalized = String::from_utf8(normalized_blob.to_vec()) 165 | .map_err(|_| PrecompiledError::NormalizedInvalidUtf8)?; 166 | let trie = DoubleArray::from(trie_blob); 167 | let precompiled = Precompiled { 168 | precompiled_charsmap: precompiled_charsmap.to_vec(), 169 | normalized, 170 | trie, 171 | }; 172 | Ok(precompiled) 173 | } 174 | 175 | pub fn transform(&self, chunk: &str) -> Option<&str> { 176 | let results = self.trie.common_prefix_search(chunk.as_bytes()); 177 | if results.is_empty() { 178 | None 179 | } else { 180 | let index = results[0] as usize; 181 | let mut index2 = index; 182 | while index2 < self.normalized.len() { 183 | if *self.normalized.as_bytes().get(index2)? == 0u8 { 184 | break; 185 | } 186 | index2 += 1; 187 | } 188 | let normalized = &self.normalized[index..index2]; 189 | Some(normalized) 190 | } 191 | } 192 | 193 | pub fn normalize_string(&self, original: &str) -> String { 194 | let mut string = String::with_capacity(original.len()); 195 | // Future reader. From @Narsil. 
196 | // Yes, this is weird, 197 | // Yes, this seems broken 198 | // No, I don't know why Google did this. 199 | // If you question this code, check this normalizer against 200 | // XNLI database (all languages) with Unigram model against 201 | // Mbart, XLMRoberta *AND* Marian. If you don't get 100% or 202 | // break a single test. 203 | // You don't pass. 204 | original.graphemes(true).for_each(|grapheme| { 205 | if grapheme.len() < 6 { 206 | if let Some(norm) = self.transform(grapheme) { 207 | for c in norm.chars() { 208 | string.push(c); 209 | } 210 | return; 211 | } 212 | } 213 | for (char_index, c) in grapheme.char_indices() { 214 | let part = &grapheme[char_index..char_index + c.len_utf8()]; 215 | if let Some(norm) = self.transform(part) { 216 | for c in norm.chars() { 217 | string.push(c); 218 | } 219 | } else { 220 | string.push(c); 221 | } 222 | } 223 | }); 224 | string 225 | } 226 | } 227 | 228 | #[cfg(test)] 229 | mod tests; 230 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------