├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── src ├── grapheme.rs ├── lib.rs └── util.rs └── update-docs.py /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | .settings 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - 1.1.0 4 | - 1.2.0 5 | - 1.3.0 6 | - 1.4.0 7 | - 1.5.0 8 | - 1.6.0 9 | - 1.7.0 10 | - stable 11 | - beta 12 | - nightly 13 | matrix: 14 | allow_failures: 15 | - rust: nightly 16 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "strcursor" 3 | version = "0.2.4" 4 | authors = ["Daniel Keep "] 5 | 6 | description = "Provides a string cursor type for seeking through a string whilst respecting grapheme cluster and code point boundaries." 7 | repository = "https://github.com/DanielKeep/strcursor" 8 | documentation = "https://danielkeep.github.io/strcursor/doc/strcursor/index.html" 9 | readme = "README.md" 10 | keywords = ["str", "string", "cursor", "grapheme", "unicode"] 11 | license = "MIT/Apache-2.0" 12 | 13 | build = "build.rs" 14 | 15 | exclude = [ 16 | "update-docs.py", 17 | ] 18 | 19 | [dependencies] 20 | # 0.1.3 breaks semver 21 | unicode-segmentation = "0.1.0, <0.1.3" 22 | 23 | [build-dependencies] 24 | rustc_version = "0.1.4" 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright ⓒ 2015, 2016 Daniel Keep. 2 | 3 | Licensed under either of: 4 | 5 | * MIT license, or 6 | * Apache License, Version 2.0 7 | 8 | at your option. 
9 | 10 | Unless you explicitly state otherwise, any contribution intentionally 11 | submitted for inclusion in the work by you shall be dual licensed as 12 | above, without any additional terms or conditions. 13 | 14 | # MIT License 15 | 16 | Permission is hereby granted, free of charge, to any person obtaining 17 | a copy of this software and associated documentation files (the 18 | "Software"), to deal in the Software without restriction, including 19 | without limitation the rights to use, copy, modify, merge, publish, 20 | distribute, sublicense, and/or sell copies of the Software, and to 21 | permit persons to whom the Software is furnished to do so, subject 22 | to the following conditions: 23 | 24 | The above copyright notice and this permission notice shall be included 25 | in all copies or substantial portions of the Software. 26 | 27 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 28 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 29 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 30 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 31 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 32 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 33 | OTHER DEALINGS IN THE SOFTWARE. 34 | 35 | # Apache License, Version 2.0 36 | 37 | Apache License 38 | Version 2.0, January 2004 39 | http://www.apache.org/licenses/ 40 | 41 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 42 | 43 | 1. Definitions. 44 | 45 | "License" shall mean the terms and conditions for use, reproduction, 46 | and distribution as defined by Sections 1 through 9 of this document. 47 | 48 | "Licensor" shall mean the copyright owner or entity authorized by 49 | the copyright owner that is granting the License. 
50 | 51 | "Legal Entity" shall mean the union of the acting entity and all 52 | other entities that control, are controlled by, or are under common 53 | control with that entity. For the purposes of this definition, 54 | "control" means (i) the power, direct or indirect, to cause the 55 | direction or management of such entity, whether by contract or 56 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 57 | outstanding shares, or (iii) beneficial ownership of such entity. 58 | 59 | "You" (or "Your") shall mean an individual or Legal Entity 60 | exercising permissions granted by this License. 61 | 62 | "Source" form shall mean the preferred form for making modifications, 63 | including but not limited to software source code, documentation 64 | source, and configuration files. 65 | 66 | "Object" form shall mean any form resulting from mechanical 67 | transformation or translation of a Source form, including but 68 | not limited to compiled object code, generated documentation, 69 | and conversions to other media types. 70 | 71 | "Work" shall mean the work of authorship, whether in Source or 72 | Object form, made available under the License, as indicated by a 73 | copyright notice that is included in or attached to the work 74 | (an example is provided in the Appendix below). 75 | 76 | "Derivative Works" shall mean any work, whether in Source or Object 77 | form, that is based on (or derived from) the Work and for which the 78 | editorial revisions, annotations, elaborations, or other modifications 79 | represent, as a whole, an original work of authorship. For the purposes 80 | of this License, Derivative Works shall not include works that remain 81 | separable from, or merely link (or bind by name) to the interfaces of, 82 | the Work and Derivative Works thereof. 
83 | 84 | "Contribution" shall mean any work of authorship, including 85 | the original version of the Work and any modifications or additions 86 | to that Work or Derivative Works thereof, that is intentionally 87 | submitted to Licensor for inclusion in the Work by the copyright owner 88 | or by an individual or Legal Entity authorized to submit on behalf of 89 | the copyright owner. For the purposes of this definition, "submitted" 90 | means any form of electronic, verbal, or written communication sent 91 | to the Licensor or its representatives, including but not limited to 92 | communication on electronic mailing lists, source code control systems, 93 | and issue tracking systems that are managed by, or on behalf of, the 94 | Licensor for the purpose of discussing and improving the Work, but 95 | excluding communication that is conspicuously marked or otherwise 96 | designated in writing by the copyright owner as "Not a Contribution." 97 | 98 | "Contributor" shall mean Licensor and any individual or Legal Entity 99 | on behalf of whom a Contribution has been received by Licensor and 100 | subsequently incorporated within the Work. 101 | 102 | 2. Grant of Copyright License. Subject to the terms and conditions of 103 | this License, each Contributor hereby grants to You a perpetual, 104 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 105 | copyright license to reproduce, prepare Derivative Works of, 106 | publicly display, publicly perform, sublicense, and distribute the 107 | Work and such Derivative Works in Source or Object form. 108 | 109 | 3. Grant of Patent License. 
Subject to the terms and conditions of 110 | this License, each Contributor hereby grants to You a perpetual, 111 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 112 | (except as stated in this section) patent license to make, have made, 113 | use, offer to sell, sell, import, and otherwise transfer the Work, 114 | where such license applies only to those patent claims licensable 115 | by such Contributor that are necessarily infringed by their 116 | Contribution(s) alone or by combination of their Contribution(s) 117 | with the Work to which such Contribution(s) was submitted. If You 118 | institute patent litigation against any entity (including a 119 | cross-claim or counterclaim in a lawsuit) alleging that the Work 120 | or a Contribution incorporated within the Work constitutes direct 121 | or contributory patent infringement, then any patent licenses 122 | granted to You under this License for that Work shall terminate 123 | as of the date such litigation is filed. 124 | 125 | 4. Redistribution. 
You may reproduce and distribute copies of the 126 | Work or Derivative Works thereof in any medium, with or without 127 | modifications, and in Source or Object form, provided that You 128 | meet the following conditions: 129 | 130 | (a) You must give any other recipients of the Work or 131 | Derivative Works a copy of this License; and 132 | 133 | (b) You must cause any modified files to carry prominent notices 134 | stating that You changed the files; and 135 | 136 | (c) You must retain, in the Source form of any Derivative Works 137 | that You distribute, all copyright, patent, trademark, and 138 | attribution notices from the Source form of the Work, 139 | excluding those notices that do not pertain to any part of 140 | the Derivative Works; and 141 | 142 | (d) If the Work includes a "NOTICE" text file as part of its 143 | distribution, then any Derivative Works that You distribute must 144 | include a readable copy of the attribution notices contained 145 | within such NOTICE file, excluding those notices that do not 146 | pertain to any part of the Derivative Works, in at least one 147 | of the following places: within a NOTICE text file distributed 148 | as part of the Derivative Works; within the Source form or 149 | documentation, if provided along with the Derivative Works; or, 150 | within a display generated by the Derivative Works, if and 151 | wherever such third-party notices normally appear. The contents 152 | of the NOTICE file are for informational purposes only and 153 | do not modify the License. You may add Your own attribution 154 | notices within Derivative Works that You distribute, alongside 155 | or as an addendum to the NOTICE text from the Work, provided 156 | that such additional attribution notices cannot be construed 157 | as modifying the License. 
158 | 159 | You may add Your own copyright statement to Your modifications and 160 | may provide additional or different license terms and conditions 161 | for use, reproduction, or distribution of Your modifications, or 162 | for any such Derivative Works as a whole, provided Your use, 163 | reproduction, and distribution of the Work otherwise complies with 164 | the conditions stated in this License. 165 | 166 | 5. Submission of Contributions. Unless You explicitly state otherwise, 167 | any Contribution intentionally submitted for inclusion in the Work 168 | by You to the Licensor shall be under the terms and conditions of 169 | this License, without any additional terms or conditions. 170 | Notwithstanding the above, nothing herein shall supersede or modify 171 | the terms of any separate license agreement you may have executed 172 | with Licensor regarding such Contributions. 173 | 174 | 6. Trademarks. This License does not grant permission to use the trade 175 | names, trademarks, service marks, or product names of the Licensor, 176 | except as required for reasonable and customary use in describing the 177 | origin of the Work and reproducing the content of the NOTICE file. 178 | 179 | 7. Disclaimer of Warranty. Unless required by applicable law or 180 | agreed to in writing, Licensor provides the Work (and each 181 | Contributor provides its Contributions) on an "AS IS" BASIS, 182 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 183 | implied, including, without limitation, any warranties or conditions 184 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 185 | PARTICULAR PURPOSE. You are solely responsible for determining the 186 | appropriateness of using or redistributing the Work and assume any 187 | risks associated with Your exercise of permissions under this License. 188 | 189 | 8. Limitation of Liability. 
In no event and under no legal theory, 190 | whether in tort (including negligence), contract, or otherwise, 191 | unless required by applicable law (such as deliberate and grossly 192 | negligent acts) or agreed to in writing, shall any Contributor be 193 | liable to You for damages, including any direct, indirect, special, 194 | incidental, or consequential damages of any character arising as a 195 | result of this License or out of the use or inability to use the 196 | Work (including but not limited to damages for loss of goodwill, 197 | work stoppage, computer failure or malfunction, or any and all 198 | other commercial damages or losses), even if such Contributor 199 | has been advised of the possibility of such damages. 200 | 201 | 9. Accepting Warranty or Additional Liability. While redistributing 202 | the Work or Derivative Works thereof, You may choose to offer, 203 | and charge a fee for, acceptance of support, warranty, indemnity, 204 | or other liability obligations and/or rights consistent with this 205 | License. However, in accepting such obligations, You may act only 206 | on Your own behalf and on Your sole responsibility, not on behalf 207 | of any other Contributor, and only if You agree to indemnify, 208 | defend, and hold each Contributor harmless for any liability 209 | incurred by, or claims asserted against, such Contributor by reason 210 | of your accepting any such warranty or additional liability. 211 | 212 | END OF TERMS AND CONDITIONS 213 | 214 | APPENDIX: How to apply the Apache License to your work. 215 | 216 | To apply the Apache License to your work, attach the following 217 | boilerplate notice, with the fields enclosed by brackets "[]" 218 | replaced with your own identifying information. (Don't include 219 | the brackets!) The text should be enclosed in the appropriate 220 | comment syntax for the file format. 
We also recommend that a 221 | file or class name and description of purpose be included on the 222 | same "printed page" as the copyright notice for easier 223 | identification within third-party archives. 224 | 225 | Copyright [yyyy] [name of copyright owner] 226 | 227 | Licensed under the Apache License, Version 2.0 (the "License"); 228 | you may not use this file except in compliance with the License. 229 | You may obtain a copy of the License at 230 | 231 | http://www.apache.org/licenses/LICENSE-2.0 232 | 233 | Unless required by applicable law or agreed to in writing, software 234 | distributed under the License is distributed on an "AS IS" BASIS, 235 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 236 | See the License for the specific language governing permissions and 237 | limitations under the License. 238 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `strcursor` 2 | 3 | **Note**: This is something of a work-in-progress. It has tests, but hasn't been exhaustively vetted. 4 | 5 | This crate provides a "cursor" type for string slices. It provides the ability to safely seek back and forth through a string without worrying about producing invalid UTF-8 sequences, or splitting grapheme clusters. 6 | 7 | In addition, it provides types to represent single grapheme clusters (`Gc` and `GcBuf`) as distinct from arbitrary string slices. 8 | 9 | See the `StrCursor` type for details. 10 | 11 | **Links** 12 | 13 | * [Latest Release](https://crates.io/crates/strcursor/) 14 | * [Latest Docs](https://danielkeep.github.io/strcursor/doc/strcursor/index.html) 15 | * [Repository](https://github.com/DanielKeep/strcursor) 16 | 17 | ## Compatibility 18 | 19 | `strcursor` is currently supported on `rustc` version 1.1.0 and higher.
20 | 21 | * `rustc` < 1.4 will use a larger, less space-efficient implementation of `GcBuf`; rather than being the same size as `Box`, it will be the same size as `String`. 22 | 23 | * `rustc` < 1.1 is not supported, due to a mysterious compiler crash. 24 | 25 | ## License 26 | 27 | Licensed under either of 28 | 29 | * MIT license (see [LICENSE](LICENSE) or ) 30 | * Apache License, Version 2.0 (see [LICENSE](LICENSE) or ) 31 | 32 | at your option. 33 | 34 | ### Contribution 35 | 36 | Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you shall be dual licensed as above, without any additional terms or conditions. 37 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | extern crate rustc_version; 2 | use rustc_version::{version_matches}; 3 | /* Build script: asks Cargo to re-run it only when build.rs changes, and enables the `has_string_into_boxed_string` cfg when `version_matches("1.4.0")` returns true. */ 4 | fn main() { 5 | println!("cargo:rerun-if-changed=build.rs"); 6 | 7 | if version_matches("1.4.0") { 8 | println!("cargo:rustc-cfg=has_string_into_boxed_string"); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/grapheme.rs: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright ⓒ 2015, 2016 Daniel Keep. 3 | 4 | Licensed under the MIT license (see LICENSE or <http://opensource.org/licenses/MIT>) or the Apache License, Version 2.0 (see LICENSE or 6 | <http://www.apache.org/licenses/LICENSE-2.0>), at your option. All 7 | files in the project carrying such notice may not be copied, modified, 8 | or distributed except according to those terms. 9 | */ 10 | /*! 11 | Defines types for representing single grapheme clusters.
12 | */ 13 | use std::borrow::{Borrow, Cow, ToOwned}; 14 | use std::convert::AsRef; 15 | use std::cmp::Ordering; 16 | use std::fmt::{self, Debug, Display}; 17 | use std::mem::transmute; 18 | use std::ops::Deref; 19 | use uniseg::UnicodeSegmentation as UniSeg; 20 | 21 | /** 22 | An iterator over the lower case mapping of a given grapheme cluster, returned from [`Gc::to_lowercase`](struct.Gc.html#method.to_lowercase). 23 | */ 24 | pub type ToLowercase<'a> = ::std::iter::FlatMap<::std::str::Chars<'a>, ::std::char::ToLowercase, fn(char) -> ::std::char::ToLowercase>; 25 | 26 | /** 27 | An iterator over the upper case mapping of a given grapheme cluster, returned from [`Gc::to_uppercase`](struct.Gc.html#method.to_uppercase). 28 | */ 29 | pub type ToUppercase<'a> = ::std::iter::FlatMap<::std::str::Chars<'a>, ::std::char::ToUppercase, fn(char) -> ::std::char::ToUppercase>; 30 | 31 | /** 32 | A slice of a single Unicode grapheme cluster (GC) (akin to `str`). 33 | 34 | A grapheme cluster is a single visual "unit" in Unicode text, and is composed of *at least* one Unicode code point, possibly more. 35 | 36 | This type is a wrapper around `str` that enforces the additional invariant that it will *always* contain *exactly* one grapheme cluster. This makes some operations (such as extracting the base code point) simpler. 37 | 38 | ## Why Grapheme Clusters? 39 | 40 | The simplest example is the distinction between "é" ("Latin Small Letter E with Acute") and "é" ("Latin Small Letter E", "Combining Acute Accent"): the first is *one* code point, the second is *two*. 41 | 42 | In Rust, the `char` type is a single code point. As a result, treating it as a "character" is incorrect for the same reason that using `u8` is: it excludes many legitimate characters. It can also cause issues whereby naive algorithms may corrupt text by considering components of a grapheme cluster separately.
For example, truncating a string to "10 characters" using `char`s can lead to logical characters being broken apart, potentially changing their meaning. 43 | 44 | One inconvenience when dealing with grapheme clusters in Rust is that no type represents them more accurately than a regular `&str`. However, operations that might make sense on an individual character (such as asking whether it is in the ASCII range, or is numeric) don't make sense on a full string. In addition, a `&str` can be empty or contain more than one grapheme cluster. 45 | 46 | Hence, this type guarantees that it always represents *exactly* one Unicode grapheme cluster. 47 | */ 48 | #[derive(Eq, PartialEq, Ord, PartialOrd, Hash)] 49 | pub struct Gc(str); 50 | 51 | impl Gc { 52 | /** 53 | Create a new `Gc` from the given string slice. 54 | 55 | The slice must contain *exactly* one grapheme cluster. In the event that the input is empty, or contains more than one grapheme cluster, this function will return `None`. 56 | 57 | See: [`split_from`](#method.split_from). 58 | */ 59 | pub fn from_str(s: &str) -> Option<&Gc> { 60 | match Gc::split_from(s) { 61 | Some((gc, tail)) => if tail.len() == 0 { Some(gc) } else { None }, 62 | None => None 63 | } 64 | } 65 | 66 | /** 67 | Create a new `Gc` from the given string slice. 68 | 69 | This function *does not* check to ensure the provided slice is a single, valid grapheme cluster. The slice is reinterpreted in place with `transmute`, so upholding that invariant is the caller's responsibility. 70 | */ 71 | pub unsafe fn from_str_unchecked(s: &str) -> &Gc { 72 | transmute(s) 73 | } 74 | 75 | /** 76 | Try to split a single grapheme cluster from the start of `s`. 77 | 78 | Returns `None` if the given string was empty.
79 | */ 80 | pub fn split_from(s: &str) -> Option<(&Gc, &str)> { 81 | unsafe { 82 | let gr = match UniSeg::graphemes(s, /*is_extended:*/true).next() { 83 | Some(gr) => gr, 84 | None => return None, 85 | }; 86 | Some((Gc::from_str_unchecked(gr), s.slice_unchecked(gr.len(), s.len()))) 87 | } 88 | } 89 | 90 | /** 91 | Returns the length of this grapheme cluster in bytes. 92 | */ 93 | pub fn len(&self) -> usize { 94 | self.0.len() 95 | } 96 | 97 | /** 98 | Does this grapheme cluster have additional marks applied to it? 99 | 100 | This is `true` if the cluster is comprised of more than a single code point. 101 | */ 102 | pub fn has_marks(&self) -> bool { 103 | self.base_char().len_utf8() != self.as_str().len() 104 | } 105 | 106 | /** 107 | Converts this to a byte slice. 108 | */ 109 | pub fn as_bytes(&self) -> &[u8] { 110 | self.0.as_bytes() 111 | } 112 | 113 | /** 114 | Converts this to a string slice. 115 | */ 116 | pub fn as_str(&self) -> &str { 117 | &self.0 118 | } 119 | 120 | /** 121 | Returns the "base" code point. 122 | 123 | That is, this returns the first code point in the cluster. 124 | */ 125 | pub fn base_char(&self) -> char { 126 | unsafe { 127 | match self.0.chars().next() { 128 | Some(cp) => cp, 129 | None => debug_unreachable!(), 130 | } 131 | } 132 | } 133 | 134 | /** 135 | Returns the "base" code point as a grapheme cluster. 136 | 137 | This is equivalent to converting this GC into a string slice, then keeping only the bytes that make up the first code point. 138 | */ 139 | pub fn base(&self) -> &Gc { 140 | unsafe { 141 | let base_cp = self.base_char(); 142 | let base_len = base_cp.len_utf8(); 143 | /* Slice from the start up to the end of the first code point; the remainder is what `mark_str` returns. */ Gc::from_str_unchecked(self.0.slice_unchecked(0, base_len)) 144 | } 145 | } 146 | 147 | /** 148 | Returns the combining marks as a string slice. 149 | 150 | The result of this method may be empty, or of arbitrary length.
151 | */ 152 | pub fn mark_str(&self) -> &str { 153 | unsafe { 154 | let base_cp = self.base_char(); 155 | let base_len = base_cp.len_utf8(); 156 | /* Everything after the base code point. */ self.0.slice_unchecked(base_len, self.0.len()) 157 | } 158 | } 159 | 160 | /** 161 | An iterator over the code points of this grapheme cluster. 162 | */ 163 | pub fn chars(&self) -> ::std::str::Chars { 164 | self.0.chars() 165 | } 166 | 167 | /** 168 | An iterator over the code points of this grapheme cluster, and their associated byte offsets. 169 | */ 170 | pub fn char_indices(&self) -> ::std::str::CharIndices { 171 | self.0.char_indices() 172 | } 173 | 174 | /** 175 | An iterator over the bytes of this grapheme cluster. 176 | */ 177 | pub fn bytes(&self) -> ::std::str::Bytes { 178 | self.0.bytes() 179 | } 180 | 181 | /** 182 | Returns an iterator over the code points in the lower case equivalent of this grapheme cluster. Each code point is mapped on its own with `char::to_lowercase`. 183 | */ 184 | pub fn to_lowercase(&self) -> ToLowercase { 185 | self.0.chars().flat_map(char::to_lowercase) 186 | } 187 | 188 | /** 189 | Returns an iterator over the code points in the upper case equivalent of this grapheme cluster.
190 | */ 191 | pub fn to_uppercase(&self) -> ToUppercase { 192 | self.0.chars().flat_map(char::to_uppercase) 193 | } 194 | } 195 | /* Borrowing conversions: both views go through `as_str`. */ 196 | impl AsRef for Gc { 197 | fn as_ref(&self) -> &str { 198 | self.as_str() 199 | } 200 | } 201 | 202 | impl AsRef<[u8]> for Gc { 203 | fn as_ref(&self) -> &[u8] { 204 | self.as_str().as_bytes() 205 | } 206 | } 207 | /* Formatting is delegated to the wrapped `str`. */ 208 | impl Debug for Gc { 209 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 210 | Debug::fmt(&self.0, fmt) 211 | } 212 | } 213 | 214 | impl Display for Gc { 215 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 216 | Display::fmt(&self.0, fmt) 217 | } 218 | } 219 | 220 | impl<'a> PartialEq<&'a Gc> for Gc { 221 | fn eq(&self, other: &&'a Gc) -> bool { 222 | self.eq(*other) 223 | } 224 | } 225 | 226 | impl<'a> PartialEq for &'a Gc { 227 | fn eq(&self, other: &Gc) -> bool { 228 | (*self).eq(other) 229 | } 230 | } 231 | /* A `Gc` equals a `char` only if it has no marks and its base code point equals that `char`. */ 232 | impl PartialEq for Gc { 233 | fn eq(&self, other: &char) -> bool { 234 | !self.has_marks() && self.base_char().eq(other) 235 | } 236 | } 237 | 238 | impl PartialEq for Gc { 239 | fn eq(&self, other: &str) -> bool { 240 | self.0.eq(other) 241 | } 242 | } 243 | 244 | impl<'a> PartialEq<&'a str> for Gc { 245 | fn eq(&self, other: &&'a str) -> bool { 246 | self.0.eq(*other) 247 | } 248 | } 249 | 250 | impl PartialEq for Gc { 251 | fn eq(&self, other: &GcBuf) -> bool { 252 | self.0.eq(other.as_gc()) 253 | } 254 | } 255 | 256 | impl PartialEq for Gc { 257 | fn eq(&self, other: &String) -> bool { 258 | self.0.eq(&**other) 259 | } 260 | } 261 | 262 | impl<'a> PartialEq> for Gc { 263 | fn eq(&self, other: &Cow<'a, Gc>) -> bool { 264 | self.0.eq((*other).deref()) 265 | } 266 | } 267 | 268 | impl<'a> PartialEq for &'a Gc { 269 | fn eq(&self, other: &char) -> bool { 270 | !self.has_marks() && self.base_char().eq(other) 271 | } 272 | } 273 | 274 | impl<'a> PartialEq for &'a Gc { 275 | fn eq(&self, other: &str) -> bool { 276 | self.0.eq(other) 277 | } 278 | } 279 | 280 | impl<'a> PartialEq for
&'a Gc { 281 | fn eq(&self, other: &GcBuf) -> bool { 282 | self.0.eq(other.as_gc()) 283 | } 284 | } 285 | 286 | impl<'a> PartialEq for &'a Gc { 287 | fn eq(&self, other: &String) -> bool { 288 | self.0.eq(&**other) 289 | } 290 | } 291 | 292 | impl<'a> PartialEq> for &'a Gc { 293 | fn eq(&self, other: &Cow<'a, Gc>) -> bool { 294 | self.0.eq((*other).deref()) 295 | } 296 | } 297 | /* Delegates to the `Gc == char` comparison so equality is symmetric: a cluster carrying marks never equals a bare `char`. */ 298 | impl PartialEq for char { 299 | fn eq(&self, other: &Gc) -> bool { 300 | other.eq(self) 301 | } 302 | } 303 | 304 | impl PartialEq for str { 305 | fn eq(&self, other: &Gc) -> bool { 306 | self.eq(&other.0) 307 | } 308 | } 309 | 310 | impl<'a> PartialEq for &'a str { 311 | fn eq(&self, other: &Gc) -> bool { 312 | self.eq(&&other.0) 313 | } 314 | } 315 | 316 | impl PartialEq for String { 317 | fn eq(&self, other: &Gc) -> bool { 318 | self.eq(&other.as_str()) 319 | } 320 | } 321 | 322 | impl<'a> PartialEq for Cow<'a, Gc> { 323 | fn eq(&self, other: &Gc) -> bool { 324 | (**self).eq(other) 325 | } 326 | } 327 | /* Same as above for a borrowed cluster: delegates to `Gc == char` so marks are taken into account. */ 328 | impl<'a> PartialEq<&'a Gc> for char { 329 | fn eq(&self, other: &&'a Gc) -> bool { 330 | (**other).eq(self) 331 | } 332 | } 333 | 334 | impl<'a> PartialEq<&'a Gc> for str { 335 | fn eq(&self, other: &&'a Gc) -> bool { 336 | self.eq(&other.0) 337 | } 338 | } 339 | 340 | impl<'a> PartialEq<&'a Gc> for String { 341 | fn eq(&self, other: &&'a Gc) -> bool { 342 | self.eq(&other.as_str()) 343 | } 344 | } 345 | 346 | impl<'a> PartialEq<&'a Gc> for Cow<'a, Gc> { 347 | fn eq(&self, other: &&'a Gc) -> bool { 348 | (**self).eq(*other) 349 | } 350 | } 351 | 352 | impl<'a> PartialOrd<&'a Gc> for Gc { 353 | fn partial_cmp(&self, other: &&'a Gc) -> Option { 354 | self.partial_cmp(*other) 355 | } 356 | } 357 | 358 | impl<'a> PartialOrd for &'a Gc { 359 | fn partial_cmp(&self, other: &Gc) -> Option { 360 | (*self).partial_cmp(other) 361 | } 362 | } 363 | 364 | impl PartialOrd for Gc { 365 | fn partial_cmp(&self, other: &char) -> Option { 366 | if !self.has_marks() { 367 |
self.base_char().partial_cmp(other) 368 | } else { 369 | /* Equal base code points but extra marks: the cluster is the longer string, so it sorts after the bare `char` (matching the `str` ordering). */ match self.base_char().partial_cmp(other) { 370 | Some(Ordering::Equal) => Some(Ordering::Greater), 371 | other => other 372 | } 373 | } 374 | } 375 | } 376 | 377 | impl PartialOrd for Gc { 378 | fn partial_cmp(&self, other: &str) -> Option { 379 | self.0.partial_cmp(other) 380 | } 381 | } 382 | 383 | impl<'a> PartialOrd<&'a str> for Gc { 384 | fn partial_cmp(&self, other: &&'a str) -> Option { 385 | self.0.partial_cmp(*other) 386 | } 387 | } 388 | 389 | impl PartialOrd for Gc { 390 | fn partial_cmp(&self, other: &GcBuf) -> Option { 391 | self.0.partial_cmp(other.as_gc()) 392 | } 393 | } 394 | 395 | impl PartialOrd for Gc { 396 | fn partial_cmp(&self, other: &String) -> Option { 397 | self.0.partial_cmp(&**other) 398 | } 399 | } 400 | 401 | impl<'a> PartialOrd> for Gc { 402 | fn partial_cmp(&self, other: &Cow<'a, Gc>) -> Option { 403 | self.0.partial_cmp((*other).deref()) 404 | } 405 | } 406 | /* Forwards to the `Gc`-vs-`char` comparison so a borrowed cluster orders exactly like the cluster itself. */ 407 | impl<'a> PartialOrd for &'a Gc { 408 | fn partial_cmp(&self, other: &char) -> Option { 409 | (**self).partial_cmp(other) 410 | } 411 | } 412 | 413 | impl<'a> PartialOrd for &'a Gc { 414 | fn partial_cmp(&self, other: &str) -> Option { 415 | self.0.partial_cmp(other) 416 | } 417 | } 418 | 419 | impl<'a> PartialOrd for &'a Gc { 420 | fn partial_cmp(&self, other: &GcBuf) -> Option { 421 | self.0.partial_cmp(other.as_gc()) 422 | } 423 | } 424 | 425 | impl<'a> PartialOrd for &'a Gc { 426 | fn partial_cmp(&self, other: &String) -> Option { 427 | self.0.partial_cmp(&**other) 428 | } 429 | } 430 | 431 | impl<'a> PartialOrd> for &'a Gc { 432 | fn partial_cmp(&self, other: &Cow<'a, Gc>) -> Option { 433 | self.0.partial_cmp((*other).deref()) 434 | } 435 | } 436 | /* Mirror of the `Gc`-vs-`char` comparison with the result reversed, so both operand orders agree and marks are taken into account. */ 437 | impl PartialOrd for char { 438 | fn partial_cmp(&self, other: &Gc) -> Option { 439 | other.partial_cmp(self).map(Ordering::reverse) 440 | } 441 | } 442 | 443 | impl PartialOrd for str { 444 | fn partial_cmp(&self, other: &Gc) -> Option { 445 |
self.partial_cmp(&other.0) 446 | } 447 | } 448 | 449 | impl<'a> PartialOrd for &'a str { 450 | fn partial_cmp(&self, other: &Gc) -> Option { 451 | self.partial_cmp(&&other.0) 452 | } 453 | } 454 | 455 | impl PartialOrd for String { 456 | fn partial_cmp(&self, other: &Gc) -> Option { 457 | (&**self).partial_cmp(other.as_str()) 458 | } 459 | } 460 | 461 | impl<'a> PartialOrd for Cow<'a, Gc> { 462 | fn partial_cmp(&self, other: &Gc) -> Option { 463 | (**self).partial_cmp(other) 464 | } 465 | } 466 | /* Mirror of the `Gc`-vs-`char` comparison with the result reversed, so a `char` compared against a borrowed cluster takes the cluster's marks into account. */ 467 | impl<'a> PartialOrd<&'a Gc> for char { 468 | fn partial_cmp(&self, other: &&'a Gc) -> Option { 469 | (**other).partial_cmp(self).map(Ordering::reverse) 470 | } 471 | } 472 | 473 | impl<'a> PartialOrd<&'a Gc> for str { 474 | fn partial_cmp(&self, other: &&'a Gc) -> Option { 475 | self.partial_cmp(&other.0) 476 | } 477 | } 478 | 479 | impl<'a> PartialOrd<&'a Gc> for String { 480 | fn partial_cmp(&self, other: &&'a Gc) -> Option { 481 | (&**self).partial_cmp(other.as_str()) 482 | } 483 | } 484 | 485 | impl<'a> PartialOrd<&'a Gc> for Cow<'a, Gc> { 486 | fn partial_cmp(&self, other: &&'a Gc) -> Option { 487 | (**self).partial_cmp(*other) 488 | } 489 | } 490 | 491 | impl ToOwned for Gc { 492 | type Owned = GcBuf; 493 | fn to_owned(&self) -> Self::Owned { 494 | unsafe { 495 | GcBuf::from_string_unchecked(self.0.to_owned()) 496 | } 497 | } 498 | } 499 | 500 | /** 501 | An owned, single Unicode grapheme cluster (akin to `String`). 502 | 503 | See [`Gc`](struct.Gc.html) for more details. 504 | */ 505 | #[cfg(has_string_into_boxed_string)] 506 | #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] 507 | pub struct GcBuf(Box); 508 | 509 | /** 510 | An owned, single Unicode grapheme cluster (akin to `String`). 511 | 512 | See [`Gc`](struct.Gc.html) for more details.
513 | */ 514 | #[cfg(not(has_string_into_boxed_string))] 515 | #[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash)] 516 | pub struct GcBuf(String); 517 | 518 | impl GcBuf { 519 | /** 520 | Create a new `GcBuf` from the given `String`. 521 | 522 | This function *does not* check to ensure the provided string is a single, valid grapheme cluster. 523 | */ 524 | pub unsafe fn from_string_unchecked(s: String) -> GcBuf { 525 | Self::from_string_unchecked_impl(s) 526 | } 527 | /* Variant used when the `has_string_into_boxed_string` cfg is set: the `String` is converted with `into_boxed_str`. */ 528 | #[cfg(has_string_into_boxed_string)] 529 | unsafe fn from_string_unchecked_impl(s: String) -> GcBuf { 530 | GcBuf(s.into_boxed_str()) 531 | } 532 | /* Variant used when that cfg is not set: the `String` is stored as-is. */ 533 | #[cfg(not(has_string_into_boxed_string))] 534 | unsafe fn from_string_unchecked_impl(s: String) -> GcBuf { 535 | GcBuf(s) 536 | } 537 | 538 | /** 539 | Returns a borrowed grapheme cluster slice. 540 | */ 541 | pub fn as_gc(&self) -> &Gc { 542 | unsafe { 543 | Gc::from_str_unchecked(&self.0) 544 | } 545 | } 546 | } 547 | 548 | impl AsRef for GcBuf { 549 | fn as_ref(&self) -> &Gc { 550 | self.as_gc() 551 | } 552 | } 553 | 554 | impl AsRef for GcBuf { 555 | fn as_ref(&self) -> &str { 556 | self.as_str() 557 | } 558 | } 559 | 560 | impl AsRef<[u8]> for GcBuf { 561 | fn as_ref(&self) -> &[u8] { 562 | self.as_str().as_bytes() 563 | } 564 | } 565 | 566 | impl Borrow for GcBuf { 567 | fn borrow(&self) -> &Gc { 568 | self.as_gc() 569 | } 570 | } 571 | 572 | impl Debug for GcBuf { 573 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 574 | Debug::fmt(&self.0, fmt) 575 | } 576 | } 577 | /* The default value is the single code point U+0000. */ 578 | impl Default for GcBuf { 579 | fn default() -> Self { 580 | unsafe { 581 | GcBuf::from_string_unchecked(String::from("\u{0}")) 582 | } 583 | } 584 | } 585 | 586 | impl Deref for GcBuf { 587 | type Target = Gc; 588 | fn deref(&self) -> &Gc { 589 | self.as_gc() 590 | } 591 | } 592 | 593 | impl Display for GcBuf { 594 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 595 | Display::fmt(&self.0, fmt) 596 | } 597 | } 598 | 599 | impl<'a> From<&'a Gc>
for GcBuf { 600 | fn from(v: &'a Gc) -> Self { 601 | unsafe { 602 | GcBuf::from_string_unchecked(v.as_str().to_owned()) 603 | } 604 | } 605 | } 606 | 607 | impl From for GcBuf { 608 | fn from(v: char) -> Self { 609 | unsafe { 610 | let mut buf = [0; 4]; 611 | let bs = match ::util::encode_utf8_raw(v as u32, &mut buf) { 612 | Some(len) => { 613 | if len <= 4 { 614 | &buf[..len] 615 | } else { 616 | debug_unreachable!(); 617 | } 618 | }, 619 | None => debug_unreachable!(), 620 | }; 621 | let s: &str = transmute(bs); 622 | let s = s.to_owned(); 623 | GcBuf::from_string_unchecked(s) 624 | } 625 | } 626 | } 627 | 628 | #[cfg(has_string_into_boxed_string)] 629 | impl Into> for GcBuf { 630 | fn into(self) -> Box { 631 | self.0 632 | } 633 | } 634 | 635 | #[cfg(has_string_into_boxed_string)] 636 | impl Into for GcBuf { 637 | fn into(self) -> String { 638 | self.0.into_string() 639 | } 640 | } 641 | 642 | #[cfg(has_string_into_boxed_string)] 643 | impl Into> for GcBuf { 644 | fn into(self) -> Vec { 645 | self.0.into_string().into() 646 | } 647 | } 648 | 649 | #[cfg(not(has_string_into_boxed_string))] 650 | impl Into for GcBuf { 651 | fn into(self) -> String { 652 | self.0 653 | } 654 | } 655 | 656 | #[cfg(not(has_string_into_boxed_string))] 657 | impl Into> for GcBuf { 658 | fn into(self) -> Vec { 659 | self.0.into() 660 | } 661 | } 662 | 663 | macro_rules! as_item { 664 | ($i:item) => { $i }; 665 | } 666 | 667 | macro_rules! forward_partial_eq { 668 | (~ <$lt:tt> $lhs:ty, $rhs:ty) => { 669 | as_item! { 670 | impl<$lt> PartialEq<$rhs> for $lhs { 671 | fn eq(&self, other: &$rhs) -> bool { 672 | other.as_gc().eq(self) 673 | } 674 | } 675 | } 676 | }; 677 | 678 | (~ $lhs:ty, $rhs:ty) => { 679 | impl PartialEq<$rhs> for $lhs { 680 | fn eq(&self, other: &$rhs) -> bool { 681 | other.as_gc().eq(self) 682 | } 683 | } 684 | }; 685 | 686 | (<$lt:tt> $lhs:ty, $rhs:ty) => { 687 | as_item! 
{ 688 | impl<$lt> PartialEq<$rhs> for $lhs { 689 | fn eq(&self, other: &$rhs) -> bool { 690 | self.as_gc().eq(other) 691 | } 692 | } 693 | } 694 | }; 695 | 696 | ($lhs:ty, $rhs:ty) => { 697 | impl PartialEq<$rhs> for $lhs { 698 | fn eq(&self, other: &$rhs) -> bool { 699 | self.as_gc().eq(other) 700 | } 701 | } 702 | }; 703 | } 704 | 705 | forward_partial_eq! { GcBuf, char } 706 | forward_partial_eq! { GcBuf, str } 707 | forward_partial_eq! { GcBuf, Gc } 708 | forward_partial_eq! { GcBuf, String } 709 | forward_partial_eq! { <'a> GcBuf, &'a str } 710 | forward_partial_eq! { <'a> GcBuf, &'a Gc } 711 | forward_partial_eq! { <'a> GcBuf, Cow<'a, Gc> } 712 | 713 | forward_partial_eq! { ~ char, GcBuf } 714 | forward_partial_eq! { ~ str, GcBuf } 715 | forward_partial_eq! { ~ String, GcBuf } 716 | forward_partial_eq! { ~ <'a> &'a str, GcBuf } 717 | forward_partial_eq! { ~ <'a> Cow<'a, Gc>, GcBuf } 718 | 719 | macro_rules! forward_partial_ord { 720 | (~ <$lt:tt> $lhs:ty, $rhs:ty) => { 721 | as_item! { 722 | impl<$lt> PartialOrd<$rhs> for $lhs { 723 | fn partial_cmp(&self, other: &$rhs) -> Option { 724 | other.as_gc().partial_cmp(self).map(Ordering::reverse) 725 | } 726 | } 727 | } 728 | }; 729 | 730 | (~ $lhs:ty, $rhs:ty) => { 731 | impl PartialOrd<$rhs> for $lhs { 732 | fn partial_cmp(&self, other: &$rhs) -> Option { 733 | other.as_gc().partial_cmp(self).map(Ordering::reverse) 734 | } 735 | } 736 | }; 737 | 738 | (<$lt:tt> $lhs:ty, $rhs:ty) => { 739 | as_item! { 740 | impl<$lt> PartialOrd<$rhs> for $lhs { 741 | fn partial_cmp(&self, other: &$rhs) -> Option { 742 | self.as_gc().partial_cmp(other) 743 | } 744 | } 745 | } 746 | }; 747 | 748 | ($lhs:ty, $rhs:ty) => { 749 | impl PartialOrd<$rhs> for $lhs { 750 | fn partial_cmp(&self, other: &$rhs) -> Option { 751 | self.as_gc().partial_cmp(other) 752 | } 753 | } 754 | }; 755 | } 756 | 757 | forward_partial_ord! { GcBuf, char } 758 | forward_partial_ord! { GcBuf, str } 759 | forward_partial_ord! 
{ GcBuf, Gc } 760 | forward_partial_ord! { GcBuf, String } 761 | forward_partial_ord! { <'a> GcBuf, &'a str } 762 | forward_partial_ord! { <'a> GcBuf, &'a Gc } 763 | forward_partial_ord! { <'a> GcBuf, Cow<'a, Gc> } 764 | 765 | forward_partial_ord! { ~ char, GcBuf } 766 | forward_partial_ord! { ~ str, GcBuf } 767 | forward_partial_ord! { ~ String, GcBuf } 768 | forward_partial_ord! { ~ <'a> &'a str, GcBuf } 769 | forward_partial_ord! { ~ <'a> Cow<'a, Gc>, GcBuf } 770 | 771 | #[cfg(test)] 772 | mod gc_tests { 773 | use super::Gc; 774 | 775 | fn gc(s: &str) -> &Gc { 776 | Gc::from_str(s).unwrap() 777 | } 778 | 779 | #[test] 780 | fn test_from_str() { 781 | assert_eq!(Gc::from_str("a").map(Gc::as_str), Some("a")); 782 | assert_eq!(Gc::from_str("á").map(Gc::as_str), Some("á")); 783 | assert_eq!(Gc::from_str("ä").map(Gc::as_str), Some("ä")); 784 | assert_eq!(Gc::from_str("̈").map(Gc::as_str), Some("̈")); // NB: there is a single combining diaereses in the string. 785 | assert_eq!(Gc::from_str("字").map(Gc::as_str), Some("字")); 786 | assert_eq!(Gc::from_str("").map(Gc::as_str), None); 787 | assert_eq!(Gc::from_str("ab").map(Gc::as_str), None); 788 | } 789 | 790 | #[test] 791 | fn test_split_from() { 792 | fn map<'a>((gr, s): (&'a Gc, &'a str)) -> (&'a str, &'a str) { 793 | (gr.as_str(), s) 794 | } 795 | 796 | assert_eq!(Gc::split_from("a").map(map), Some(("a", ""))); 797 | assert_eq!(Gc::split_from("á").map(map), Some(("á", ""))); 798 | assert_eq!(Gc::split_from("ä").map(map), Some(("ä", ""))); 799 | assert_eq!(Gc::split_from("̈").map(map), Some(("̈", ""))); // NB: there is a single combining diaereses in the string. 
800 | assert_eq!(Gc::split_from("字").map(map), Some(("字", ""))); 801 | assert_eq!(Gc::split_from("").map(map), None); 802 | assert_eq!(Gc::split_from("ab").map(map), Some(("a", "b"))); 803 | } 804 | 805 | #[test] 806 | fn test_has_marks() { 807 | assert!(!gc("a").has_marks()); 808 | assert!(!gc("á").has_marks()); 809 | assert!(gc("ä").has_marks()); 810 | assert!(!gc("̈").has_marks()); 811 | assert!(!gc("字").has_marks()); 812 | } 813 | 814 | #[test] 815 | fn test_base_char() { 816 | assert_eq!(gc("a").base_char(), 'a'); 817 | assert_eq!(gc("á").base_char(), 'á'); 818 | assert_eq!(gc("ä").base_char(), 'a'); 819 | assert_eq!(gc("̈").base_char(), '̈'); 820 | assert_eq!(gc("字").base_char(), '字'); 821 | } 822 | 823 | #[test] 824 | fn test_mark_str() { 825 | assert_eq!(gc("a").mark_str(), ""); 826 | assert_eq!(gc("á").mark_str(), ""); 827 | assert_eq!(gc("ä").mark_str(), "̈"); 828 | assert_eq!(gc("̈").mark_str(), ""); 829 | assert_eq!(gc("字").mark_str(), ""); 830 | } 831 | } -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright ⓒ 2015, 2016 Daniel Keep. 3 | 4 | Licensed under the MIT license (see LICENSE or ) or the Apache License, Version 2.0 (see LICENSE of 6 | ), at your option. All 7 | files in the project carrying such notice may not be copied, modified, 8 | or distributed except according to those terms. 9 | */ 10 | /*! 11 | This crate provides a "cursor" type for string slices. It provides the ability to safely seek back and forth through a string without worrying about producing invalid UTF-8 sequences, or splitting grapheme clusters. 12 | 13 | In addition, it provides types to represent single grapheme clusters ([`Gc`](struct.Gc.html) and [`GcBuf`](struct.GcBuf.html)) as distinct from arbitrary string slices. 14 | 15 | See the [`StrCursor`](struct.StrCursor.html) type for details. 
16 | 17 | 30 | 39 | 40 | ## Compatibility 41 | 42 | `strcursor` is currently supported on `rustc` version 1.1.0 and higher. 43 | 44 | * `rustc` < 1.4 will use a larger, less space-efficient implementation of `GcBuf`; rather than being the same size as `Box`, it will be the same size as `String`. 45 | 46 | * `rustc` < 1.1 is not supported, due to a mysterious compiler crash. 47 | 48 | */ 49 | extern crate unicode_segmentation as uniseg; 50 | 51 | /** 52 | Inserts a panic in debug builds, an optimisation hint in release builds. 53 | 54 | **Do not replace this with the `debug_unreachable` crate.** Recent versions of that crate do not build under Rust < 1.6, and old versions that used to no longer will, as they have sufficiently vague dependency version specifiers. 55 | */ 56 | #[doc(hidden)] 57 | macro_rules! debug_unreachable { 58 | () => { 59 | if cfg!(ndebug) { 60 | ::util::unreachable() 61 | } else { 62 | panic!("entered unreachable code") 63 | } 64 | }; 65 | } 66 | 67 | pub use grapheme::{Gc, GcBuf}; 68 | 69 | pub mod grapheme; 70 | mod util; 71 | 72 | use uniseg::UnicodeSegmentation as UniSeg; 73 | 74 | /** 75 | This type represents a cursor into a string slice; that is, in addition to having a beginning and end, it also has a current position between those two. This position can be seeked left and right within those bounds. 76 | 77 | > **Note**: the cursor may validly be positioned *at* the end of the string. That is, in a position where there are no code points or grapheme clusters to the right of the cursor, and the entire contents of the string is to the left of the cursor. 78 | 79 | The main reason for this is that *sometimes*, you want the ability to do things like "advance a character", and the existing APIs for this can be somewhat verbose. 
80 | 81 | In addition, *unstable* support for grapheme clusters is exposed by the standard library, which conflicts with the *stable* support provided by the `unicode-segmentation` crate, which makes doing "the right thing" painful. `StrCursor` exposes grapheme clusters by default, and makes them cleaner to work with. 82 | 83 | The cursor guarantees the following at all times: 84 | 85 | * The cursor position *cannot* be outside of the original string slice it was constructed with. 86 | * The cursor position *cannot* lie between Unicode code points, meaning that you *cannot* generate an invalid string slice from a cursor. 87 | * If the code point-specific methods are *not* used, the cursor will always lie between grapheme clusters. 88 | 89 | This last point is somewhat important: the cursor is designed to favour operating on grapheme clusters, rather than code points. If you misalign the cursor with respect to grapheme clusters, the behaviour of methods that deal with grapheme clusters is officially *undefined*, but is generally well-behaved. 90 | 91 | The methods that operate on the cursor will either return a fresh `Option` (depending on whether the seek operation is valid or not), or mutate the existing cursor (in which case, they will *panic* if the seek operation is not valid). 92 | */ 93 | pub struct StrCursor<'a> { 94 | s: &'a str, 95 | at: *const u8, 96 | } 97 | 98 | impl<'a> StrCursor<'a> { 99 | /** 100 | Create a new cursor at the start of `s`. 101 | */ 102 | #[inline] 103 | pub fn new_at_start(s: &'a str) -> StrCursor<'a> { 104 | StrCursor { 105 | s: s, 106 | at: s.as_ptr(), 107 | } 108 | } 109 | 110 | /** 111 | Create a new cursor past at the end of `s`. 112 | */ 113 | #[inline] 114 | pub fn new_at_end(s: &'a str) -> StrCursor<'a> { 115 | StrCursor { 116 | s: s, 117 | at: byte_pos_to_ptr(s, s.len()), 118 | } 119 | } 120 | 121 | /** 122 | Create a new cursor at the first grapheme cluster which begins at or to the left of the given byte position. 
123 | */ 124 | #[inline] 125 | pub fn new_at_left_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> { 126 | // Start at a codepoint. 127 | let cur = StrCursor::new_at_cp_left_of_byte_pos(s, byte_pos); 128 | 129 | // Seek back to the previous grapheme. 130 | let prev = cur.at_prev(); 131 | 132 | let prev = match prev { 133 | None => return cur, // We were already at the start. 134 | Some(c) => c 135 | }; 136 | 137 | // unwrap should be OK here. 138 | if prev.byte_pos() + prev.after().unwrap().len() > byte_pos { 139 | prev 140 | } else { 141 | cur 142 | } 143 | } 144 | 145 | /** 146 | Create a new cursor at the first grapheme cluster which begins at or to the right of the given byte position. 147 | */ 148 | #[inline] 149 | pub fn new_at_right_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> { 150 | // I don't know how robust the grapheme iteration rules are when trying to step forward from a (potentially) invalid position. As such, I'm *instead* going to start from a known-good position. 151 | let cur = StrCursor::new_at_left_of_byte_pos(s, byte_pos); 152 | if cur.byte_pos() == byte_pos { 153 | return cur; 154 | } 155 | 156 | // This unwrap shouldn't be able to fail. 157 | cur.at_next().unwrap() 158 | } 159 | 160 | /** 161 | Create a new cursor at the first code point which begins at or to the left of the given byte position. 162 | 163 | # Note 164 | 165 | Where possible, you should prefer `new_at_left_of_byte_pos`. 166 | */ 167 | #[inline] 168 | pub fn new_at_cp_left_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> { 169 | StrCursor { 170 | s: s, 171 | at: unsafe { seek_utf8_cp_start_left(s, byte_pos_to_ptr(s, byte_pos)) }, 172 | } 173 | } 174 | 175 | /** 176 | Create a new cursor at the first code point which begins at or to the right of the given byte position. 177 | 178 | # Note 179 | 180 | Where possible, you should prefer `new_at_right_of_byte_pos`. 
181 | */ 182 | #[inline] 183 | pub fn new_at_cp_right_of_byte_pos(s: &'a str, byte_pos: usize) -> StrCursor<'a> { 184 | StrCursor { 185 | s: s, 186 | at: unsafe { seek_utf8_cp_start_right(s, byte_pos_to_ptr(s, byte_pos)) }, 187 | } 188 | } 189 | 190 | /** 191 | Returns a new cursor at the beginning of the previous grapheme cluster, or `None` if the cursor is currently positioned at the beginning of the string. 192 | */ 193 | #[inline] 194 | pub fn at_prev(mut self) -> Option> { 195 | match self.try_seek_left_gr() { 196 | true => Some(self), 197 | false => None 198 | } 199 | } 200 | 201 | /** 202 | Returns a new cursor at the beginning of the next grapheme cluster, or `None` if the cursor is currently positioned at the end of the string. 203 | */ 204 | #[inline] 205 | pub fn at_next(mut self) -> Option> { 206 | match self.try_seek_right_gr() { 207 | true => Some(self), 208 | false => None 209 | } 210 | } 211 | 212 | /** 213 | Returns a new cursor at the beginning of the previous code point, or `None` if the cursor is currently positioned at the beginning of the string. 214 | 215 | # Note 216 | 217 | Where possible, you should prefer `at_prev`. 218 | */ 219 | #[inline] 220 | pub fn at_prev_cp(mut self) -> Option> { 221 | match self.try_seek_left_cp() { 222 | true => Some(self), 223 | false => None 224 | } 225 | } 226 | 227 | /** 228 | Returns a new cursor at the beginning of the next code point, or `None` if the cursor is currently positioned at the end of the string. 229 | 230 | # Note 231 | 232 | Where possible, you should prefer `at_next`. 233 | */ 234 | #[inline] 235 | pub fn at_next_cp(mut self) -> Option> { 236 | match self.try_seek_right_cp() { 237 | true => Some(self), 238 | false => None 239 | } 240 | } 241 | 242 | /** 243 | Seeks the cursor to the beginning of the previous grapheme cluster. 244 | 245 | # Panics 246 | 247 | If the cursor is currently at the start of the string, then this function will panic. 
248 | */ 249 | #[inline] 250 | pub fn seek_prev(&mut self) { 251 | if !self.try_seek_left_gr() { 252 | panic!("cannot seek past the beginning of a string"); 253 | } 254 | } 255 | 256 | /** 257 | Seeks the cursor to the beginning of the next grapheme cluster. 258 | 259 | # Panics 260 | 261 | If the cursor is currently at the end of the string, then this function will panic. 262 | */ 263 | #[inline] 264 | pub fn seek_next(&mut self) { 265 | if !self.try_seek_right_gr() { 266 | panic!("cannot seek past the end of a string"); 267 | } 268 | } 269 | 270 | /** 271 | Seeks the cursor to the beginning of the previous code point. 272 | 273 | # Panics 274 | 275 | If the cursor is currently at the start of the string, then this function will panic. 276 | 277 | # Note 278 | 279 | Where possible, you should prefer `seek_prev`. 280 | */ 281 | #[inline] 282 | pub fn seek_prev_cp(&mut self) { 283 | if !self.try_seek_left_cp() { 284 | panic!("cannot seek past the beginning of a string"); 285 | } 286 | } 287 | 288 | /** 289 | Seeks the cursor to the beginning of the next code point. 290 | 291 | # Panics 292 | 293 | If the cursor is currently at the end of the string, then this function will panic. 294 | 295 | # Note 296 | 297 | Where possible, you should prefer `seek_next`. 298 | */ 299 | #[inline] 300 | pub fn seek_next_cp(&mut self) { 301 | if !self.try_seek_right_cp() { 302 | panic!("cannot seek past the end of a string"); 303 | } 304 | } 305 | 306 | /** 307 | Returns both the previous grapheme cluster and the cursor having seeked before it. 308 | 309 | This may be more efficient than doing both operations individually. 
310 | */ 311 | #[inline] 312 | pub fn prev(mut self) -> Option<(&'a Gc, StrCursor<'a>)> { 313 | unsafe { 314 | let g = match self.before() { 315 | Some(g) => g, 316 | None => return None, 317 | }; 318 | self.unsafe_set_at(g.as_str()); 319 | Some((g, self)) 320 | } 321 | } 322 | 323 | /** 324 | Returns both the previous code point and the cursor having seeked before it. 325 | 326 | This may be more efficient than doing both operations individually. 327 | 328 | # Note 329 | 330 | Where possible, you should prefer `prev`. 331 | */ 332 | #[inline] 333 | pub fn prev_cp(mut self) -> Option<(char, StrCursor<'a>)> { 334 | unsafe { 335 | let cp = match self.cp_before() { 336 | Some(cp) => cp, 337 | None => return None, 338 | }; 339 | self.unsafe_seek_left(cp.len_utf8()); 340 | Some((cp, self)) 341 | } 342 | } 343 | 344 | /** 345 | Returns both the next grapheme cluster and the cursor having seeked past it. 346 | 347 | This may be more efficient than doing both operations individually. 348 | */ 349 | #[inline] 350 | pub fn next(mut self) -> Option<(&'a Gc, StrCursor<'a>)> { 351 | unsafe { 352 | let g = match self.after() { 353 | Some(g) => g, 354 | None => return None, 355 | }; 356 | self.unsafe_seek_right(g.len()); 357 | Some((g, self)) 358 | } 359 | } 360 | 361 | /** 362 | Returns both the next code point and the cursor having seeked past it. 363 | 364 | This may be more efficient than doing both operations individually. 365 | 366 | # Note 367 | 368 | Where possible, you should prefer `next`. 369 | */ 370 | #[inline] 371 | pub fn next_cp(mut self) -> Option<(char, StrCursor<'a>)> { 372 | unsafe { 373 | let cp = match self.cp_after() { 374 | Some(cp) => cp, 375 | None => return None, 376 | }; 377 | self.unsafe_seek_right(cp.len_utf8()); 378 | Some((cp, self)) 379 | } 380 | } 381 | 382 | /** 383 | Returns the grapheme cluster immediately to the left of the cursor, or `None` is the cursor is at the start of the string. 
384 | */ 385 | #[inline] 386 | pub fn before(&self) -> Option<&'a Gc> { 387 | self.at_prev().and_then(|cur| cur.after()) 388 | } 389 | 390 | /** 391 | Returns the grapheme cluster immediately to the right of the cursor, or `None` if the cursor is at the end of the string. 392 | */ 393 | #[inline] 394 | pub fn after(&self) -> Option<&'a Gc> { 395 | Gc::split_from(self.slice_after()).map(|(gc, _)| gc) 396 | } 397 | 398 | /** 399 | Returns the contents of the string to the left of the cursor. 400 | */ 401 | #[inline] 402 | pub fn slice_before(&self) -> &'a str { 403 | unsafe { 404 | self.s.slice_unchecked(0, self.byte_pos()) 405 | } 406 | } 407 | 408 | /** 409 | Returns the contents of the string to the right of the cursor. 410 | */ 411 | #[inline] 412 | pub fn slice_after(&self) -> &'a str { 413 | unsafe { 414 | self.s.slice_unchecked(self.byte_pos(), self.s.len()) 415 | } 416 | } 417 | 418 | /** 419 | Returns the contents of the string *between* this cursor and another cursor. 420 | 421 | Returns `None` if the cursors are from different strings (even different subsets of the same string). 422 | */ 423 | #[inline] 424 | pub fn slice_between(&self, until: StrCursor<'a>) -> Option<&'a str> { 425 | if !str_eq_literal(self.s, until.s) { 426 | None 427 | } else { 428 | use std::cmp::{max, min}; 429 | unsafe { 430 | let beg = min(self.at, until.at); 431 | let end = max(self.at, until.at); 432 | let len = end as usize - beg as usize; 433 | let bytes = ::std::slice::from_raw_parts(beg, len); 434 | Some(::std::str::from_utf8_unchecked(bytes)) 435 | } 436 | } 437 | } 438 | 439 | /** 440 | Returns the code point immediately to the left of the cursor, or `None` if the cursor is at the start of the string. 441 | */ 442 | #[inline] 443 | pub fn cp_before(&self) -> Option { 444 | self.at_prev_cp().and_then(|cur| cur.cp_after()) 445 | } 446 | 447 | /** 448 | Returns the code point immediately to the right of the cursor, or `None` if the cursor is at the end of the string. 
449 | */ 450 | #[inline] 451 | pub fn cp_after(&self) -> Option { 452 | self.slice_after().chars().next() 453 | } 454 | 455 | /** 456 | Returns the entire string slice behind the cursor. 457 | */ 458 | #[inline] 459 | pub fn slice_all(&self) -> &'a str { 460 | self.s 461 | } 462 | 463 | /** 464 | Returns the cursor's current position within the string as the number of UTF-8 code units from the beginning of the string. 465 | */ 466 | #[inline] 467 | pub fn byte_pos(&self) -> usize { 468 | self.at as usize - self.s.as_ptr() as usize 469 | } 470 | 471 | #[inline] 472 | fn try_seek_left_cp(&mut self) -> bool { 473 | unsafe { 474 | // We just have to ensure that offsetting the `at` pointer *at all* is safe. 475 | if self.byte_pos() == 0 { 476 | return false; 477 | } 478 | self.at = seek_utf8_cp_start_left(self.s, self.at.offset(-1)); 479 | true 480 | } 481 | } 482 | 483 | #[inline] 484 | fn try_seek_right_cp(&mut self) -> bool { 485 | unsafe { 486 | // We just have to ensure that offsetting the `at` pointer *at all* is safe. 
487 | if self.byte_pos() == self.s.len() { 488 | return false; 489 | } 490 | self.at = seek_utf8_cp_start_right(self.s, self.at.offset(1)); 491 | true 492 | } 493 | } 494 | 495 | #[inline] 496 | fn try_seek_left_gr(&mut self) -> bool { 497 | let len = { 498 | let gr = UniSeg::graphemes(self.slice_before(), /*is_extended:*/true).next_back(); 499 | gr.map(|gr| gr.len()) 500 | }; 501 | match len { 502 | Some(len) => { 503 | unsafe { 504 | self.at = self.at.offset(-(len as isize)); 505 | } 506 | true 507 | }, 508 | None => false 509 | } 510 | } 511 | 512 | #[inline] 513 | fn try_seek_right_gr(&mut self) -> bool { 514 | let len = { 515 | let gr = UniSeg::graphemes(self.slice_after(), /*is_extended:*/true).next(); 516 | gr.map(|gr| gr.len()) 517 | }; 518 | match len { 519 | Some(len) => { 520 | unsafe { 521 | self.at = self.at.offset(len as isize); 522 | } 523 | true 524 | }, 525 | None => false 526 | } 527 | } 528 | 529 | /** 530 | Seeks exactly `bytes` left, without performing any bounds or validity checks. 531 | */ 532 | #[inline] 533 | pub unsafe fn unsafe_seek_left(&mut self, bytes: usize) { 534 | self.at = self.at.offset(-(bytes as isize)); 535 | } 536 | 537 | /** 538 | Seeks exactly `bytes` right, without performing any bounds or validity checks. 539 | */ 540 | #[inline] 541 | pub unsafe fn unsafe_seek_right(&mut self, bytes: usize) { 542 | self.at = self.at.offset(bytes as isize); 543 | } 544 | 545 | /** 546 | Seeks to the start of `s`, without performing any bounds or validity checks. 
547 | */ 548 | #[inline] 549 | pub unsafe fn unsafe_set_at(&mut self, s: &'a str) { 550 | self.at = s.as_bytes().as_ptr(); 551 | } 552 | } 553 | 554 | impl<'a> Copy for StrCursor<'a> {} 555 | 556 | impl<'a> Clone for StrCursor<'a> { 557 | fn clone(&self) -> StrCursor<'a> { 558 | *self 559 | } 560 | } 561 | 562 | impl<'a> std::fmt::Debug for StrCursor<'a> { 563 | fn fmt(&self, fmt: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { 564 | write!(fmt, "StrCursor({:?} | {:?})", self.slice_before(), self.slice_after()) 565 | } 566 | } 567 | 568 | impl<'a> Eq for StrCursor<'a> {} 569 | 570 | impl<'a> PartialEq for StrCursor<'a> { 571 | fn eq(&self, other: &StrCursor<'a>) -> bool { 572 | (self.at == other.at) 573 | && (self.s.as_ptr() == other.s.as_ptr()) 574 | && (self.s.len() == other.s.len()) 575 | } 576 | 577 | fn ne(&self, other: &StrCursor<'a>) -> bool { 578 | (self.at != other.at) 579 | || (self.s.as_ptr() != other.s.as_ptr()) 580 | || (self.s.len() != other.s.len()) 581 | } 582 | } 583 | 584 | impl<'a> PartialOrd for StrCursor<'a> { 585 | fn partial_cmp(&self, other: &StrCursor<'a>) -> Option { 586 | // If the cursors are from different strings, they are unordered. 
587 | if (self.s.as_ptr() != other.s.as_ptr()) || (self.s.len() != other.s.len()) { 588 | None 589 | } else { 590 | self.at.partial_cmp(&other.at) 591 | } 592 | } 593 | } 594 | 595 | impl<'a> std::hash::Hash for StrCursor<'a> { 596 | fn hash(&self, state: &mut H) 597 | where H: std::hash::Hasher { 598 | self.s.as_ptr().hash(state); 599 | self.s.len().hash(state); 600 | self.at.hash(state); 601 | } 602 | } 603 | 604 | #[cfg(test)] 605 | #[test] 606 | fn test_new_at_start() { 607 | let cur = StrCursor::new_at_start("abcdef"); 608 | assert_eq!(cur.slice_before(), ""); 609 | assert_eq!(cur.slice_after(), "abcdef"); 610 | } 611 | 612 | #[cfg(test)] 613 | #[test] 614 | fn test_new_at_end() { 615 | let cur = StrCursor::new_at_end("abcdef"); 616 | assert_eq!(cur.slice_before(), "abcdef"); 617 | assert_eq!(cur.slice_after(), ""); 618 | } 619 | 620 | #[cfg(test)] 621 | #[test] 622 | fn test_new_at_cp_left_of_byte_pos() { 623 | let s = "This is a 本当 test."; 624 | let cur = StrCursor::new_at_cp_left_of_byte_pos(s, 11); 625 | assert_eq!(cur.slice_before(), "This is a "); 626 | assert_eq!(cur.slice_after(), "本当 test."); 627 | } 628 | 629 | #[cfg(test)] 630 | #[test] 631 | fn test_new_at_cp_right_of_byte_pos() { 632 | let s = "This is a 本当 test."; 633 | let cur = StrCursor::new_at_cp_right_of_byte_pos(s, 11); 634 | assert_eq!(cur.slice_before(), "This is a 本"); 635 | assert_eq!(cur.slice_after(), "当 test."); 636 | } 637 | 638 | #[cfg(test)] 639 | #[test] 640 | fn test_new_at_left_of_byte_pos() { 641 | let s = "Jäger,Jäger,大嫌い,💪❤!"; 642 | let r = (0..s.len()+1).map(|i| (i, StrCursor::new_at_left_of_byte_pos(s, i))) 643 | .map(|(i, cur)| (i, cur.byte_pos(), cur.after().map(Gc::as_str))) 644 | .collect::>(); 645 | assert_eq!(r, vec![ 646 | (0, 0, Some("J")), 647 | (1, 1, Some("ä")), 648 | (2, 1, Some("ä")), 649 | (3, 3, Some("g")), 650 | (4, 4, Some("e")), 651 | (5, 5, Some("r")), 652 | (6, 6, Some(",")), 653 | (7, 7, Some("J")), 654 | (8, 8, Some("ä")), 655 | (9, 8, Some("ä")), 656 
| (10, 8, Some("ä")), 657 | (11, 11, Some("g")), 658 | (12, 12, Some("e")), 659 | (13, 13, Some("r")), 660 | (14, 14, Some(",")), 661 | (15, 15, Some("大")), 662 | (16, 15, Some("大")), 663 | (17, 15, Some("大")), 664 | (18, 18, Some("嫌")), 665 | (19, 18, Some("嫌")), 666 | (20, 18, Some("嫌")), 667 | (21, 21, Some("い")), 668 | (22, 21, Some("い")), 669 | (23, 21, Some("い")), 670 | (24, 24, Some(",")), 671 | (25, 25, Some("💪")), 672 | (26, 25, Some("💪")), 673 | (27, 25, Some("💪")), 674 | (28, 25, Some("💪")), 675 | (29, 29, Some("❤")), 676 | (30, 29, Some("❤")), 677 | (31, 29, Some("❤")), 678 | (32, 32, Some("!")), 679 | (33, 33, None), 680 | ]); 681 | } 682 | 683 | #[cfg(test)] 684 | #[test] 685 | fn test_new_at_right_of_byte_pos() { 686 | let s = "Jäger,Jäger,大嫌い,💪❤!"; 687 | let r = (0..s.len()+1).map(|i| (i, StrCursor::new_at_right_of_byte_pos(s, i))) 688 | .map(|(i, cur)| (i, cur.byte_pos(), cur.after().map(Gc::as_str))) 689 | .collect::>(); 690 | assert_eq!(r, vec![ 691 | (0, 0, Some("J")), 692 | (1, 1, Some("ä")), 693 | (2, 3, Some("g")), 694 | (3, 3, Some("g")), 695 | (4, 4, Some("e")), 696 | (5, 5, Some("r")), 697 | (6, 6, Some(",")), 698 | (7, 7, Some("J")), 699 | (8, 8, Some("ä")), 700 | (9, 11, Some("g")), 701 | (10, 11, Some("g")), 702 | (11, 11, Some("g")), 703 | (12, 12, Some("e")), 704 | (13, 13, Some("r")), 705 | (14, 14, Some(",")), 706 | (15, 15, Some("大")), 707 | (16, 18, Some("嫌")), 708 | (17, 18, Some("嫌")), 709 | (18, 18, Some("嫌")), 710 | (19, 21, Some("い")), 711 | (20, 21, Some("い")), 712 | (21, 21, Some("い")), 713 | (22, 24, Some(",")), 714 | (23, 24, Some(",")), 715 | (24, 24, Some(",")), 716 | (25, 25, Some("💪")), 717 | (26, 29, Some("❤")), 718 | (27, 29, Some("❤")), 719 | (28, 29, Some("❤")), 720 | (29, 29, Some("❤")), 721 | (30, 32, Some("!")), 722 | (31, 32, Some("!")), 723 | (32, 32, Some("!")), 724 | (33, 33, None), 725 | ]); 726 | } 727 | 728 | #[cfg(test)] 729 | #[test] 730 | fn test_at_prev_cp() { 731 | let s = "大嫌い,💪❤"; 732 | let cur = 
StrCursor::new_at_end(s); 733 | let bps = test_util::finite_iterate(cur, StrCursor::at_prev_cp) 734 | .map(|cur| cur.byte_pos()) 735 | .collect::>(); 736 | assert_eq!(bps, vec![14, 10, 9, 6, 3, 0]); 737 | } 738 | 739 | #[cfg(test)] 740 | #[test] 741 | fn test_at_next_cp() { 742 | let s = "大嫌い,💪❤"; 743 | let cur = StrCursor::new_at_start(s); 744 | let bps = test_util::finite_iterate(cur, StrCursor::at_next_cp) 745 | .map(|cur| cur.byte_pos()) 746 | .collect::>(); 747 | assert_eq!(bps, vec![3, 6, 9, 10, 14, 17]); 748 | } 749 | 750 | #[cfg(test)] 751 | #[test] 752 | fn test_at_prev_and_before() { 753 | let s = "noe\u{0308}l"; 754 | let cur = StrCursor::new_at_end(s); 755 | let bps = test_util::finite_iterate_lead(cur, StrCursor::at_prev) 756 | .map(|cur| (cur.byte_pos(), cur.after().map(Gc::as_str))) 757 | .collect::>(); 758 | assert_eq!(bps, vec![ 759 | (6, None), 760 | (5, Some("l")), 761 | (2, Some("e\u{0308}")), 762 | (1, Some("o")), 763 | (0, Some("n")), 764 | ]); 765 | } 766 | 767 | #[cfg(test)] 768 | #[test] 769 | fn test_at_next_and_after() { 770 | let s = "noe\u{0308}l"; 771 | let cur = StrCursor::new_at_start(s); 772 | let bps = test_util::finite_iterate_lead(cur, StrCursor::at_next) 773 | .map(|cur| (cur.byte_pos(), cur.after().map(Gc::as_str))) 774 | .collect::>(); 775 | assert_eq!(bps, vec![ 776 | (0, Some("n")), 777 | (1, Some("o")), 778 | (2, Some("e\u{0308}")), 779 | (5, Some("l")), 780 | (6, None), 781 | ]); 782 | } 783 | 784 | #[cfg(test)] 785 | #[test] 786 | fn test_prev() { 787 | let s = "Jäger,Jäger,大嫌い,💪❤!"; 788 | let cur = StrCursor::new_at_end(s); 789 | let r = test_util::finite_iterate_lead(cur, StrCursor::at_prev) 790 | .map(|cur| cur.prev().map(|(gr, cur)| (gr.as_str(), cur.byte_pos()))) 791 | .collect::>(); 792 | assert_eq!(r, vec![ 793 | Some(("!", 32)), 794 | Some(("❤", 29)), 795 | Some(("💪", 25)), 796 | Some((",", 24)), 797 | Some(("い", 21)), 798 | Some(("嫌", 18)), 799 | Some(("大", 15)), 800 | Some((",", 14)), 801 | Some(("r", 13)), 802 
| Some(("e", 12)), 803 | Some(("g", 11)), 804 | Some(("ä", 8)), 805 | Some(("J", 7)), 806 | Some((",", 6)), 807 | Some(("r", 5)), 808 | Some(("e", 4)), 809 | Some(("g", 3)), 810 | Some(("ä", 1)), 811 | Some(("J", 0)), 812 | None, 813 | ]); 814 | } 815 | 816 | #[cfg(test)] 817 | #[test] 818 | fn test_prev_cp() { 819 | let s = "Jäger,Jäger,大嫌い,💪❤!"; 820 | let cur = StrCursor::new_at_end(s); 821 | let r = test_util::finite_iterate_lead(cur, StrCursor::at_prev_cp) 822 | .map(|cur| cur.prev_cp().map(|(cp, cur)| (cp, cur.byte_pos()))) 823 | .collect::>(); 824 | assert_eq!(r, vec![ 825 | Some(('!', 32)), 826 | Some(('❤', 29)), 827 | Some(('💪', 25)), 828 | Some((',', 24)), 829 | Some(('い', 21)), 830 | Some(('嫌', 18)), 831 | Some(('大', 15)), 832 | Some((',', 14)), 833 | Some(('r', 13)), 834 | Some(('e', 12)), 835 | Some(('g', 11)), 836 | Some(('̈', 9)), 837 | Some(('a', 8)), 838 | Some(('J', 7)), 839 | Some((',', 6)), 840 | Some(('r', 5)), 841 | Some(('e', 4)), 842 | Some(('g', 3)), 843 | Some(('ä', 1)), 844 | Some(('J', 0)), 845 | None, 846 | ]); 847 | } 848 | 849 | #[cfg(test)] 850 | #[test] 851 | fn test_next() { 852 | let s = "Jäger,Jäger,大嫌い,💪❤!"; 853 | let cur = StrCursor::new_at_start(s); 854 | let r = test_util::finite_iterate_lead(cur, StrCursor::at_next) 855 | .map(|cur| cur.next().map(|(gr, cur)| (gr.as_str(), cur.byte_pos()))) 856 | .collect::>(); 857 | assert_eq!(r, vec![ 858 | Some(("J", 1)), 859 | Some(("ä", 3)), 860 | Some(("g", 4)), 861 | Some(("e", 5)), 862 | Some(("r", 6)), 863 | Some((",", 7)), 864 | Some(("J", 8)), 865 | Some(("ä", 11)), 866 | Some(("g", 12)), 867 | Some(("e", 13)), 868 | Some(("r", 14)), 869 | Some((",", 15)), 870 | Some(("大", 18)), 871 | Some(("嫌", 21)), 872 | Some(("い", 24)), 873 | Some((",", 25)), 874 | Some(("💪", 29)), 875 | Some(("❤", 32)), 876 | Some(("!", 33)), 877 | None, 878 | ]); 879 | } 880 | 881 | #[cfg(test)] 882 | #[test] 883 | fn test_next_cp() { 884 | let s = "Jäger,Jäger,大嫌い,💪❤!"; 885 | let cur = 
StrCursor::new_at_start(s); 886 | let r = test_util::finite_iterate_lead(cur, StrCursor::at_next_cp) 887 | .map(|cur| cur.next_cp().map(|(cp, cur)| (cp, cur.byte_pos()))) 888 | .collect::>(); 889 | assert_eq!(r, vec![ 890 | Some(('J', 1)), 891 | Some(('ä', 3)), 892 | Some(('g', 4)), 893 | Some(('e', 5)), 894 | Some(('r', 6)), 895 | Some((',', 7)), 896 | Some(('J', 8)), 897 | Some(('a', 9)), 898 | Some(('̈', 11)), 899 | Some(('g', 12)), 900 | Some(('e', 13)), 901 | Some(('r', 14)), 902 | Some((',', 15)), 903 | Some(('大', 18)), 904 | Some(('嫌', 21)), 905 | Some(('い', 24)), 906 | Some((',', 25)), 907 | Some(('💪', 29)), 908 | Some(('❤', 32)), 909 | Some(('!', 33)), 910 | None, 911 | ]); 912 | } 913 | 914 | #[cfg(test)] 915 | #[test] 916 | fn test_char_before_and_after() { 917 | let s = "大嫌い,💪❤"; 918 | let cur = StrCursor::new_at_start(s); 919 | let r = test_util::finite_iterate_lead(cur, StrCursor::at_next_cp) 920 | .map(|cur| (cur.byte_pos(), cur.cp_before(), cur.cp_after())) 921 | .collect::>(); 922 | assert_eq!(r, vec![ 923 | (0, None, Some('大')), 924 | (3, Some('大'), Some('嫌')), 925 | (6, Some('嫌'), Some('い')), 926 | (9, Some('い'), Some(',')), 927 | (10, Some(','), Some('💪')), 928 | (14, Some('💪'), Some('❤')), 929 | (17, Some('❤'), None) 930 | ]); 931 | } 932 | 933 | #[cfg(test)] 934 | #[test] 935 | fn test_slice_between() { 936 | let s = "they hit, fight, kick, wreak havoc, and rejoice"; 937 | let cur0 = StrCursor::new_at_start(s); 938 | let cur1 = StrCursor::new_at_end(s); 939 | let cur2 = StrCursor::new_at_end("nobody knows what they're lookin' for"); 940 | let cur3 = StrCursor::new_at_end(&s[1..]); 941 | assert_eq!(cur0.slice_between(cur1), Some(s)); 942 | assert_eq!(cur1.slice_between(cur0), Some(s)); 943 | assert_eq!(cur0.slice_between(cur2), None); 944 | assert_eq!(cur0.slice_between(cur3), None); 945 | } 946 | 947 | #[inline] 948 | fn byte_pos_to_ptr(s: &str, byte_pos: usize) -> *const u8 { 949 | if s.len() < byte_pos { 950 | panic!("byte position out of 
bounds: the len is {} but the position is {}", 951 | s.len(), byte_pos); 952 | } 953 | unsafe { s.as_ptr().offset(byte_pos as isize) } 954 | } 955 | 956 | #[inline] 957 | unsafe fn seek_utf8_cp_start_left(s: &str, mut from: *const u8) -> *const u8 { 958 | let beg = s.as_ptr(); 959 | while from > beg && (*from & 0b11_00_0000 == 0b10_00_0000) { 960 | from = from.offset(-1); 961 | } 962 | from 963 | } 964 | 965 | #[cfg(test)] 966 | #[test] 967 | fn test_seek_utf8_cp_start_left() { 968 | let s = "カブム!"; 969 | let b = s.as_bytes(); 970 | assert_eq!(unsafe { seek_utf8_cp_start_left(s, &b[0]) }, &b[0]); 971 | assert_eq!(unsafe { seek_utf8_cp_start_left(s, &b[1]) }, &b[0]); 972 | assert_eq!(unsafe { seek_utf8_cp_start_left(s, &b[2]) }, &b[0]); 973 | assert_eq!(unsafe { seek_utf8_cp_start_left(s, &b[3]) }, &b[3]); 974 | assert_eq!(unsafe { seek_utf8_cp_start_left(s, &b[4]) }, &b[3]); 975 | assert_eq!(unsafe { seek_utf8_cp_start_left(s, &b[5]) }, &b[3]); 976 | } 977 | 978 | #[inline] 979 | unsafe fn seek_utf8_cp_start_right(s: &str, mut from: *const u8) -> *const u8 { 980 | let end = s.as_ptr().offset(s.len() as isize); 981 | while from < end && (*from & 0b11_00_0000 == 0b10_00_0000) { 982 | from = from.offset(1); 983 | } 984 | from 985 | } 986 | 987 | #[cfg(test)] 988 | #[test] 989 | fn test_seek_utf8_cp_start_right() { 990 | let s = "カブム!"; 991 | let b = s.as_bytes(); 992 | assert_eq!(unsafe { seek_utf8_cp_start_right(s, &b[0]) }, &b[0]); 993 | assert_eq!(unsafe { seek_utf8_cp_start_right(s, &b[1]) }, &b[3]); 994 | assert_eq!(unsafe { seek_utf8_cp_start_right(s, &b[2]) }, &b[3]); 995 | assert_eq!(unsafe { seek_utf8_cp_start_right(s, &b[3]) }, &b[3]); 996 | assert_eq!(unsafe { seek_utf8_cp_start_right(s, &b[4]) }, &b[6]); 997 | assert_eq!(unsafe { seek_utf8_cp_start_right(s, &b[5]) }, &b[6]); 998 | } 999 | 1000 | #[inline] 1001 | fn str_eq_literal(a: &str, b: &str) -> bool { 1002 | a.as_bytes().as_ptr() == b.as_bytes().as_ptr() 1003 | && a.len() == b.len() 1004 | } 1005 | 
1006 | #[cfg(test)] 1007 | #[test] 1008 | fn test_str_eq_literal() { 1009 | let s = "hare hare yukai"; 1010 | assert!(str_eq_literal(s, s)); 1011 | assert!(str_eq_literal(&s[0..4], &s[0..4])); 1012 | assert!(!str_eq_literal(&s[0..4], &s[5..9])); 1013 | assert!(!str_eq_literal(&s[0..4], &s[0..3])); 1014 | } 1015 | 1016 | #[cfg(test)] 1017 | mod test_util { 1018 | pub struct FiniteIter(Option, F); 1019 | 1020 | impl Iterator for FiniteIter 1021 | where 1022 | F: FnMut(T) -> Option, 1023 | T: Clone, 1024 | { 1025 | type Item = T; 1026 | 1027 | fn next(&mut self) -> Option { 1028 | self.0.take().and_then(|last| { 1029 | match (self.1)(last) { 1030 | Some(e) => { 1031 | self.0 = Some(e); 1032 | self.0.clone() 1033 | }, 1034 | None => None 1035 | } 1036 | }) 1037 | } 1038 | } 1039 | 1040 | pub fn finite_iterate(seed: T, f: F) -> FiniteIter 1041 | where 1042 | F: FnMut(T) -> Option, 1043 | T: Clone, 1044 | { 1045 | FiniteIter(Some(seed), f) 1046 | } 1047 | pub struct FiniteIterLead(Option, F, bool); 1048 | 1049 | impl Iterator for FiniteIterLead 1050 | where 1051 | F: FnMut(T) -> Option, 1052 | T: Clone, 1053 | { 1054 | type Item = T; 1055 | 1056 | fn next(&mut self) -> Option { 1057 | if !self.2 { 1058 | self.2 = true; 1059 | return self.0.clone(); 1060 | } 1061 | 1062 | self.0.take().and_then(|last| { 1063 | match (self.1)(last) { 1064 | Some(e) => { 1065 | self.0 = Some(e); 1066 | self.0.clone() 1067 | }, 1068 | None => None 1069 | } 1070 | }) 1071 | } 1072 | } 1073 | 1074 | pub fn finite_iterate_lead(seed: T, f: F) -> FiniteIterLead 1075 | where 1076 | F: FnMut(T) -> Option, 1077 | T: Clone, 1078 | { 1079 | FiniteIterLead(Some(seed), f, false) 1080 | } 1081 | } 1082 | -------------------------------------------------------------------------------- /src/util.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Miscellaneous stuff. 3 | */ 4 | 5 | #[inline(always)] 6 | pub unsafe fn unreachable() -> ! 
{ 7 | enum Knowledge {} 8 | #[inline(always)] 9 | fn nirvana(knowledge: Knowledge) -> ! { 10 | match knowledge {} 11 | } 12 | nirvana(::std::mem::transmute(())) 13 | } 14 | 15 | /* 16 | 17 | TODO: The following code is nicked from libcore, owing to `encode_utf8` not being stable yet. Specifically, . 18 | 19 | This should all be removed as soon as `encode_utf8` *is* stable. 20 | 21 | */ 22 | 23 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT 24 | // file at the top-level directory of this distribution and at 25 | // http://rust-lang.org/COPYRIGHT. 26 | // 27 | // Licensed under the Apache License, Version 2.0 or the MIT license 29 | // , at your 30 | // option. This file may not be copied, modified, or distributed 31 | // except according to those terms. 32 | 33 | // UTF-8 ranges and tags for encoding characters 34 | const TAG_CONT: u8 = 0b1000_0000; 35 | const TAG_TWO_B: u8 = 0b1100_0000; 36 | const TAG_THREE_B: u8 = 0b1110_0000; 37 | const TAG_FOUR_B: u8 = 0b1111_0000; 38 | const MAX_ONE_B: u32 = 0x80; 39 | const MAX_TWO_B: u32 = 0x800; 40 | const MAX_THREE_B: u32 = 0x10000; 41 | 42 | pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option { 43 | // Marked #[inline] to allow llvm optimizing it away 44 | if code < MAX_ONE_B && !dst.is_empty() { 45 | dst[0] = code as u8; 46 | Some(1) 47 | } else if code < MAX_TWO_B && dst.len() >= 2 { 48 | dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B; 49 | dst[1] = (code & 0x3F) as u8 | TAG_CONT; 50 | Some(2) 51 | } else if code < MAX_THREE_B && dst.len() >= 3 { 52 | dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B; 53 | dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT; 54 | dst[2] = (code & 0x3F) as u8 | TAG_CONT; 55 | Some(3) 56 | } else if dst.len() >= 4 { 57 | dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B; 58 | dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT; 59 | dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT; 60 | dst[3] = (code & 0x3F) as u8 | TAG_CONT; 61 | Some(4) 62 | } else { 63 | None 64 | } 65 | } 

# --------------------------------------------------------------------------------
# /update-docs.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3
# coding: utf-8

# Copyright ⓒ 2016 Daniel Keep.
#
# Licensed under the MIT license (see LICENSE or <http://opensource.org
# /licenses/MIT>) or the Apache License, Version 2.0 (see LICENSE of
# <http://www.apache.org/licenses/LICENSE-2.0>), at your option. All
# files in the project carrying such notice may not be copied, modified,
# or distributed except according to those terms.

"""Regenerate the crate documentation and publish it to the docs branch.

Run from the repository root while on ``master``.  The script clones the
repository into a temporary directory, runs ``cargo doc`` there, copies the
output onto ``DOC_TARGET_BRANCH`` and force-pushes the result.

Environment variables:

* ``FORCE_ANSI``: non-empty enables coloured output on Windows.
* ``TRACE_UPDATE_DOCS``: non-empty echoes every command to stderr.
"""

import os
import shutil
import stat
import subprocess
import sys
import tempfile
import time

DOC_ARGS = '--no-deps'
DOC_FEATURES = ""
DOC_TARGET_BRANCH = 'gh-pages'
TEMP_CHECKOUT_PREFIX = 'gh-pages-checkout-'
TEMP_OUTPUT_PREFIX = 'gh-pages-generated-'

# Colour is on everywhere except Windows, where it must be requested.
USE_ANSI = sys.platform != 'win32' or os.environ.get('FORCE_ANSI', '') != ''
TRACE_UPDATE_DOCS = os.environ.get('TRACE_UPDATE_DOCS', '') != ''


def sh(cmd):
    """Run *cmd* and raise ``CalledProcessError`` if it exits non-zero.

    *cmd* is either a shell command line (``str``, run through the shell) or
    a sequence of arguments (run directly, with no shell parsing).  Use the
    sequence form whenever an argument comes from outside this script, so
    that quotes or ``$(...)`` in it cannot alter the command.
    """
    msg_trace('sh(%r)' % (cmd,))
    try:
        subprocess.check_call(cmd, shell=isinstance(cmd, str))
    except Exception:
        msg_trace('FAILED!')
        raise


def sh_eval(cmd, codec='utf-8', dont_strip=False):
    """Run *cmd* and return its standard output decoded with *codec*.

    *cmd* follows the same str-or-sequence convention as ``sh``.  Surrounding
    whitespace is stripped from the result unless *dont_strip* is true.
    Raises ``CalledProcessError`` if the command exits non-zero.
    """
    msg_trace('sh_eval(%r)' % (cmd,))
    try:
        output = subprocess.check_output(cmd, shell=isinstance(cmd, str))
        result = output.decode(codec)
    except Exception:
        msg_trace('FAILED!')
        raise
    return result if dont_strip else result.strip()


def msg(*args):
    """Write a progress message, built from *args*, to stdout."""
    out = sys.stdout
    if USE_ANSI:
        out.write('\x1b[1;34m')
    out.write('> ')
    if USE_ANSI:
        out.write('\x1b[1;32m')
    for arg in args:
        out.write(str(arg))
    if USE_ANSI:
        out.write('\x1b[0m')
    out.write('\n')
    out.flush()


def msg_trace(*args):
    """Write a trace message to stderr if ``TRACE_UPDATE_DOCS`` is set."""
    if not TRACE_UPDATE_DOCS:
        return
    err = sys.stderr
    if USE_ANSI:
        err.write('\x1b[1;31m')
    err.write('$ ')
    if USE_ANSI:
        err.write('\x1b[0m')
    for arg in args:
        err.write(str(arg))
    err.write('\n')
    err.flush()


def copytree(src, dst):
    """Recursively copy directory *src* into *dst*, merging if *dst* exists.

    Uses ``shutil.copytree`` (Python 3.8+ for ``dirs_exist_ok``) rather than
    ``distutils.dir_util.copy_tree``: ``distutils`` was removed from the
    standard library in Python 3.12.
    """
    msg_trace('copytree(%r, %r)' % (src, dst))
    shutil.copytree(src, dst, dirs_exist_ok=True)


def really_rmtree(path):
    """Remove the directory tree at *path*, retrying on transient failures.

    Read-only entries are made writable and retried.  Any other ``OSError``
    (for example a file still held open on Windows) causes a short sleep and
    another attempt, up to ``MAX_TRIES`` times.  If the tree still cannot be
    removed a warning is printed; no exception is raised.
    """
    msg_trace('really_rmtree(%r)' % path)

    WAIT_TIME_SECS = 1.0
    MAX_TRIES = 10

    def on_exc(func, failed_path, exc):
        """Make a read-only entry writable and retry; otherwise re-raise."""
        if os.access(failed_path, os.W_OK):
            # Not a permissions problem: let the retry loop deal with it.
            raise exc
        os.chmod(failed_path, stat.S_IWUSR)
        func(failed_path)

    # ``onerror`` is deprecated from Python 3.12 in favour of ``onexc``.
    if sys.version_info >= (3, 12):
        handler = {'onexc': on_exc}
    else:
        handler = {
            'onerror': lambda func, p, exc_info: on_exc(func, p, exc_info[1]),
        }

    for _ in range(MAX_TRIES):
        try:
            msg_trace('shutil.rmtree(%r)' % path)
            shutil.rmtree(path, **handler)
            return
        except OSError:
            # Previously ``except WindowsError``, which is a NameError on
            # every other platform.  WindowsError is an alias of OSError.
            if not os.path.lexists(path):
                # Nothing left to remove; retrying cannot help.
                return
            time.sleep(WAIT_TIME_SECS)

    msg('Warning: failed to remove directory %r' % path)


def init_doc_branch():
    """Create an empty ``DOC_TARGET_BRANCH`` in the current repository.

    The branch is built as an orphan with a single empty commit inside a
    temporary repository and then pushed back into the working repository.
    """
    msg("Initialising %s branch" % DOC_TARGET_BRANCH)

    orig_dir = os.getcwd()
    msg_trace('dir = %r' % orig_dir)

    tmp = tempfile.mkdtemp(prefix=TEMP_CHECKOUT_PREFIX)
    msg_trace('tmp = %r' % tmp)

    try:
        msg("Cloning into a temporary directory...")
        sh(['git', 'init', '-q', tmp])
        msg_trace('os.chdir(%r)' % tmp)
        os.chdir(tmp)
        sh(['git', 'checkout', '-q', '--orphan', DOC_TARGET_BRANCH])
        sh('git commit -qm "Initial commit." --allow-empty')
        sh(['git', 'remote', 'add', 'origin', orig_dir])
        # Push the branch that was just created, not a hard-coded name.
        sh(['git', 'push', '-q', 'origin', DOC_TARGET_BRANCH])

    finally:
        msg('Cleaning up...')
        msg_trace('os.chdir(%r)' % orig_dir)
        os.chdir(orig_dir)
        msg_trace('shutil.rmtree(%r)' % tmp)
        really_rmtree(tmp)

    msg('%s is ready.  Continuing.' % DOC_TARGET_BRANCH)


def main():
    """Build the docs for ``master`` and publish them.

    Returns the process exit status (always 0; failures raise).
    """
    if sh_eval('git symbolic-ref --short HEAD') != u'master':
        msg('Not on master; doing nothing.')
        return 0

    # Sanity check: does the doc branch exist at all?
    # Each line of `git branch` starts with a two-character marker column.
    branches = {b[2:].strip() for b in sh_eval('git branch', dont_strip=True).splitlines()}
    msg_trace('branches = %r' % branches)
    if DOC_TARGET_BRANCH not in branches:
        init_doc_branch()

    last_rev = sh_eval('git rev-parse HEAD')
    last_msg = sh_eval('git log -1 --pretty=%B')
    msg_trace('last_rev = %r' % last_rev)
    msg_trace('last_msg = %r' % last_msg)

    orig_dir = os.getcwd()
    msg_trace('dir = %r' % orig_dir)

    tmp1 = tempfile.mkdtemp(prefix=TEMP_CHECKOUT_PREFIX)
    tmp2 = tempfile.mkdtemp(prefix=TEMP_OUTPUT_PREFIX)
    msg_trace('tmp1 = %r' % tmp1)
    msg_trace('tmp2 = %r' % tmp2)

    try:
        msg("Cloning into a temporary directory...")
        sh(['git', 'clone', '-qb', DOC_TARGET_BRANCH, orig_dir, tmp1])
        msg_trace('os.chdir(%r)' % tmp1)
        os.chdir(tmp1)
        sh('git checkout -q master')

        msg("Generating documentation...")
        args = '%s --features="%s"' % (DOC_ARGS, DOC_FEATURES)
        sh('cargo doc %s' % args)
        tmp1_target_doc = '%s/target/doc' % tmp1
        msg_trace('shutil.move(%r, %r)' % (tmp1_target_doc, tmp2))
        # tmp2 already exists, so the output lands in tmp2/doc.
        shutil.move(tmp1_target_doc, tmp2)

        msg('Updating %s...' % DOC_TARGET_BRANCH)
        sh(['git', 'checkout', '-q', DOC_TARGET_BRANCH])
        sh('git clean -dfq')
        tmp2_doc = '%s/doc' % tmp2

        msg_trace('copytree(%r, %r)' % (tmp2_doc, './doc'))
        copytree(tmp2_doc, './doc')

        msg('Committing changes...')
        sh('git add .')
        # The commit message is arbitrary text: pass it as an argument, never
        # through the shell, so quotes or `$(...)` in it are taken literally.
        sh(['git', 'commit', '--amend',
            '-m', 'Update docs for %s' % last_rev[:7],
            '-m', last_msg])

        sh(['git', 'push', '-fqu', 'origin', DOC_TARGET_BRANCH])

    finally:
        msg('Cleaning up...')
        msg_trace('os.chdir(%r)' % orig_dir)
        os.chdir(orig_dir)
        msg_trace('shutil.rmtree(%r)' % tmp2)
        really_rmtree(tmp2)
        msg_trace('shutil.rmtree(%r)' % tmp1)
        really_rmtree(tmp1)

    msg('Publishing...')
    sh(['git', 'push', '-f', 'origin', DOC_TARGET_BRANCH])

    msg('Done.')
    return 0


if __name__ == '__main__':
    sys.exit(main())

# --------------------------------------------------------------------------------