├── .github └── workflows │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE.txt ├── README.md ├── benches └── benches.rs ├── benchmark_data ├── automerge-paper.json.gz ├── rustcode.json.gz ├── seph-blog1.json.gz └── sveltecomponent.json.gz ├── crdt-testdata ├── Cargo.toml └── src │ └── lib.rs ├── jumprope-wasm ├── Cargo.toml ├── build_wasm.sh └── src │ └── lib.rs ├── rope_benches ├── Cargo.toml ├── README.md ├── build.rs ├── explore_parameters.js ├── rope.c ├── rope.h ├── src │ ├── edittablestr.rs │ ├── main.rs │ └── rope.rs └── table.js ├── src ├── buffered.rs ├── fast_str_tools.rs ├── gapbuffer.rs ├── iter.rs ├── jumprope.rs ├── lib.rs └── utils.rs └── tests └── test.rs /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Build 20 | run: cargo build 21 | - name: Run tests (base) 22 | run: cargo test 23 | - name: Run tests (wchar) 24 | run: cargo test --features "wchar_conversion" 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | .idea 4 | yarn.lock 5 | node_modules 6 | src/params.rs 7 | stats.md 8 | bench/*.json 9 | .*.swp 10 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CURRENT 2 | 3 | - Added `is_ascii_only` - mainly only useful for optimizing lookups for some strings. 
4 | - Microoptimizations of some wchar functions when the rope is ascii only 5 | 6 | # 1.1.2 7 | 8 | - Added explicit `rope.to_string()` method, because doing so is smaller and faster than going through the `Display` trait. Wasm bundle -1.5kb 9 | - Removed `wee_alloc` from wasm bundle. This makes the wasm bundle +6kb in size, but apparently wee_alloc [has memory leaks and is unmaintained](https://github.com/josephg/jumprope-rs/security/dependabot/1). 10 | - Fixed a terrible bug where `chars_to_wchars` returned the number of surrogate pairs rather than the wchar position. The return value would have been wrong in all cases. Testing fail - ouch! 11 | 12 | # 1.1.1 13 | 14 | - Fixed bug where reflexive eq (a == a) would fail for `&JumpRopeBuf`. 15 | 16 | # 1.1.0 17 | 18 | - The JumpRopeBuf feature has a lot more methods and is now stable, and included by default. The `buffered` feature flag is no longer needed. It now has no effect, and it will be removed in JumpRope 2.0 (whenever that happens). Please file issues if other useful methods are missing. 19 | - Added Send and Sync markers to `JumpRope`. Thanks to P. Vijay for the suggestion! 20 | 21 | # 1.0.0 22 | 23 | - Woohoo! 24 | - **Breaking API change**: Renamed the iterator methods. `rope.chunks()` -> `rope.substrings_with_len()`. Added `rope.substrings()` and `rope.slice_substrings()`. 25 | - Added buffered API, though for now its experimental and behind a feature flag. 26 | - Made miri pass against jumprope. This involved some changes: 27 | - The dynamically allocated heights in node.nexts lists have been removed. This results in less unsafe code, but increases the memory overhead of the library. 28 | - Wasm bundle size has grown 29 | - Performance is mostly unaffected. 30 | - Bumped to str_indices 0.3.2 31 | - Added Eq trait support to all the combinations of `rope` / `&rope` vs `&str` / `String` / `&String`. 
32 | 33 | 34 | # 0.5.3 35 | 36 | - Made Jumprope::new() use a hardcoded seed when ddos_protection is disabled. This makes the module 5kb smaller in wasm and avoids getrandom. 37 | 38 | # 0.5.2 39 | 40 | - Swapped from inlined string methods to [`str_indices`](https://crates.io/crates/str_indices). Thanks @cessen! 41 | 42 | # 0.5.1 43 | 44 | - Only cosmetic (documentation) changes. 45 | 46 | # 0.5.0 47 | 48 | - Added support for wchar based indexing, behind a feature flag. (See documentation for details) 49 | - General performance improvements 50 | - Removed ropey as an explicit dependency, inlining the borrowed methods (for now). 51 | 52 | # 0.4.0 53 | 54 | - Breaking API change: Renamed `rope.len()` to `rope.len_bytes()` 55 | - Added `rope.mem_size() -> usize` method for debugging 56 | 57 | # 0.3.1 58 | 59 | - Fixed a few critical bugs in iterator code which caused slice_chars() to return incorrect results or crash 60 | 61 | # 0.3.0 62 | 63 | - Added iterator support (to iterate by character range) 64 | - Added proper rustdocs for core methods 65 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jumprope" 3 | version = "1.1.2" 4 | authors = ["Joseph Gentle "] 5 | edition = "2021" 6 | description = "Simple, fast rope (fancy string) library built on top of Skiplists" 7 | repository = "https://github.com/josephg/jumprope-rs" 8 | license = "ISC OR Apache-2.0" 9 | exclude = [ 10 | "benchmark_data/*", 11 | ".github/**", 12 | ] 13 | 14 | [workspace] 15 | members = ["jumprope-wasm", "rope_benches", "crdt-testdata"] 16 | 17 | [features] 18 | default = ["ddos_protection"] 19 | # ddos_protection makes jumprope use a better RNG algorithm to avoid DOS 20 | # attacks 21 | ddos_protection = [] 22 | # wchar_conversion adds support for converting wchar positions (eg from JS or 23 | # ObjC) into local unicode offsets. 
24 | wchar_conversion = [] 25 | 26 | # Line conversion adds support for editing using line/column offsets instead of character offsets. 27 | line_conversion = [] 28 | 29 | # TODO: Remove me for 2.0 - the buffered feature is no longer needed. 30 | buffered = [] 31 | 32 | [dependencies] 33 | rand = { version = "0.8", features = ["small_rng"] } 34 | str_indices = "0.4.0" 35 | 36 | [dev-dependencies] 37 | criterion = "0.4.0" 38 | crdt-testdata = { path = "./crdt-testdata" } 39 | 40 | [[bench]] 41 | name = "benches" 42 | harness = false 43 | 44 | [profile.release] 45 | lto = true 46 | codegen-units = 1 47 | 48 | [profile.release.package.jumprope-wasm] 49 | #opt-level = "s" 50 | opt-level = 2 51 | 52 | 53 | [package.metadata.docs.rs] 54 | features = ["wchar_conversion"] -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | ISC License 2 | 3 | Copyright 2022 Joseph Gentle 4 | 5 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, 6 | provided that the above copyright notice and this permission notice appear in all copies. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL 9 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 11 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 12 | THIS SOFTWARE. 
13 | 14 | --- 15 | 16 | Some code in fast_str_tools licensed separately (as follows): 17 | 18 | Copyright (c) 2017 Nathan Vegdahl 19 | 20 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 21 | associated documentation files (the "Software"), to deal in the Software without restriction, 22 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 23 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 24 | furnished to do so, subject to the following conditions: 25 | 26 | The above copyright notice and this permission notice shall be included in all copies or substantial 27 | portions of the Software. 28 | 29 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 30 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 31 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 32 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 33 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # JumpRope 2 | 3 | Because inserting into a string should be fast. 4 | 5 | A [rope](https://en.wikipedia.org/wiki/Rope_(data_structure)) is a data structure for efficiently editing large strings, or for processing editing traces. 6 | 7 | As far as I know, JumpRope is the world's fastest rope implementation. 8 | 9 | Unlike traditional strings, JumpRope allows you to: 10 | 11 | - Efficiently insert or delete arbitrary keystrokes from anywhere in the document. Using real world editing traces, jumprope can process about 35-40 million edits per second. 
12 | - Index using unicode character offsets or wchar offsets (like you find in JS and other languages). Jumprope can efficiently convert between these formats. 13 | 14 | JumpRope is optimized for large strings like source code files and text documents. If your strings are very small (less than 100 bytes), you should probably just use Rust's built in [std String](https://doc.rust-lang.org/std/string/struct.String.html) or a small-string-optimized string library like [SmartString](https://crates.io/crates/smartstring). 15 | 16 | JumpRope is similar to [ropey](https://crates.io/crates/ropey). Ropey supports a few more features (like converting line/column positions). However, jumprope is about 3x faster than ropey when processing real editing operations (see below) and jumprope compiles to a smaller wasm bundle. (Ropey is 30kb brotli compressed, vs 18kb for jumprope). 17 | 18 | [API documentation](https://docs.rs/jumprope/) 19 | 20 | [Jumprope on crates.io](https://crates.io/crates/jumprope) 21 | 22 | Add this to Cargo.toml to use: 23 | 24 | ```toml 25 | jumprope = "1.0.0" 26 | ``` 27 | 28 | 29 | # Usage 30 | 31 | JumpRope isn't a drop-in replacement for string, but it supports many similar methods. The most important additions are the [`insert`](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html#method.insert), [`remove`](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html#method.remove) and [`replace`](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html#method.replace) methods - which let you edit strings in-place in (typically) `log(n)` time relative to the size of the existing document. 
32 | 33 | ```rust 34 | use jumprope::JumpRope; 35 | 36 | fn main() { 37 | let mut rope = JumpRope::from("Some large text document"); 38 | rope.insert(5, "really "); // "Some really large text document" 39 | rope.replace(0..4, "My rad"); // "My rad really large text document" 40 | assert_eq!(rope, "My rad really large text document"); 41 | 42 | // Extract to a string 43 | let s: String = rope.to_string(); 44 | assert_eq!(s, "My rad really large text document"); 45 | } 46 | ``` 47 | 48 | You can read content back out of a rope by: 49 | 50 | - Converting the rope to a string using `rope.to_string()` (requires allocations) 51 | - Iterating over characters using [`rope.chars()`](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html#method.chars) 52 | - (Fastest) iterating over &str chunks with [`rope.substrings()`](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html#method.substrings). This returns an iterator over contained `&str` items in the document. 53 | 54 | If you want to read a subsection of the rope, you can use [`rope.slice_substrings(10..20)`](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html#method.slice_substrings) to read all the content within a given range in the rope. Eg: 55 | 56 | ```rust 57 | fn main() { 58 | let rope = JumpRope::from("xxxGreetings!xxx"); 59 | 60 | let string = rope.slice_substrings(3..13).collect::<String>(); 61 | assert_eq!(string, "Greetings!"); 62 | } 63 | ``` 64 | 65 | For more details, see [JumpRope API documentation](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html) 66 | 67 | 68 | ## Wchar conversion 69 | 70 | In some languages (notably Javascript, Java and C#) strings are measured by the number of 2-byte "characters" needed when encoding the string using UTF16. 71 | 72 | This is awkward because it's difficult to efficiently convert between unicode character offsets (used by jumprope, diamond types and other editors) and these editing locations. The naive approach is an O(n) operation. 
73 | 74 | Jumprope supports doing this conversion in `O(log n)` time, by adding extra indexing information to the skip list. This feature is disabled by default, because the extra bookkeeping slows down jumprope by about 15%. 75 | 76 | To use this feature, enable the `wchar_conversion` feature flag: 77 | 78 | ```toml 79 | jumprope = { version = "1.0.0", features = ["wchar_conversion"] } 80 | ``` 81 | 82 | This feature flag enables a bunch of extra wchar-related methods for interacting with a document: 83 | 84 | - `rope.len_wchars() -> usize`: Return the length of the string in wchars. 85 | - `rope.chars_to_wchars(chars: usize) -> usize`: Convert a char offset to a wchar offset 86 | - `rope.wchars_to_chars(wchars: usize) -> usize`: Convert a wchar index back to a unicode character count 87 | - `rope.insert_at_wchar(pos_wchar: usize, content: &str)`: Insert `content` at the specified wchar offset 88 | - `rope.remove_at_wchar(range: Range<usize>)`: Remove the specified range, specified using wchar offsets 89 | - `rope.replace_at_wchar(range: Range<usize>, content: &str)`: Replace the specified range with `content` 90 | 91 | See [documentation on docs.rs](https://docs.rs/jumprope/latest/jumprope/struct.JumpRope.html) for more information about these methods. 92 | 93 | 94 | ## Buffered strings 95 | 96 | JumpRope also has an API for buffered edits. Usually when humans edit a string, they insert or delete runs of characters. If you merge these editing runs together before applying them, jumprope is about 10x faster again. 97 | 98 | Jumprope provides a wrapper API to do this transparently in the form of [JumpRopeBuf](https://docs.rs/jumprope/latest/jumprope/struct.JumpRopeBuf.html). JumpRopeBuf makes a best-effort attempt to merge incoming writes together before flushing (writing) them to the contained jumprope object. 99 | 100 | This API may be missing some methods found on `JumpRope`. 
You can usually work around any missing methods by calling `rope.borrow()` or `rope.as_mut()` to flush pending changes and access a pointer to the underlying rope. But please file issues if you find any missing functions, because adding direct implementations will usually result in better performance. 101 | 102 | See [JumpRopeBuf module documentation](https://docs.rs/jumprope/latest/jumprope/struct.JumpRopeBuf.html) for usage. 103 | 104 | 105 | ## History / motivation 106 | 107 | This code is based on an older [skiplist based C rope library](https://github.com/josephg/librope) I wrote several years ago as an excuse to play with skip lists. It has a few notable differences: 108 | 109 | - Instead of simply being implemented as a skiplist, jumprope is a skiplist where each leaf node contains a [Gap Buffer](https://en.wikipedia.org/wiki/Gap_buffer). 110 | - Jumprope is faster. (See table below) 111 | 112 | 113 | ## Benchmarks 114 | 115 | Running the [editing traces from crdt-benchmarks](https://github.com/josephg/crdt-benchmarks), jumprope is faster than any other library in cargo that I know of: 116 | 117 | Running on a single core of a Ryzen 5800X: 118 | 119 | | Dataset | Raw string | XiRope | Ropey | librope (C) | Jumprope | 120 | |-----------------|------------|-----------|----------|-------------|----------| 121 | | automerge-paper | 3908.13 ms | 518.75 ms | 25.16 ms | 16.28 ms | 6.66 ms | 122 | | rustcode | 569.44 ms | DNF | 4.71 ms | 3.93 ms | 1.66 ms | 123 | | sveltecomponent | 41.05 ms | 24.83 ms | 2.31 ms | 1.59 ms | 0.59 ms | 124 | | seph-blog1 | 1238.44 ms | DNF | 13.04 ms | 10.01 ms | 3.81 ms | 125 | 126 | Full criterion report is [here](https://home.seph.codes/public/rope_bench/report/). 127 | 128 | I tried AnRope as well, but it crashed while processing these datasets. 
129 | 130 | 131 | # LICENSE 132 | 133 | Licensed under the ISC license: 134 | 135 | Copyright 2018 Joseph Gentle 136 | 137 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 138 | 139 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -------------------------------------------------------------------------------- /benches/benches.rs: -------------------------------------------------------------------------------- 1 | use criterion::*; 2 | use crdt_testdata::*; 3 | 4 | use jumprope::{JumpRope, JumpRopeBuf}; 5 | 6 | fn count_chars(s: &String) -> usize { 7 | s.chars().count() 8 | } 9 | 10 | #[derive(Debug, Clone)] 11 | enum Op { 12 | Ins(usize, String), 13 | Del(usize, usize), 14 | } 15 | use Op::*; 16 | 17 | fn collapse(test_data: &TestData) -> Vec { 18 | let mut result = Vec::new(); 19 | 20 | let mut merge = |op: Op| { 21 | let append = match (&op, result.last_mut()) { 22 | (Ins(pos, new_content), Some(Ins(cur_pos, cur_content))) => { 23 | if *pos == *cur_pos + count_chars(&cur_content) { 24 | cur_content.push_str(new_content.as_str()); 25 | false 26 | } else { true } 27 | } 28 | (Del(pos, new_del), Some(Del(cur_pos, cur_del))) => { 29 | if *pos == *cur_pos { 30 | // The new delete follows the old. 
31 | *cur_del += *new_del; 32 | false 33 | } else if *pos + *new_del == *cur_pos { 34 | // The new delete is a backspace (before the old) 35 | *cur_pos = *pos; 36 | *cur_del += *new_del; 37 | false 38 | } else { 39 | true 40 | } 41 | } 42 | _ => true, 43 | }; 44 | 45 | if append { result.push(op); } 46 | }; 47 | 48 | for txn in test_data.txns.iter() { 49 | for TestPatch(pos, del_span, ins_content) in &txn.patches { 50 | if *del_span > 0 { 51 | merge(Op::Del(*pos, *del_span)); 52 | } 53 | if !ins_content.is_empty() { 54 | merge(Op::Ins(*pos, ins_content.clone())); 55 | } 56 | } 57 | } 58 | result 59 | } 60 | /// Loads the dataset stored at `benchmark_data/{name}.json.gz`. 61 | fn testing_data(name: &str) -> TestData { 62 | let filename = format!("benchmark_data/{}.json.gz", name); 63 | load_testing_data(&filename) 64 | } 65 | 66 | const DATASETS: &[&str] = &["automerge-paper", "rustcode", "sveltecomponent", "seph-blog1"]; 67 | /// Registers the "direct" and "buffered" benchmarks for every entry in `DATASETS`. 68 | fn realworld_benchmarks(c: &mut Criterion) { 69 | for name in DATASETS { 70 | let mut group = c.benchmark_group("testdata"); 71 | // let mut group = c.benchmark_group("local"); 72 | let test_data = testing_data(name); 73 | let merged = collapse(&test_data); 74 | assert!(test_data.start_content.is_empty()); 75 | 76 | let len = test_data.txns.iter() 77 | .flat_map(|txn| txn.patches.iter() ) 78 | .map(|patch| patch.1 + patch.2.len()) 79 | .sum::<usize>(); 80 | group.throughput(Throughput::Elements(len as u64)); 81 | 82 | group.bench_function(BenchmarkId::new("direct", name), |b| { 83 | b.iter(|| { 84 | let mut rope = JumpRope::new(); 85 | for txn in test_data.txns.iter() { 86 | for TestPatch(pos, del_span, ins_content) in &txn.patches { 87 | rope.replace(*pos .. *pos + *del_span, ins_content); 88 | // if *del_span > 0 { 89 | // rope.remove(*pos .. 
*pos + *del_span); 90 | // } 91 | // if !ins_content.is_empty() { 92 | // rope.insert(*pos, ins_content); 93 | // } 94 | } 95 | } 96 | 97 | assert_eq!(rope.len_bytes(), test_data.end_content.len()); 98 | black_box(rope.len_chars()); 99 | }) 100 | }); 101 | 102 | // group.bench_function(BenchmarkId::new("merged", name), |b| { 103 | // b.iter(|| { 104 | // let mut rope = JumpRope::new(); 105 | // for op in merged.iter() { 106 | // match op { 107 | // Ins(pos, content) => { 108 | // rope.insert(*pos, content); 109 | // } 110 | // Del(pos, del_span) => { 111 | // rope.remove(*pos..*pos + *del_span); 112 | // } 113 | // } 114 | // } 115 | // 116 | // // assert_eq!(test_data.end_content, rope.to_string()); 117 | // 118 | // assert_eq!(rope.len_bytes(), test_data.end_content.len()); 119 | // black_box(rope.len_chars()); 120 | // }) 121 | // }); 122 | 123 | group.bench_function(BenchmarkId::new("buffered", name), |b| { 124 | b.iter(|| { 125 | let mut rope = JumpRopeBuf::new(); 126 | for op in merged.iter() { 127 | match op { 128 | Ins(pos, content) => { 129 | rope.insert(*pos, content); 130 | } 131 | Del(pos, del_span) => { 132 | rope.remove(*pos..*pos + *del_span); 133 | } 134 | } 135 | } 136 | 137 | // assert_eq!(test_data.end_content, rope.to_string()); 138 | 139 | let rope = rope.into_inner(); 140 | assert_eq!(rope.len_bytes(), test_data.end_content.len()); 141 | black_box(rope.len_chars()); 142 | }) 143 | }); 144 | 145 | group.finish(); 146 | } 147 | } 148 | 149 | criterion_group!(benches, realworld_benchmarks); 150 | criterion_main!(benches); 151 | -------------------------------------------------------------------------------- /benchmark_data/automerge-paper.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephg/jumprope-rs/3981256e4e741d8b19efe26ffcfcfe178332eda0/benchmark_data/automerge-paper.json.gz -------------------------------------------------------------------------------- 
/benchmark_data/rustcode.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephg/jumprope-rs/3981256e4e741d8b19efe26ffcfcfe178332eda0/benchmark_data/rustcode.json.gz -------------------------------------------------------------------------------- /benchmark_data/seph-blog1.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephg/jumprope-rs/3981256e4e741d8b19efe26ffcfcfe178332eda0/benchmark_data/seph-blog1.json.gz -------------------------------------------------------------------------------- /benchmark_data/sveltecomponent.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephg/jumprope-rs/3981256e4e741d8b19efe26ffcfcfe178332eda0/benchmark_data/sveltecomponent.json.gz -------------------------------------------------------------------------------- /crdt-testdata/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "crdt-testdata" 3 | version = "0.0.0" 4 | authors = ["Seph Gentle "] 5 | edition = "2021" 6 | 7 | [dependencies] 8 | flate2 = { version = "1.0.22", features = ["zlib-ng-compat"], default-features = false } 9 | serde = { version = "1.0.136", features = ["derive"] } 10 | serde_json = "1.0.79" 11 | ropey = "1.6.0" -------------------------------------------------------------------------------- /crdt-testdata/src/lib.rs: -------------------------------------------------------------------------------- 1 | // use std::time::SystemTime; 2 | use std::fs::File; 3 | use std::io::{BufReader, Read}; 4 | use flate2::bufread::GzDecoder; 5 | use serde::Deserialize; 6 | 7 | /// This file contains some simple helpers for loading test data. Its used by benchmarking and 8 | /// testing code. 9 | 10 | /// (position, delete length, insert content). 
11 | #[derive(Debug, Clone, Deserialize, Eq, PartialEq)] 12 | pub struct TestPatch(pub usize, pub usize, pub String); 13 | 14 | #[derive(Debug, Clone, Deserialize, Eq, PartialEq)] 15 | pub struct TestTxn { 16 | // time: String, // ISO String. Unused. 17 | pub patches: Vec 18 | } 19 | 20 | #[derive(Debug, Clone, Deserialize, Eq, PartialEq)] 21 | pub struct TestData { 22 | #[serde(default)] 23 | pub using_byte_positions: bool, 24 | 25 | #[serde(rename = "startContent")] 26 | pub start_content: String, 27 | #[serde(rename = "endContent")] 28 | pub end_content: String, 29 | 30 | pub txns: Vec, 31 | } 32 | 33 | impl TestData { 34 | pub fn len(&self) -> usize { 35 | self.txns.iter() 36 | .map(|txn| { txn.patches.len() }) 37 | .sum::() 38 | } 39 | 40 | pub fn is_empty(&self) -> bool { 41 | !self.txns.iter().any(|txn| !txn.patches.is_empty()) 42 | } 43 | 44 | /// This method returns a clone of the testing data using byte offsets instead of codepoint 45 | /// indexes. 46 | pub fn chars_to_bytes(&self) -> Self { 47 | assert_eq!(false, self.using_byte_positions); 48 | 49 | let mut r = ropey::Rope::new(); 50 | 51 | Self { 52 | using_byte_positions: true, 53 | start_content: self.start_content.clone(), 54 | end_content: self.end_content.clone(), 55 | txns: self.txns.iter().map(|txn| { 56 | TestTxn { 57 | patches: txn.patches.iter().map(|TestPatch(pos_chars, del_chars, ins)| { 58 | let pos_bytes = r.char_to_byte(*pos_chars); 59 | // if *pos_chars != pos_bytes { 60 | // println!("Converted position {} to {}", *pos_chars, pos_bytes); 61 | // } 62 | let del_bytes = if *del_chars > 0 { 63 | let del_end_bytes = r.char_to_byte(pos_chars + *del_chars); 64 | r.remove(*pos_chars..*pos_chars + *del_chars); 65 | del_end_bytes - pos_bytes 66 | } else { 0 }; 67 | if !ins.is_empty() { r.insert(*pos_chars, ins); } 68 | 69 | TestPatch(pos_bytes, del_bytes, ins.clone()) 70 | }).collect(), 71 | } 72 | }).collect() 73 | } 74 | } 75 | 76 | pub fn patches(&self) -> impl Iterator { 77 | 
self.txns.iter().flat_map(|txn| txn.patches.iter()) 78 | } 79 | } 80 | /// Reads the gzip-compressed JSON file at `filename` and parses it into a `TestData`. Panics, naming the file and the failing step, if it cannot be opened, decompressed or parsed. 81 | // TODO: Make a try_ version of this method, which returns an appropriate Error object. 82 | pub fn load_testing_data(filename: &str) -> TestData { 83 | // let start = SystemTime::now(); 84 | // let mut file = File::open("benchmark_data/automerge-paper.json.gz").unwrap(); 85 | let file = File::open(filename).unwrap_or_else(|e| panic!("failed to open {}: {}", filename, e)); 86 | 87 | let reader = BufReader::new(file); 88 | // We could pass the GzDecoder straight to serde, but it makes it way slower to parse for 89 | // some reason. 90 | let mut reader = GzDecoder::new(reader); 91 | let mut raw_json = Vec::new(); 92 | reader.read_to_end(&mut raw_json).unwrap_or_else(|e| panic!("failed to decompress {}: {}", filename, e)); 93 | 94 | // println!("uncompress time {}", start.elapsed().unwrap().as_millis()); 95 | 96 | // let start = SystemTime::now(); 97 | let data: TestData = serde_json::from_slice(&raw_json).unwrap_or_else(|e| panic!("failed to parse {}: {}", filename, e)); 98 | // println!("JSON parse time {}", start.elapsed().unwrap().as_millis()); 99 | 100 | data 101 | } 102 | 103 | #[cfg(test)] 104 | mod tests { 105 | use crate::{load_testing_data, TestData, TestPatch, TestTxn}; 106 | 107 | #[test] 108 | fn it_works() { 109 | let data = load_testing_data("../benchmark_data/sveltecomponent.json.gz"); 110 | assert!(!data.txns.is_empty()); 111 | } 112 | 113 | #[test] 114 | fn convert_chars_to_bytes() { 115 | let data = TestData { 116 | using_byte_positions: false, 117 | start_content: "".to_string(), 118 | end_content: "".to_string(), 119 | txns: vec![ 120 | TestTxn { 121 | patches: vec![ 122 | TestPatch(0, 0, "ツ".into()), 123 | TestPatch(1, 0, "x".into()), 124 | TestPatch(1, 1, "".into()), 125 | TestPatch(0, 1, "".into()), 126 | ], 127 | } 128 | ], 129 | }; 130 | 131 | // let data = load_testing_data("../benchmark_data/seph-blog1.json.gz"); 132 | // let data = load_testing_data("../benchmark_data/sveltecomponent.json.gz"); 133 | let data2 = data.chars_to_bytes(); 134 | dbg!(&data2); 135 | 136 | assert_eq!(data2, TestData { 137 | using_byte_positions: true, 138 
| start_content: "".to_string(), 139 | end_content: "".to_string(), 140 | txns: vec![ 141 | TestTxn { 142 | patches: vec![ 143 | // Positions have changed! 144 | TestPatch(0, 0, "ツ".into()), 145 | TestPatch(3, 0, "x".into()), 146 | TestPatch(3, 1, "".into()), 147 | TestPatch(0, 3, "".into()), 148 | ], 149 | } 150 | ], 151 | }); 152 | 153 | // dbg!(&data2); 154 | 155 | for (p1, p2) in data.patches().zip(data2.patches()) { 156 | // assert_eq!(p1.1, p2.1); 157 | assert_eq!(p1.2, p2.2); 158 | if p1.1 != p2.1 { 159 | println!("{} / {} ({} {})", p1.0, p2.0, p1.1, p1.2); 160 | } 161 | 162 | // if p1.2.chars().count() != p1.2.len() { 163 | // println!("unicode! {}", p1.2); 164 | // } 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /jumprope-wasm/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jumprope-wasm" 3 | version = "0.1.0" 4 | edition = "2018" 5 | 6 | [lib] 7 | crate-type = ["cdylib", "rlib"] 8 | 9 | [features] 10 | default = ["ddos_protection", "jumprope/wchar_conversion"] 11 | #default = ["ddos_protection"] 12 | #default = ["jumprope/wchar_conversion"] 13 | ddos_protection = ["jumprope/ddos_protection"] 14 | 15 | [dependencies] 16 | wasm-bindgen = "0.2" 17 | 18 | # Disable default-features to remove obsessive ddos protection by default to 19 | # reduce code size. (42kb -> 33kb). SmallRng is still plenty secure for most 20 | # applications. 21 | jumprope = { path = "..", default-features = false } 22 | 23 | # Needed for rand in wasm, regardless of whether entropy is used. 24 | getrandom = { version = "0.2.3", features = ["js"] } 25 | 26 | # The `console_error_panic_hook` crate provides better debugging of panics by 27 | # logging them with `console.error`. This is great for development, but requires 28 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for 29 | # code size when deploying. 
30 | #console_error_panic_hook = { version = "0.1.6", optional = true } 31 | -------------------------------------------------------------------------------- /jumprope-wasm/build_wasm.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | RUSTFLAGS="" 4 | #cd crates/diamond-wasm 5 | 6 | echo "=== Before ===" 7 | ls -l pkg 8 | echo "=== After ===" 9 | wasm-pack build --target web 10 | 11 | brotli -f pkg/*.wasm 12 | ls -l pkg 13 | -------------------------------------------------------------------------------- /jumprope-wasm/src/lib.rs: -------------------------------------------------------------------------------- 1 | use wasm_bindgen::prelude::*; 2 | use jumprope::JumpRope; 3 | 4 | #[wasm_bindgen] 5 | pub struct Rope(JumpRope); 6 | 7 | #[wasm_bindgen] 8 | impl Rope { 9 | /// Create a new rope, optionally with initial content. 10 | #[wasm_bindgen(constructor)] 11 | pub fn new(s: Option<String>) -> Self { 12 | // Can't use Option<&str> in wasm-bindgen for some reason. It doesn't matter much - 13 | // the passed string will be heap allocated anyway. 14 | 15 | let mut r = if cfg!(feature = "ddos_protection") { 16 | // Generating a rope from entropy adds 5kb to the binary size. 17 | JumpRope::new() 18 | } else { 19 | JumpRope::new_from_seed(321) 20 | }; 21 | if let Some(content) = s { 22 | r.insert(0, &content); 23 | } 24 | Self(r) 25 | } 26 | /// Create a rope whose initial content is `s`. 27 | #[wasm_bindgen] 28 | pub fn from(s: String) -> Self { 29 | Self::new(Some(s)) 30 | } 31 | 32 | /// Insert new content at the specified position. 33 | #[wasm_bindgen] 34 | pub fn insert(&mut self, pos: usize, content: &str) { 35 | self.0.insert(pos, content); 36 | } 37 | 38 | /// Remove (splice out) rope content of length del_len at the specified position. 
39 | #[wasm_bindgen] 40 | pub fn remove(&mut self, pos: usize, del_len: usize) { 41 | self.0.remove(pos..pos+del_len); 42 | } 43 | 44 | #[wasm_bindgen(js_name=toString)] 45 | pub fn as_string(&self) -> String { 46 | self.0.to_string() 47 | } 48 | 49 | #[wasm_bindgen(getter)] 50 | pub fn length(&self) -> usize { 51 | self.0.len_chars() 52 | } 53 | } 54 | 55 | #[cfg(test)] 56 | mod tests { 57 | use crate::Rope; 58 | 59 | #[test] 60 | fn smoke_test() { 61 | let mut r: Rope = Rope::new(None); 62 | assert_eq!(r.as_string(), ""); 63 | r.insert(0, "hi there"); 64 | assert_eq!(r.as_string(), "hi there"); 65 | r.remove(2, 4); 66 | assert_eq!(r.as_string(), "hire"); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /rope_benches/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Joseph Gentle "] 3 | edition = "2018" 4 | name = "rope_benches" 5 | version = "0.1.0" 6 | 7 | [build-dependencies] 8 | cc = "1.0" 9 | 10 | [dependencies] 11 | criterion = "0.3" 12 | jumprope = { path = ".." } 13 | rand = { version = "0.8", features = ["small_rng"] } 14 | ropey = "1.6.0" 15 | xi-rope = "0.3.0" 16 | an-rope = { version = "0.3.1", features = ["tendril"] } 17 | crdt-testdata = { path = "../crdt-testdata" } 18 | #crop = { path = "../../../3rdparty/crop" } 19 | crop = { git = "https://github.com/noib3/crop" } 20 | -------------------------------------------------------------------------------- /rope_benches/README.md: -------------------------------------------------------------------------------- 1 | # Rust rope benchmarks 2 | 3 | This is a small collection of benchmarks of various rope implementations in rust. 
4 | 5 | I'm comparing: 6 | 7 | - Jumprope (this library) 8 | - The [C version of this rope library](https://github.com/josephg/librope) 9 | - [ropey](https://crates.io/crates/ropey/) 10 | - [xi-rope](https://crates.io/crates/xi-rope) 11 | - [an-rope](https://crates.io/crates/an-rope) 12 | 13 | To run the benchmarks, navigate into this directory and run: 14 | 15 | ``` 16 | cargo run --release -- --bench 17 | ``` 18 | 19 | This will produce a report in *target/criterion/report/index.html*. 20 | 21 | Current benchmark results are published [here](https://home.seph.codes/public/c4/report/) -------------------------------------------------------------------------------- /rope_benches/build.rs: -------------------------------------------------------------------------------- 1 | extern crate cc; 2 | 3 | fn main() { 4 | cc::Build::new() 5 | .file("rope.c") 6 | .compile("librope"); 7 | } -------------------------------------------------------------------------------- /rope_benches/explore_parameters.js: -------------------------------------------------------------------------------- 1 | const asciichart = require('asciichart') 2 | const fs = require('fs') 3 | const {spawnSync} = require('child_process') 4 | 5 | const gmean = list => ( 6 | Math.pow(list.reduce((a, b) => a*b, 1), 1/list.length) 7 | ) 8 | 9 | const names = ["automerge-paper", "rustcode", "sveltecomponent", "seph-blog1"] 10 | const getScore = () => { 11 | 12 | const data = names.map(name => { 13 | const est_file = `../target/criterion/realworld/JumpRope/${name}/new/estimates.json` 14 | const estimates = JSON.parse(fs.readFileSync(est_file, 'utf8')) 15 | 16 | const bench_file = `../target/criterion/realworld/JumpRope/${name}/new/benchmark.json` 17 | const elements = JSON.parse(fs.readFileSync(bench_file, 'utf8')).throughput.Elements 18 | 19 | return elements / (estimates.mean.point_estimate / 1e9) 20 | }) 21 | // console.log(data) 22 | return data 23 | } 24 | 25 | const setSize = size => { 26 | // 
fs.writeFileSync('../src/params.rs', ` 27 | // pub const XX_SIZE: usize = 380; 28 | // pub const XX_BIAS: u8 = ${size}; 29 | // `) 30 | fs.writeFileSync('../src/params.rs', ` 31 | pub const XX_SIZE: usize = ${size}; 32 | pub const XX_BIAS: u8 = 65; 33 | `) 34 | } 35 | 36 | // const cmd = 'cargo build --release && sleep 3 && taskset 0x1 nice -10 cargo run --release -- --bench --measurement-time=3 -n realworld/JumpRope/automerge-paper' 37 | // const cmd = 'cargo build --release && taskset 0x1 nice -10 cargo run --release -- --bench --measurement-time=10 -n realworld/JumpRope/automerge-paper' 38 | const cmd = 'cargo build --release && taskset 0x1 nice -10 cargo run --release -- --bench --measurement-time=20 -n realworld/JumpRope' 39 | const bench = () => { 40 | spawnSync(cmd, { 41 | shell: true, 42 | stdio: 'inherit', 43 | }) 44 | } 45 | 46 | // setSize(100) 47 | 48 | const scores = {} 49 | // The first row is the sizes. second row contains mean. Then results. 50 | const scores_arr = new Array(names.length + 2).fill().map(() => []) 51 | 52 | const run = size => { 53 | setSize(size) 54 | bench() 55 | const vals = getScore() 56 | const gm = gmean(vals) 57 | scores[size] = gm 58 | scores_arr[0].push(size) 59 | scores_arr[1].push(gm) 60 | for (let i = 0; i < vals.length; i++) { 61 | scores_arr[i+2].push(vals[i]) 62 | } 63 | 64 | console.log(`Registered ${size} => ${gm} (${gm / 1e6})`) 65 | } 66 | 67 | // for (let s = 50; s <= 80; s += 5) { 68 | // run(s) 69 | // } 70 | for (let s = 380; s <= 400; s += 4) { 71 | // console.log(s) 72 | run(s) 73 | } 74 | // for (let s = 300; s <= 400; s += 20) { 75 | // run(s) 76 | // } 77 | console.table(scores) 78 | 79 | // run(200) 80 | // console.log(getScore()) 81 | 82 | 83 | const pad = arr => { 84 | let num = Math.round(80 / (arr.length-1)) 85 | const result = [arr[0]] 86 | for (let i = 1; i < arr.length; i++) { 87 | let prev = arr[i-1] 88 | let next = arr[i] 89 | 90 | for (let j = 1; j <= num; j++) { 91 | let weight = j/num 92 | 
result.push(next * weight + prev * (1-weight)) 93 | } 94 | } 95 | return result 96 | } 97 | 98 | const drawChart = scores_arr => { 99 | console.log(asciichart.plot(scores_arr.slice(1).map(pad), { 100 | colors: [ 101 | asciichart.white, 102 | asciichart.blue, asciichart.green, asciichart.red, asciichart.yellow 103 | ], 104 | height: 50, 105 | })) 106 | } 107 | 108 | // drawChart(JSON.parse(fs.readFileSync('data.json', 'utf8'))) 109 | 110 | drawChart(scores_arr) 111 | 112 | // console.log(asciichart.plot(pad([0, 2, 3]), { 113 | // colors: [asciichart.blue, asciichart.green, asciichart.red, asciichart.yellow], 114 | // height: 20, 115 | // })) 116 | 117 | fs.writeFileSync('data.json', JSON.stringify(scores_arr)) 118 | console.log('data written to data.json') -------------------------------------------------------------------------------- /rope_benches/rope.c: -------------------------------------------------------------------------------- 1 | // Implementation for rope library. 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | // Needed for VC++, which always compiles in C++ mode and doesn't have stdbool. 
8 | #ifndef __cplusplus 9 | #include 10 | #endif 11 | 12 | #include 13 | #include "rope.h" 14 | 15 | // The number of bytes the rope head structure takes up 16 | static const size_t ROPE_SIZE = sizeof(rope) + sizeof(rope_node) * ROPE_MAX_HEIGHT; 17 | 18 | // Create a new rope with no contents 19 | rope *rope_new2(void *(*alloc)(size_t bytes), 20 | void *(*realloc)(void *ptr, size_t newsize), 21 | void (*free)(void *ptr)) { 22 | rope *r = (rope *)alloc(ROPE_SIZE); 23 | r->num_chars = r->num_bytes = 0; 24 | 25 | r->alloc = alloc; 26 | r->realloc = realloc; 27 | r->free = free; 28 | 29 | r->head.height = 1; 30 | r->head.num_bytes = 0; 31 | r->head.nexts[0].node = NULL; 32 | r->head.nexts[0].skip_chars = 0; 33 | #if ROPE_WCHAR 34 | r->head.nexts[0].wchar_size = 0; 35 | #endif 36 | return r; 37 | } 38 | 39 | rope *rope_new() { 40 | return rope_new2(malloc, realloc, free); 41 | } 42 | 43 | // Create a new rope containing the specified string 44 | rope *rope_new_with_utf8(const uint8_t *str) { 45 | rope *r = rope_new(); 46 | ROPE_RESULT result = rope_insert(r, 0, str); 47 | 48 | if (result != ROPE_OK) { 49 | rope_free(r); 50 | return NULL; 51 | } else { 52 | return r; 53 | } 54 | } 55 | 56 | rope *rope_copy(const rope *other) { 57 | rope *r = (rope *)other->alloc(ROPE_SIZE); 58 | 59 | // Just copy most of the head's data. Note this won't copy the nexts list in head. 60 | *r = *other; 61 | 62 | rope_node *nodes[ROPE_MAX_HEIGHT]; 63 | 64 | for (int i = 0; i < other->head.height; i++) { 65 | nodes[i] = &r->head; 66 | // non-NULL next pointers will be rewritten below. 67 | r->head.nexts[i] = other->head.nexts[i]; 68 | } 69 | 70 | for (rope_node *n = other->head.nexts[0].node; n != NULL; n = n->nexts[0].node) { 71 | // I wonder if it would be faster if we took this opportunity to rebalance the node list..? 72 | size_t h = n->height; 73 | rope_node *n2 = (rope_node *)r->alloc(sizeof(rope_node) + h * sizeof(rope_skip_node)); 74 | 75 | // Would it be faster to just *n2 = *n; ? 
76 | n2->num_bytes = n->num_bytes; 77 | n2->height = h; 78 | memcpy(n2->str, n->str, n->num_bytes); 79 | memcpy(n2->nexts, n->nexts, h * sizeof(rope_skip_node)); 80 | 81 | for (size_t i = 0; i < h; i++) { 82 | nodes[i]->nexts[i].node = n2; 83 | nodes[i] = n2; 84 | } 85 | } 86 | 87 | return r; 88 | } 89 | 90 | // Free the specified rope 91 | void rope_free(rope *r) { 92 | assert(r); 93 | rope_node *next; 94 | 95 | for (rope_node *n = r->head.nexts[0].node; n != NULL; n = next) { 96 | next = n->nexts[0].node; 97 | r->free(n); 98 | } 99 | 100 | r->free(r); 101 | } 102 | 103 | // Get the number of characters in a rope 104 | size_t rope_char_count(const rope *r) { 105 | assert(r); 106 | return r->num_chars; 107 | } 108 | 109 | // Get the number of bytes which the rope would take up if stored as a utf8 110 | // string 111 | size_t rope_byte_count(const rope *r) { 112 | assert(r); 113 | return r->num_bytes; 114 | } 115 | 116 | // Copies the rope's contents into a utf8 encoded C string. Also copies a trailing '\0' character. 117 | // Returns the number of bytes written, which is rope_byte_count(r) + 1. 118 | size_t rope_write_cstr(rope *r, uint8_t *dest) { 119 | size_t num_bytes = rope_byte_count(r); 120 | dest[num_bytes] = '\0'; 121 | 122 | if (num_bytes) { 123 | uint8_t *p = dest; 124 | for (rope_node* restrict n = &r->head; n != NULL; n = n->nexts[0].node) { 125 | memcpy(p, n->str, n->num_bytes); 126 | p += n->num_bytes; 127 | } 128 | 129 | assert(p == &dest[num_bytes]); 130 | } 131 | return num_bytes + 1; 132 | } 133 | 134 | // Create a new C string which contains the rope. The string will contain 135 | // the rope encoded as utf8. 136 | uint8_t *rope_create_cstr(rope *r) { 137 | uint8_t *bytes = (uint8_t *)r->alloc(rope_byte_count(r) + 1); // Room for a zero. 
138 | rope_write_cstr(r, bytes); 139 | return bytes; 140 | } 141 | 142 | #if ROPE_WCHAR 143 | size_t rope_wchar_count(rope *r) { 144 | assert(r); 145 | return r->head.nexts[r->head.height - 1].wchar_size; 146 | } 147 | #endif 148 | 149 | #define MIN(x,y) ((x) > (y) ? (y) : (x)) 150 | #define MAX(x,y) ((x) > (y) ? (x) : (y)) 151 | 152 | #ifdef _WIN32 153 | inline static long random() { 154 | return rand(); 155 | } 156 | #endif 157 | 158 | static uint8_t random_height() { 159 | // This function is horribly inefficient. I'm throwing away heaps of entropy, and 160 | // the mod could be replaced by some clever shifting. 161 | // 162 | // However, random_height barely appears in the profiler output - so its probably 163 | // not worth investing the time to optimise. 164 | 165 | uint8_t height = 1; 166 | 167 | // The root node's height is the height of the largest node + 1, so the largest 168 | // node can only have ROPE_MAX_HEIGHT - 1. 169 | while(height < (ROPE_MAX_HEIGHT - 1) && (random() % 100) < ROPE_BIAS) { 170 | height++; 171 | } 172 | 173 | return height; 174 | } 175 | 176 | // Figure out how many bytes to allocate for a node with the specified height. 177 | static size_t node_size(uint8_t height) { 178 | return sizeof(rope_node) + height * sizeof(rope_skip_node); 179 | } 180 | 181 | // Allocate and return a new node. The new node will be full of junk, except 182 | // for its height. 183 | // This function should be replaced at some point with an object pool based version. 184 | static rope_node *alloc_node(rope *r, uint8_t height) { 185 | rope_node *node = (rope_node *)r->alloc(node_size(height)); 186 | node->height = height; 187 | return node; 188 | } 189 | 190 | // Find out how many bytes the unicode character which starts with the specified byte 191 | // will occupy in memory. 192 | // Returns the number of bytes, or SIZE_MAX if the byte is invalid. 
193 | static inline size_t codepoint_size(uint8_t byte) { 194 | if (byte == 0) { return SIZE_MAX; } // NULL byte. 195 | else if (byte <= 0x7f) { return 1; } // 0x7f = 0111 1111 196 | else if (byte <= 0xbf) { return SIZE_MAX; } // 1011 1111. Invalid for a starting byte. 197 | else if (byte <= 0xdf) { return 2; } // 1101 1111 198 | else if (byte <= 0xef) { return 3; } // 1110 1111 199 | else if (byte <= 0xf7) { return 4; } // 1111 0111 200 | else if (byte <= 0xfb) { return 5; } // 1111 1011 201 | else if (byte <= 0xfd) { return 6; } // 1111 1101 202 | else { return SIZE_MAX; } 203 | } 204 | 205 | // This little function counts how many bytes a certain number of characters take up. 206 | static size_t count_bytes_in_utf8(const uint8_t *str, size_t num_chars) { 207 | const uint8_t *p = str; 208 | for (unsigned int i = 0; i < num_chars; i++) { 209 | p += codepoint_size(*p); 210 | } 211 | return p - str; 212 | } 213 | 214 | #if ROPE_WCHAR 215 | 216 | #define NEEDS_TWO_WCHARS(x) (((x) & 0xf0) == 0xf0) 217 | 218 | static size_t count_wchars_in_utf8(const uint8_t *str, size_t num_chars) { 219 | size_t wchars = 0; 220 | for (unsigned int i = 0; i < num_chars; i++) { 221 | wchars += 1 + NEEDS_TWO_WCHARS(*str); 222 | str += codepoint_size(*str); 223 | } 224 | return wchars; 225 | } 226 | 227 | static size_t count_utf8_in_wchars(const uint8_t *str, size_t num_wchars) { 228 | size_t chars = num_wchars; 229 | for (unsigned int i = 0; i < num_wchars; i++) { 230 | if (NEEDS_TWO_WCHARS(*str)) { 231 | chars--; 232 | i++; 233 | } 234 | str += codepoint_size(*str); 235 | } 236 | return chars; 237 | } 238 | #endif 239 | 240 | // Count the number of characters (codepoints) in a NUL-terminated utf8 string. 241 | static size_t strlen_utf8(const uint8_t *str) { 242 | const uint8_t *p = str; 243 | size_t i = 0; 244 | while (*p) { 245 | p += codepoint_size(*p); 246 | i++; 247 | } 248 | return i; 249 | } 250 | 251 | // Checks if a UTF8 string is ok. 
Returns the number of bytes in the string if 252 | // it is ok, otherwise returns -1. 253 | static ssize_t bytelen_and_check_utf8(const uint8_t *str) { 254 | const uint8_t *p = str; 255 | while (*p != '\0') { 256 | size_t size = codepoint_size(*p); 257 | if (size == SIZE_MAX) return -1; 258 | p++; size--; 259 | while (size > 0) { 260 | // Check that any continuation bytes are of the form 10xx xxxx (binary) 261 | if ((*p & 0xc0) != 0x80) 262 | return -1; 263 | p++; size--; 264 | } 265 | } 266 | 267 | #ifdef DEBUG 268 | size_t num = p - str; 269 | assert(num == strlen((char *)str)); 270 | #endif 271 | 272 | return p - str; 273 | } 274 | 275 | typedef struct { 276 | // This stores the previous node at each height, and the number of characters from the start of 277 | // the previous node to the current iterator position. 278 | rope_skip_node s[ROPE_MAX_HEIGHT]; 279 | } rope_iter; 280 | 281 | // Internal function for navigating to a particular character offset in the rope. 282 | // The function returns the list of nodes which point past the position, as well as 283 | // offsets of how far into their character lists the specified characters are. 284 | static rope_node *iter_at_char_pos(rope *r, size_t char_pos, rope_iter *iter) { 285 | assert(char_pos <= r->num_chars); 286 | 287 | rope_node *e = &r->head; 288 | int height = r->head.height - 1; 289 | 290 | // Offset stores how many characters we still need to skip in the current node. 291 | size_t offset = char_pos; 292 | size_t skip; 293 | #if ROPE_WCHAR 294 | size_t wchar_pos = 0; // Current wchar pos from the start of the rope. 295 | #endif 296 | 297 | while (true) { 298 | skip = e->nexts[height].skip_chars; 299 | if (offset > skip) { 300 | // Go right. 301 | assert(e == &r->head || e->num_bytes); 302 | 303 | offset -= skip; 304 | #if ROPE_WCHAR 305 | wchar_pos += e->nexts[height].wchar_size; 306 | #endif 307 | e = e->nexts[height].node; 308 | } else { 309 | // Go down. 
310 | iter->s[height].skip_chars = offset; 311 | iter->s[height].node = e; 312 | #if ROPE_WCHAR 313 | iter->s[height].wchar_size = wchar_pos; 314 | #endif 315 | 316 | if (height == 0) { 317 | break; 318 | } else { 319 | height--; 320 | } 321 | } 322 | } 323 | 324 | #if ROPE_WCHAR 325 | // For some reason, this is _REALLY SLOW_. Like, 5.5Mops/s -> 4Mops/s from this block of code. 326 | wchar_pos += count_wchars_in_utf8(e->str, offset); 327 | 328 | // The iterator has the wchar pos from the start of the whole string. 329 | for (int i = 0; i < r->head.height; i++) { 330 | iter->s[i].wchar_size = wchar_pos - iter->s[i].wchar_size; 331 | } 332 | #endif 333 | 334 | assert(offset <= ROPE_NODE_STR_SIZE); 335 | assert(iter->s[0].node == e); 336 | return e; 337 | } 338 | 339 | #if ROPE_WCHAR 340 | // Equivalent of iter_at_char_pos, but for wchar positions instead. 341 | static rope_node *iter_at_wchar_pos(rope *r, size_t wchar_pos, rope_iter *iter) { 342 | int height = r->head.height - 1; 343 | assert(wchar_pos <= r->head.nexts[height].wchar_size); 344 | 345 | rope_node *e = &r->head; 346 | 347 | // Offset stores how many wchar characters we still need to skip in the current node. 348 | size_t offset = wchar_pos; 349 | size_t skip; 350 | size_t char_pos = 0; // Current char pos from the start of the rope. 351 | 352 | while (true) { 353 | skip = e->nexts[height].wchar_size; 354 | if (offset > skip) { 355 | // Go right. 356 | offset -= skip; 357 | char_pos += e->nexts[height].skip_chars; 358 | e = e->nexts[height].node; 359 | } else { 360 | // Go down. 361 | iter->s[height].skip_chars = char_pos; 362 | iter->s[height].node = e; 363 | iter->s[height].wchar_size = offset; 364 | 365 | if (height == 0) { 366 | break; 367 | } else { 368 | height--; 369 | } 370 | } 371 | } 372 | 373 | char_pos += count_utf8_in_wchars(e->str, offset); 374 | 375 | // The iterator has character positions from the start of the rope to the start of the node. 
376 | for (int i = 0; i < r->head.height; i++) { 377 | iter->s[i].skip_chars = char_pos - iter->s[i].skip_chars; 378 | } 379 | assert(e == iter->s[0].node); 380 | return e; 381 | } 382 | #endif 383 | 384 | #if ROPE_WCHAR 385 | static void update_offset_list(rope *r, rope_iter *iter, size_t num_chars, size_t num_wchars) { 386 | for (int i = 0; i < r->head.height; i++) { 387 | iter->s[i].node->nexts[i].skip_chars += num_chars; 388 | iter->s[i].node->nexts[i].wchar_size += num_wchars; 389 | } 390 | } 391 | #else 392 | static void update_offset_list(rope *r, rope_iter *iter, size_t num_chars) { 393 | for (int i = 0; i < r->head.height; i++) { 394 | iter->s[i].node->nexts[i].skip_chars += num_chars; 395 | } 396 | } 397 | #endif 398 | 399 | 400 | // Internal method of rope_insert. 401 | // This function creates a new node in the rope at the specified position and fills it with the 402 | // passed string. 403 | static void insert_at(rope *r, rope_iter *iter, 404 | const uint8_t *str, size_t num_bytes, size_t num_chars) { 405 | #if ROPE_WCHAR 406 | size_t num_wchars = count_wchars_in_utf8(str, num_chars); 407 | #endif 408 | 409 | // This describes how many levels of the iter are filled in. 410 | uint8_t max_height = r->head.height; 411 | uint8_t new_height = random_height(); 412 | rope_node *new_node = alloc_node(r, new_height); 413 | new_node->num_bytes = num_bytes; 414 | memcpy(new_node->str, str, num_bytes); 415 | 416 | assert(new_height < ROPE_MAX_HEIGHT); 417 | 418 | // Max height (the rope's head's height) must be 1+ the height of the largest node. 419 | while (max_height <= new_height) { 420 | r->head.height++; 421 | r->head.nexts[max_height] = r->head.nexts[max_height - 1]; 422 | 423 | // This is the position (offset from the start) of the rope. 424 | iter->s[max_height] = iter->s[max_height - 1]; 425 | max_height++; 426 | } 427 | 428 | // Fill in the new node's nexts array. 
429 | int i; 430 | for (i = 0; i < new_height; i++) { 431 | rope_skip_node *prev_skip = &iter->s[i].node->nexts[i]; 432 | new_node->nexts[i].node = prev_skip->node; 433 | new_node->nexts[i].skip_chars = num_chars + prev_skip->skip_chars - iter->s[i].skip_chars; 434 | 435 | 436 | prev_skip->node = new_node; 437 | prev_skip->skip_chars = iter->s[i].skip_chars; 438 | 439 | // & move the iterator to the end of the newly inserted node. 440 | iter->s[i].node = new_node; 441 | iter->s[i].skip_chars = num_chars; 442 | #if ROPE_WCHAR 443 | new_node->nexts[i].wchar_size = num_wchars + prev_skip->wchar_size - iter->s[i].wchar_size; 444 | prev_skip->wchar_size = iter->s[i].wchar_size; 445 | iter->s[i].wchar_size = num_wchars; 446 | #endif 447 | } 448 | 449 | for (; i < max_height; i++) { 450 | iter->s[i].node->nexts[i].skip_chars += num_chars; 451 | iter->s[i].skip_chars += num_chars; 452 | #if ROPE_WCHAR 453 | iter->s[i].node->nexts[i].wchar_size += num_wchars; 454 | iter->s[i].wchar_size += num_wchars; 455 | #endif 456 | } 457 | 458 | r->num_chars += num_chars; 459 | r->num_bytes += num_bytes; 460 | } 461 | 462 | // Insert the given utf8 string into the rope at the specified position. 463 | static ROPE_RESULT rope_insert_at_iter(rope *r, rope_node *e, rope_iter *iter, const uint8_t *str) { 464 | // iter.offset contains how far (in characters) into the current element to skip. 465 | // Figure out how much that is in bytes. 466 | size_t offset_bytes = 0; 467 | // The insertion offset into the destination node. 468 | size_t offset = iter->s[0].skip_chars; 469 | if (offset) { 470 | assert(offset <= e->nexts[0].skip_chars); 471 | offset_bytes = count_bytes_in_utf8(e->str, offset); 472 | } 473 | 474 | // We might be able to insert the new data into the current node, depending on 475 | // how big it is. We'll count the bytes, and also check that its valid utf8. 
476 | ssize_t num_inserted_bytes = bytelen_and_check_utf8(str); 477 | if (num_inserted_bytes == -1) return ROPE_INVALID_UTF8; 478 | 479 | // Can we insert into the current node? 480 | bool insert_here = e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE; 481 | 482 | // Can we insert into the subsequent node? 483 | rope_node *next = NULL; 484 | if (!insert_here && offset_bytes == e->num_bytes) { 485 | next = e->nexts[0].node; 486 | // We can insert into the subsequent node if: 487 | // - We can't insert into the current node 488 | // - There _is_ a next node to insert into 489 | // - The insert would be at the start of the next node 490 | // - There's room in the next node 491 | if (next && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE) { 492 | offset = offset_bytes = 0; 493 | for (int i = 0; i < next->height; i++) { 494 | iter->s[i].node = next; 495 | // tree offset nodes will not be used. 496 | } 497 | e = next; 498 | 499 | insert_here = true; 500 | } 501 | } 502 | 503 | if (insert_here) { 504 | // First move the current bytes later on in the string. 505 | if (offset_bytes < e->num_bytes) { 506 | memmove(&e->str[offset_bytes + num_inserted_bytes], 507 | &e->str[offset_bytes], 508 | e->num_bytes - offset_bytes); 509 | } 510 | 511 | // Then copy in the string bytes 512 | memcpy(&e->str[offset_bytes], str, num_inserted_bytes); 513 | e->num_bytes += num_inserted_bytes; 514 | 515 | r->num_bytes += num_inserted_bytes; 516 | size_t num_inserted_chars = strlen_utf8(str); 517 | r->num_chars += num_inserted_chars; 518 | 519 | // .... aaaand update all the offset amounts. 520 | #if ROPE_WCHAR 521 | size_t num_inserted_wchars = count_wchars_in_utf8(str, num_inserted_chars); 522 | update_offset_list(r, iter, num_inserted_chars, num_inserted_wchars); 523 | #else 524 | update_offset_list(r, iter, num_inserted_chars); 525 | #endif 526 | 527 | } else { 528 | // There isn't room. We'll need to add at least one new node to the rope. 
529 | 530 | // If we're not at the end of the current node, we'll need to remove 531 | // the end of the current node's data and reinsert it later. 532 | size_t num_end_chars = 0, num_end_bytes = e->num_bytes - offset_bytes; 533 | if (num_end_bytes) { 534 | // We'll pretend like the characters have been deleted from the node, while leaving 535 | // the bytes themselves there (for later). 536 | e->num_bytes = offset_bytes; 537 | num_end_chars = e->nexts[0].skip_chars - offset; 538 | #if ROPE_WCHAR 539 | size_t num_end_wchars = count_wchars_in_utf8(&e->str[offset_bytes], num_end_chars); 540 | update_offset_list(r, iter, -num_end_chars, -num_end_wchars); 541 | #else 542 | update_offset_list(r, iter, -num_end_chars); 543 | #endif 544 | 545 | r->num_chars -= num_end_chars; 546 | r->num_bytes -= num_end_bytes; 547 | } 548 | 549 | // Now we insert new nodes containing the new character data. The data must be broken into 550 | // pieces with a maximum size of ROPE_NODE_STR_SIZE. Node boundaries must not occur in the 551 | // middle of a utf8 codepoint. 
552 | ssize_t str_offset = 0; 553 | while (str_offset < num_inserted_bytes) { 554 | size_t new_node_bytes = 0; 555 | size_t new_node_chars = 0; 556 | 557 | while (str_offset + new_node_bytes < (size_t)num_inserted_bytes) { 558 | size_t cs = codepoint_size(str[str_offset + new_node_bytes]); 559 | if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) { 560 | break; 561 | } else { 562 | new_node_bytes += cs; 563 | new_node_chars++; 564 | } 565 | } 566 | 567 | insert_at(r, iter, &str[str_offset], new_node_bytes, new_node_chars); 568 | str_offset += new_node_bytes; 569 | } 570 | 571 | if (num_end_bytes) { 572 | insert_at(r, iter, &e->str[offset_bytes], num_end_bytes, num_end_chars); 573 | } 574 | } 575 | 576 | return ROPE_OK; 577 | } 578 | 579 | ROPE_RESULT rope_insert(rope *r, size_t pos, const uint8_t *str) { 580 | assert(r); 581 | assert(str); 582 | #ifdef DEBUG 583 | _rope_check(r); 584 | #endif 585 | pos = MIN(pos, r->num_chars); 586 | 587 | rope_iter iter; 588 | // First we need to search for the node where we'll insert the string. 589 | rope_node *e = iter_at_char_pos(r, pos, &iter); 590 | 591 | ROPE_RESULT result = rope_insert_at_iter(r, e, &iter, str); 592 | 593 | #ifdef DEBUG 594 | _rope_check(r); 595 | #endif 596 | 597 | return result; 598 | } 599 | 600 | #if ROPE_WCHAR 601 | // Insert the given utf8 string into the rope at the specified position. 602 | size_t rope_insert_at_wchar(rope *r, size_t wchar_pos, const uint8_t *str) { 603 | assert(r); 604 | assert(str); 605 | #ifdef DEBUG 606 | _rope_check(r); 607 | #endif 608 | wchar_pos = MIN(wchar_pos, rope_wchar_count(r)); 609 | 610 | rope_iter iter; 611 | // First we need to search for the node where we'll insert the string. 
612 | rope_node *e = iter_at_wchar_pos(r, wchar_pos, &iter); 613 | size_t pos = iter.s[r->head.height - 1].skip_chars; 614 | rope_insert_at_iter(r, e, &iter, str); 615 | 616 | #ifdef DEBUG 617 | _rope_check(r); 618 | #endif 619 | return pos; 620 | } 621 | 622 | #endif 623 | 624 | // Delete num characters at position pos. Deleting past the end of the string 625 | // has no effect. 626 | static void rope_del_at_iter(rope *r, rope_node *e, rope_iter *iter, size_t length) { 627 | r->num_chars -= length; 628 | size_t offset = iter->s[0].skip_chars; 629 | while (length) { 630 | if (offset == e->nexts[0].skip_chars) { 631 | // End of the current node. Skip to the start of the next one. 632 | e = iter->s[0].node->nexts[0].node; 633 | offset = 0; 634 | } 635 | 636 | size_t num_chars = e->nexts[0].skip_chars; 637 | size_t removed = MIN(length, num_chars - offset); 638 | #if ROPE_WCHAR 639 | size_t removed_wchars; 640 | #endif 641 | 642 | int i; 643 | if (removed < num_chars || e == &r->head) { 644 | // Just trim this node down to size. 
645 | size_t leading_bytes = count_bytes_in_utf8(e->str, offset); 646 | size_t removed_bytes = count_bytes_in_utf8(&e->str[leading_bytes], removed); 647 | size_t trailing_bytes = e->num_bytes - leading_bytes - removed_bytes; 648 | #if ROPE_WCHAR 649 | removed_wchars = count_wchars_in_utf8(&e->str[leading_bytes], removed); 650 | #endif 651 | if (trailing_bytes) { 652 | memmove(&e->str[leading_bytes], &e->str[leading_bytes + removed_bytes], trailing_bytes); 653 | } 654 | e->num_bytes -= removed_bytes; 655 | r->num_bytes -= removed_bytes; 656 | 657 | for (i = 0; i < e->height; i++) { 658 | e->nexts[i].skip_chars -= removed; 659 | #if ROPE_WCHAR 660 | e->nexts[i].wchar_size -= removed_wchars; 661 | #endif 662 | } 663 | } else { 664 | // Remove the node from the list 665 | #if ROPE_WCHAR 666 | removed_wchars = e->nexts[0].wchar_size; 667 | #endif 668 | for (i = 0; i < e->height; i++) { 669 | iter->s[i].node->nexts[i].node = e->nexts[i].node; 670 | iter->s[i].node->nexts[i].skip_chars += e->nexts[i].skip_chars - removed; 671 | #if ROPE_WCHAR 672 | iter->s[i].node->nexts[i].wchar_size += e->nexts[i].wchar_size - removed_wchars; 673 | #endif 674 | } 675 | 676 | r->num_bytes -= e->num_bytes; 677 | // TODO: Recycle e. 678 | rope_node *next = e->nexts[0].node; 679 | r->free(e); 680 | e = next; 681 | } 682 | 683 | for (; i < r->head.height; i++) { 684 | iter->s[i].node->nexts[i].skip_chars -= removed; 685 | #if ROPE_WCHAR 686 | iter->s[i].node->nexts[i].wchar_size -= removed_wchars; 687 | #endif 688 | } 689 | 690 | length -= removed; 691 | } 692 | } 693 | 694 | void rope_del(rope *r, size_t pos, size_t length) { 695 | #ifdef DEBUG 696 | _rope_check(r); 697 | #endif 698 | 699 | assert(r); 700 | pos = MIN(pos, r->num_chars); 701 | length = MIN(length, r->num_chars - pos); 702 | 703 | rope_iter iter; 704 | 705 | // Search for the node where the deletion starts. 
706 | rope_node *e = iter_at_char_pos(r, pos, &iter); 707 | 708 | rope_del_at_iter(r, e, &iter, length); 709 | 710 | #ifdef DEBUG 711 | _rope_check(r); 712 | #endif 713 | } 714 | 715 | #if ROPE_WCHAR 716 | size_t rope_del_at_wchar(rope *r, size_t wchar_pos, size_t wchar_num, size_t *char_len_out) { 717 | #ifdef DEBUG 718 | _rope_check(r); 719 | #endif 720 | 721 | assert(r); 722 | size_t wchar_total = rope_wchar_count(r); 723 | wchar_pos = MIN(wchar_pos, wchar_total); 724 | wchar_num = MIN(wchar_num, wchar_total - wchar_pos); 725 | 726 | rope_iter iter; 727 | 728 | // Search for the node where the deletion starts. 729 | rope_node *start = iter_at_wchar_pos(r, wchar_pos, &iter); 730 | size_t char_pos = iter.s[r->head.height - 1].skip_chars; 731 | 732 | rope_iter end_iter; 733 | int h = r->head.height - 1; 734 | iter_at_wchar_pos(r, iter.s[h].wchar_size + wchar_num, &end_iter); 735 | 736 | size_t char_length = end_iter.s[h].skip_chars - iter.s[h].skip_chars; 737 | rope_del_at_iter(r, start, &iter, char_length); 738 | 739 | #ifdef DEBUG 740 | _rope_check(r); 741 | #endif 742 | if (char_len_out) { 743 | *char_len_out = char_length; 744 | } 745 | return char_pos; 746 | } 747 | #endif 748 | 749 | void _rope_check(rope *r) { 750 | assert(r->head.height); // Even empty ropes have a height of 1. 751 | assert(r->num_bytes >= r->num_chars); 752 | 753 | rope_skip_node skip_over = r->head.nexts[r->head.height - 1]; 754 | assert(skip_over.skip_chars == r->num_chars); 755 | assert(skip_over.node == NULL); 756 | 757 | size_t num_bytes = 0; 758 | size_t num_chars = 0; 759 | #if ROPE_WCHAR 760 | size_t num_wchar = 0; 761 | #endif 762 | 763 | // The offsets here are used to store the total distance travelled from the start 764 | // of the rope. 
765 | rope_iter iter = {}; 766 | for (int i = 0; i < r->head.height; i++) { 767 | iter.s[i].node = &r->head; 768 | } 769 | 770 | for (rope_node *n = &r->head; n != NULL; n = n->nexts[0].node) { 771 | assert(n == &r->head || n->num_bytes); 772 | assert(n->height <= ROPE_MAX_HEIGHT); 773 | assert(count_bytes_in_utf8(n->str, n->nexts[0].skip_chars) == n->num_bytes); 774 | #if ROPE_WCHAR 775 | assert(count_wchars_in_utf8(n->str, n->nexts[0].skip_chars) == n->nexts[0].wchar_size); 776 | #endif 777 | for (int i = 0; i < n->height; i++) { 778 | assert(iter.s[i].node == n); 779 | assert(iter.s[i].skip_chars == num_chars); 780 | iter.s[i].node = n->nexts[i].node; 781 | iter.s[i].skip_chars += n->nexts[i].skip_chars; 782 | #if ROPE_WCHAR 783 | assert(iter.s[i].wchar_size == num_wchar); 784 | iter.s[i].wchar_size += n->nexts[i].wchar_size; 785 | #endif 786 | } 787 | 788 | num_bytes += n->num_bytes; 789 | num_chars += n->nexts[0].skip_chars; 790 | #if ROPE_WCHAR 791 | num_wchar += n->nexts[0].wchar_size; 792 | #endif 793 | } 794 | 795 | for (int i = 0; i < r->head.height; i++) { 796 | assert(iter.s[i].node == NULL); 797 | assert(iter.s[i].skip_chars == num_chars); 798 | #if ROPE_WCHAR 799 | assert(iter.s[i].wchar_size == num_wchar); 800 | #endif 801 | } 802 | 803 | assert(r->num_bytes == num_bytes); 804 | assert(r->num_chars == num_chars); 805 | #if ROPE_WCHAR 806 | assert(skip_over.wchar_size == num_wchar); 807 | #endif 808 | } 809 | 810 | // For debugging. 
811 | #include 812 | void _rope_print(rope *r) { 813 | printf("chars: %zd\tbytes: %zd\theight: %d\n", r->num_chars, r->num_bytes, r->head.height); 814 | 815 | printf("HEAD"); 816 | for (int i = 0; i < r->head.height; i++) { 817 | printf(" |%3zd ", r->head.nexts[i].skip_chars); 818 | } 819 | printf("\n"); 820 | 821 | int num = 0; 822 | for (rope_node *n = &r->head; n != NULL; n = n->nexts[0].node) { 823 | printf("%3d:", num++); 824 | for (int i = 0; i < n->height; i++) { 825 | printf(" |%3zd ", n->nexts[i].skip_chars); 826 | } 827 | printf(" : \""); 828 | fwrite(n->str, n->num_bytes, 1, stdout); 829 | printf("\"\n"); 830 | } 831 | } 832 | -------------------------------------------------------------------------------- /rope_benches/rope.h: -------------------------------------------------------------------------------- 1 | /* UTF-8 Rope implementation by Joseph Gentle 2 | * 3 | * This library implements a heavyweight utf8 string type with fast 4 | * insert-at-position and delete-at-position operations. 5 | * 6 | * It uses skip lists instead of trees. Trees might be faster - who knows? 7 | * 8 | * Ropes are not synchronized. Do not access the same rope from multiple threads 9 | * simultaneously. 10 | */ 11 | 12 | #ifndef librope_rope_h 13 | #define librope_rope_h 14 | 15 | #include 16 | #include 17 | 18 | // Whether or not the rope should support converting UTF-8 character offsets to 19 | // wchar array positions. This is useful when interoperating with strings in 20 | // JS, Objective-C and many other languages. See 21 | // http://josephg.com/post/31707645955/string-length-lies 22 | // 23 | // Adding wchar conversion support decreases performance by about 30%. 24 | #ifndef ROPE_WCHAR 25 | #define ROPE_WCHAR 0 26 | #endif 27 | 28 | // These two magic values seem to be approximately optimal given the benchmark 29 | // in tests.c which does lots of small inserts. 30 | 31 | // Must be <= UINT16_MAX. 
Benchmarking says this is pretty close to optimal 32 | // (tested on a mac using clang 4.0 and x86_64). 33 | #ifndef ROPE_NODE_STR_SIZE 34 | #if ROPE_WCHAR 35 | #define ROPE_NODE_STR_SIZE 64 36 | #else 37 | #define ROPE_NODE_STR_SIZE 136 38 | #endif 39 | #endif 40 | 41 | // The likelihood (%) a node will have height (n+1) instead of n 42 | #ifndef ROPE_BIAS 43 | #define ROPE_BIAS 25 44 | #endif 45 | 46 | // The rope will stop being efficient after the string is 2 ^ ROPE_MAX_HEIGHT 47 | // nodes. 48 | #ifndef ROPE_MAX_HEIGHT 49 | #define ROPE_MAX_HEIGHT 20 50 | #endif 51 | 52 | struct rope_node_t; 53 | 54 | // The number of characters in str can be read out of nexts[0].skip_chars. 55 | typedef struct { 56 | // The number of _characters_ between the start of the current node 57 | // and the start of next. 58 | size_t skip_chars; 59 | 60 | // For some reason, librope runs about 1% faster when this next pointer is 61 | // exactly _here_ in the struct. 62 | struct rope_node_t *node; 63 | 64 | #if ROPE_WCHAR 65 | // The number of wide characters contained in space. 66 | size_t wchar_size; 67 | #endif 68 | } rope_skip_node; 69 | 70 | typedef struct rope_node_t { 71 | uint8_t str[ROPE_NODE_STR_SIZE]; 72 | 73 | // The number of bytes in str in use 74 | uint16_t num_bytes; 75 | 76 | // This is the number of elements allocated in nexts. 77 | // Taller nodes are progressively less likely; the minimum height is 1. NOTE(review): the 1/2 ratio previously stated here disagrees with the ROPE_BIAS default of 25 (%) - confirm against the height generator in rope.c. 78 | uint8_t height; 79 | 80 | rope_skip_node nexts[]; 81 | } rope_node; 82 | 83 | typedef struct { 84 | // The total number of characters in the rope. 85 | size_t num_chars; 86 | 87 | // The total number of bytes which the characters in the rope take up. 88 | size_t num_bytes; 89 | 90 | void *(*alloc)(size_t bytes); 91 | void *(*realloc)(void *ptr, size_t newsize); 92 | void (*free)(void *ptr); 93 | 94 | // The first node exists inline in the rope structure itself.
95 | rope_node head; 96 | } rope; 97 | 98 | #ifdef __cplusplus 99 | extern "C" { 100 | #endif 101 | 102 | // Create a new rope with no contents 103 | rope *rope_new(); 104 | 105 | // Create a new rope using custom allocators. 106 | rope *rope_new2(void *(*alloc)(size_t bytes), 107 | void *(*realloc)(void *ptr, size_t newsize), 108 | void (*free)(void *ptr)); 109 | 110 | // Create a new rope containing a copy of the given string. Shorthand for 111 | // r = rope_new(); rope_insert(r, 0, str); 112 | rope *rope_new_with_utf8(const uint8_t *str); 113 | 114 | // Make a copy of an existing rope 115 | rope *rope_copy(const rope *r); 116 | 117 | // Free the specified rope 118 | void rope_free(rope *r); 119 | 120 | // Get the number of characters in a rope 121 | size_t rope_char_count(const rope *r); 122 | 123 | // Get the number of bytes which the rope would take up if stored as a utf8 124 | // string 125 | size_t rope_byte_count(const rope *r); 126 | 127 | // Copies the rope's contents into a utf8 encoded C string. Also copies a 128 | // trailing '\0' character. 129 | // Returns the number of bytes written, which is rope_byte_count(r) + 1. 130 | size_t rope_write_cstr(rope *r, uint8_t *dest); 131 | 132 | // Create a new C string which contains the rope. The string will contain 133 | // the rope encoded as utf8, followed by a trailing '\0'. 134 | // Use rope_byte_count(r) to get the length of the returned string. 135 | uint8_t *rope_create_cstr(rope *r); 136 | 137 | // If you try to insert data into the rope with an invalid UTF8 encoding, 138 | // nothing will happen and we'll return ROPE_INVALID_UTF8. 139 | typedef enum { ROPE_OK, ROPE_INVALID_UTF8 } ROPE_RESULT; 140 | 141 | // Insert the given utf8 string into the rope at the specified position. 142 | ROPE_RESULT rope_insert(rope *r, size_t pos, const uint8_t *str); 143 | 144 | // Delete num characters at position pos. Deleting past the end of the string 145 | // has no effect. 
146 | void rope_del(rope *r, size_t pos, size_t num); 147 | 148 | // This macro expands to a for() loop header which loops over the segments in a 149 | // rope. 150 | // 151 | // Eg: 152 | // rope *r = rope_new_with_utf8(str); 153 | // ROPE_FOREACH(r, iter) { 154 | // printf("%s", rope_node_data(iter)); 155 | // } 156 | #define ROPE_FOREACH(rope, iter) \ 157 | for (rope_node *iter = &(rope)->head; iter != NULL; iter = iter->nexts[0].node) 158 | 159 | // Get the actual data inside a rope node. 160 | static inline uint8_t *rope_node_data(rope_node *n) { 161 | return n->str; 162 | } 163 | 164 | // Get the number of bytes inside a rope node. This is useful when you're 165 | // looping through a rope. 166 | static inline size_t rope_node_num_bytes(rope_node *n) { 167 | return n->num_bytes; 168 | } 169 | 170 | // Get the number of characters inside a rope node. 171 | static inline size_t rope_node_chars(rope_node *n) { 172 | return n->nexts[0].skip_chars; 173 | } 174 | 175 | #if ROPE_WCHAR 176 | // Get the number of wchar characters in the rope 177 | size_t rope_wchar_count(rope *r); 178 | 179 | // Insert the given utf8 string into the rope at the specified wchar position. 180 | // This is compatible with NSString, Javascript, etc. The string still needs to 181 | // be passed in using UTF-8. 182 | // 183 | // Returns the insertion position in characters. 184 | size_t rope_insert_at_wchar(rope *r, size_t wchar_pos, const uint8_t *utf8_str); 185 | 186 | // Delete wchar_num wide characters at the specified wchar position offset. 187 | // If the range is inside character boundaries, behaviour is undefined. 188 | // 189 | // Returns the deletion position in characters. *char_len_out is set to the 190 | // deletion length, in chars if its not null. 191 | size_t rope_del_at_wchar(rope *r, size_t wchar_pos, size_t wchar_num, size_t *char_len_out); 192 | 193 | // Get the number of wchars inside a rope node. This is useful when you're 194 | // looping through a rope. 
195 | static inline size_t rope_node_wchars(rope_node *n) { 196 | return n->nexts[0].wchar_size; 197 | } 198 | #endif 199 | 200 | 201 | 202 | // For debugging. 203 | void _rope_check(rope *r); 204 | void _rope_print(rope *r); 205 | 206 | #ifdef __cplusplus 207 | } 208 | #endif 209 | 210 | #endif 211 | -------------------------------------------------------------------------------- /rope_benches/src/edittablestr.rs: -------------------------------------------------------------------------------- 1 | use super::Rope; 2 | use std::ptr; 3 | 4 | // pub trait EditableText { 5 | // // pos is in utf8 codepoints 6 | // fn insert_at(&mut self, pos: usize, contents: &str); 7 | // fn remove_at(&mut self, pos: usize, length: usize); 8 | // } 9 | 10 | impl Rope for String { 11 | const NAME: &'static str = "String"; 12 | 13 | fn new() -> Self { String::new() } 14 | 15 | fn insert_at(&mut self, char_pos: usize, contents: &str) { 16 | // If you try to write past the end of the string for now I'll just write at the end. 17 | // Panicing might be a better policy. 18 | let byte_pos = self.char_indices().skip(char_pos).next() 19 | .map(|(p, _)| p).unwrap_or(self.len()); 20 | //println!("pos {}", byte_pos); 21 | //self.insert_str(byte_pos, contents); 22 | 23 | let old_len = self.len(); 24 | let new_bytes = contents.len(); 25 | 26 | // This didn't work because it didn't change the string's length 27 | //self.reserve(new_bytes); 28 | 29 | // This is sort of ugly but its fine. 
30 | for _ in 0..new_bytes { self.push('\0'); } 31 | 32 | //println!("new bytes {} {} {}", new_bytes, byte_pos, self.len() - byte_pos); 33 | unsafe { 34 | let bytes = self.as_mut_vec().as_mut_ptr(); 35 | //println!("{:?}", self.as_mut_vec()); 36 | ptr::copy( 37 | bytes.offset(byte_pos as isize), 38 | bytes.offset((byte_pos + new_bytes) as isize), 39 | old_len - byte_pos 40 | ); 41 | ptr::copy_nonoverlapping( 42 | contents.as_ptr(), 43 | bytes.offset(byte_pos as isize), 44 | new_bytes 45 | ); 46 | //println!("{:?}", self.as_mut_vec()); 47 | } 48 | } 49 | fn del_at(&mut self, pos: usize, length: usize) { 50 | let byte_range = { 51 | let mut iter = self.char_indices().map(|(p, _)| p).skip(pos).peekable(); 52 | 53 | let start = iter.peek().map_or_else(|| self.len(), |&p| p); 54 | let mut iter = iter.skip(length).peekable(); 55 | let end = iter.peek().map_or_else(|| self.len(), |&p| p); 56 | 57 | start..end 58 | }; 59 | 60 | self.drain(byte_range); 61 | } 62 | 63 | // fn len(&self) -> usize { self.len() } 64 | fn char_len(&self) -> usize { self.chars().count() } 65 | fn to_string(&self) -> String { self.clone() } 66 | } 67 | 68 | 69 | 70 | #[cfg(test)] 71 | mod tests { 72 | use super::Rope; 73 | 74 | #[test] 75 | fn insert_simple() { 76 | let mut s = "".to_string(); 77 | s.insert_at(0, "hi"); 78 | assert_eq!(s, "hi"); 79 | 80 | let mut s = "a".to_string(); 81 | s.insert_at(0, "hi"); 82 | assert_eq!(s, "hia"); 83 | 84 | let mut s = "a".to_string(); 85 | s.insert_at(1, "hi"); 86 | assert_eq!(s, "ahi"); 87 | 88 | let mut s = "ac".to_string(); 89 | s.insert_at(1, "b"); 90 | assert_eq!(s, "abc"); 91 | } 92 | 93 | #[test] 94 | fn insert_unicode() { 95 | // I mean, its all unicode but .... 
96 | let mut s = "𝄞𝄞".to_string(); 97 | s.insert_at(0, "à"); 98 | assert_eq!(s, "à𝄞𝄞"); 99 | s.insert_at(2, "ë"); 100 | assert_eq!(s, "à𝄞ë𝄞"); 101 | s.insert_at(4, "ç"); 102 | assert_eq!(s, "à𝄞ë𝄞ç"); 103 | s.insert_at(6, "𝒲"); 104 | assert_eq!(s, "à𝄞ë𝄞ç𝒲"); 105 | } 106 | 107 | #[test] 108 | fn remove_simple() { 109 | let mut s = "à".to_string(); 110 | s.del_at(0, 1); 111 | assert_eq!(s, ""); 112 | s.del_at(0, 0); 113 | assert_eq!(s, ""); 114 | 115 | let mut s = "à𝄞ç".to_string(); 116 | s.del_at(0, 1); 117 | assert_eq!(s, "𝄞ç"); 118 | s.del_at(1, 1); 119 | assert_eq!(s, "𝄞"); 120 | s.del_at(0, 1); 121 | assert_eq!(s, ""); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /rope_benches/src/main.rs: -------------------------------------------------------------------------------- 1 | 2 | // #[macro_use] 3 | extern crate criterion; 4 | use criterion::*; 5 | 6 | use crdt_testdata::*; 7 | 8 | // extern crate rand; 9 | // use rand::seq::IteratorRandom; 10 | use rand::prelude::*; 11 | 12 | mod rope; 13 | use self::rope::*; 14 | use jumprope::*; 15 | 16 | mod edittablestr; 17 | 18 | use std::cmp::min; 19 | 20 | use ropey::Rope as RopeyRope; 21 | use an_rope::Rope as AnRope; 22 | use xi_rope::Rope as XiRope; 23 | use crop::Rope as CropRope; 24 | 25 | const CHARS: &[u8; 83] = b" ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()[]{}<>?,./"; 26 | 27 | // Gross. Find a way to reuse the code from random_unicode_string. 28 | fn random_ascii_string(rng: &mut SmallRng, len: usize) -> String { 29 | let mut s = String::new(); 30 | for _ in 0..len { 31 | // s.push(*rng.choose(CHARS).unwrap() as char); 32 | s.push(CHARS[rng.gen_range(0 .. 
CHARS.len())] as char); 33 | } 34 | s 35 | } 36 | 37 | impl Rope for JumpRope { 38 | const NAME: &'static str = "JumpRope"; 39 | 40 | #[inline(always)] 41 | fn new() -> Self { JumpRope::new() } 42 | 43 | #[inline(always)] 44 | fn insert_at(&mut self, pos: usize, contents: &str) { self.insert(pos, contents); } 45 | #[inline(always)] 46 | fn del_at(&mut self, pos: usize, len: usize) { self.remove(pos..pos+len); } 47 | 48 | #[inline(always)] 49 | fn edit_at(&mut self, pos: usize, del_len: usize, ins_content: &str) { 50 | self.replace(pos..pos+del_len, ins_content); 51 | } 52 | 53 | #[inline(always)] 54 | fn to_string(&self) -> String { ToString::to_string(self) } 55 | 56 | #[inline(always)] 57 | fn char_len(&self) -> usize { self.len_chars() } // in unicode values 58 | } 59 | 60 | impl Rope for JumpRopeBuf { 61 | const NAME: &'static str = "JumpRopeBuf"; 62 | 63 | #[inline(always)] 64 | fn new() -> Self { JumpRopeBuf::new() } 65 | 66 | #[inline(always)] 67 | fn insert_at(&mut self, pos: usize, contents: &str) { self.insert(pos, contents); } 68 | #[inline(always)] 69 | fn del_at(&mut self, pos: usize, len: usize) { self.remove(pos..pos+len); } 70 | 71 | #[inline(always)] 72 | fn edit_at(&mut self, pos: usize, del_len: usize, ins_content: &str) { 73 | if del_len > 0 { 74 | self.remove(pos..pos + del_len); 75 | } 76 | if !ins_content.is_empty() { 77 | self.insert(pos, ins_content); 78 | } 79 | } 80 | 81 | #[inline(always)] 82 | fn to_string(&self) -> String { ToString::to_string(self) } 83 | 84 | #[inline(always)] 85 | fn char_len(&self) -> usize { self.len_chars() } // in unicode values 86 | } 87 | 88 | impl Rope for AnRope { 89 | const NAME: &'static str = "AnRope"; 90 | 91 | #[inline(always)] 92 | fn new() -> Self { AnRope::new() } 93 | 94 | #[inline(always)] 95 | fn insert_at(&mut self, pos: usize, contents: &str) { *self = self.insert_str(pos, contents); } 96 | #[inline(always)] 97 | fn del_at(&mut self, pos: usize, len: usize) { *self = self.delete(pos..pos+len); } 
98 | 99 | #[inline(always)] 100 | fn to_string(&self) -> String { ToString::to_string(self) } 101 | 102 | #[inline(always)] 103 | fn char_len(&self) -> usize { self.len() } // in unicode values 104 | } 105 | 106 | impl Rope for XiRope { 107 | const NAME: &'static str = "XiRope"; 108 | 109 | #[inline(always)] 110 | fn new() -> Self { XiRope::from("") } 111 | 112 | #[inline(always)] 113 | fn insert_at(&mut self, pos: usize, contents: &str) { 114 | self.edit(pos..pos, contents); 115 | } 116 | #[inline(always)] 117 | fn del_at(&mut self, pos: usize, len: usize) { 118 | self.edit(pos..pos+len, ""); 119 | } 120 | #[inline(always)] 121 | fn edit_at(&mut self, pos: usize, del_len: usize, ins_content: &str) { 122 | self.edit(pos..pos+del_len, ins_content); 123 | } 124 | 125 | #[inline(always)] 126 | fn to_string(&self) -> String { 127 | String::from(self) 128 | } 129 | 130 | #[inline(always)] 131 | fn char_len(&self) -> usize { 132 | let mut len = 0; 133 | for s in self.iter_chunks(..) { 134 | len += s.chars().count(); 135 | } 136 | len 137 | } // in unicode values 138 | } 139 | 140 | impl Rope for RopeyRope { 141 | const NAME: &'static str = "Ropey"; 142 | 143 | #[inline(always)] 144 | fn new() -> Self { RopeyRope::new() } 145 | 146 | #[inline(always)] 147 | fn insert_at(&mut self, pos: usize, contents: &str) { 148 | self.insert(pos, contents); 149 | } 150 | #[inline(always)] 151 | fn del_at(&mut self, pos: usize, len: usize) { 152 | self.remove(pos..pos+len); 153 | } 154 | // fn del_at>(&mut self, range: R); 155 | 156 | // fn slice(&self, pos: usize, len: usize) -> Result; 157 | 158 | #[inline(always)] 159 | fn to_string(&self) -> String { unimplemented!() } 160 | 161 | #[inline(always)] 162 | fn char_len(&self) -> usize { self.len_chars() } // in unicode values 163 | } 164 | 165 | impl Rope for CropRope { 166 | const NAME: &'static str = "Crop"; 167 | const EDITS_USE_BYTE_OFFSETS: bool = true; 168 | 169 | fn new() -> Self { 170 | Self::new() 171 | } 172 | 173 | fn 
insert_at(&mut self, pos: usize, contents: &str) { 174 | self.insert(pos, contents); 175 | } 176 | 177 | fn del_at(&mut self, pos: usize, len: usize) { 178 | self.delete(pos..pos+len) 179 | } 180 | 181 | fn to_string(&self) -> String { 182 | ToString::to_string(self) 183 | } 184 | 185 | fn char_len(&self) -> usize { 186 | self.byte_len() 187 | } 188 | } 189 | 190 | use std::os::raw::c_char; 191 | use std::ffi::CString; 192 | use crdt_testdata::{load_testing_data, TestData}; 193 | use criterion::measurement::WallTime; 194 | 195 | #[repr(C)] 196 | struct CRopeRaw { _unused : [ u8 ; 0 ] } 197 | 198 | extern { 199 | fn rope_new() -> *mut CRopeRaw; 200 | fn rope_new_with_utf8(s: *const c_char) -> *mut CRopeRaw; 201 | fn rope_free(r: *mut CRopeRaw); 202 | fn rope_char_count(r: *const CRopeRaw) -> usize; 203 | // fn rope_byte_count(r: *const CRopeRaw) -> usize; 204 | 205 | fn rope_insert(r: *mut CRopeRaw, pos: usize, s: *const c_char) -> u32; 206 | fn rope_del(r: *mut CRopeRaw, pos: usize, len: usize); // rope.h declares rope_del as returning void, so there is no return value to read. 207 | } 208 | 209 | struct CRope(*mut CRopeRaw); 210 | impl Rope for CRope { 211 | const NAME: &'static str = "C-JumpRope"; 212 | 213 | #[inline(always)] 214 | fn new() -> Self { unsafe { CRope(rope_new()) } } 215 | 216 | #[inline(always)] 217 | fn insert_at(&mut self, pos: usize, contents: &str) { 218 | unsafe { 219 | let cstr = CString::new(contents).unwrap(); 220 | rope_insert(self.0, pos, cstr.as_ptr()); 221 | } 222 | } 223 | #[inline(always)] 224 | fn del_at(&mut self, pos: usize, len: usize) { 225 | unsafe { rope_del(self.0, pos, len); } 226 | } 227 | fn to_string(&self) -> String { unimplemented!() } 228 | 229 | #[inline(always)] 230 | fn char_len(&self) -> usize { unsafe { rope_char_count(self.0) } } // in unicode values 231 | } 232 | impl Drop for CRope { 233 | fn drop(&mut self) { 234 | unsafe { rope_free(self.0); } 235 | } 236 | } 237 | impl From for CRope { 238 | fn from(s: String) -> Self { 239 | let cstr = CString::new(s).unwrap(); 240 | CRope(unsafe {
rope_new_with_utf8(cstr.as_ptr()) }) 241 | } 242 | } 243 | 244 | #[test] 245 | fn foo() { 246 | unsafe { 247 | let r = rope_new(); 248 | println!("size {}", rope_char_count(r)); 249 | } 250 | } 251 | 252 | fn gen_strings(rng: &mut SmallRng) -> Vec { 253 | // I wish there was a better syntax for just making an array here. 254 | let mut strings = Vec::::new(); 255 | for _ in 0..100 { 256 | let len = rng.gen_range(1 .. 3); 257 | strings.push(random_ascii_string(rng, len)); 258 | } 259 | 260 | strings 261 | } 262 | 263 | fn ins_append(b: &mut Bencher) { 264 | let mut rng = SmallRng::seed_from_u64(123); 265 | let strings = gen_strings(&mut rng); 266 | 267 | let mut r = R::new(); 268 | let mut len = 0; 269 | b.iter(|| { 270 | // let pos = rng.gen_range(0, len+1); 271 | let text = &strings[rng.gen_range(0 .. strings.len())]; 272 | r.insert_at(len, text.as_str()); 273 | len += text.chars().count(); 274 | }); 275 | 276 | black_box(r.char_len()); 277 | } 278 | 279 | fn ins_random(b: &mut Bencher) { 280 | let mut rng = SmallRng::seed_from_u64(123); 281 | let strings = gen_strings(&mut rng); 282 | 283 | let mut r = R::new(); 284 | // Len isn't needed, but its here to allow direct comparison with ins_append. 285 | let mut len = 0; 286 | b.iter(|| { 287 | let pos = rng.gen_range(0 .. len+1); 288 | let text = &strings[rng.gen_range(0 .. strings.len())]; 289 | r.insert_at(pos, text.as_str()); 290 | len += text.chars().count(); 291 | }); 292 | 293 | black_box(r.char_len()); 294 | black_box(len); 295 | } 296 | 297 | fn stable_ins_del>(b: &mut Bencher, target_length: &u64) { 298 | let target_length = *target_length as usize; 299 | let mut rng = SmallRng::seed_from_u64(123); 300 | 301 | // I wish there was a better syntax for just making an array here. 302 | let strings = gen_strings(&mut rng); 303 | 304 | // let target_length = 100000; 305 | // let mut r = R::new(); 306 | // while r.char_len() < target_length { 307 | // // The rope should be a hot mess. 
308 | // let pos = rng.gen_range(0, r.char_len()+1); 309 | // r.insert_at(pos, strings[rng.gen_range(0, strings.len())].as_str()).unwrap(); 310 | // } 311 | let mut r = R::from(random_ascii_string(&mut rng, target_length)); 312 | let mut len = target_length; 313 | 314 | b.iter(|| { 315 | // let len = r.char_len(); 316 | // if len == 0 || rng.gen::() { 317 | if len <= target_length { 318 | // Insert 319 | let pos = rng.gen_range(0 .. len+1); 320 | let text = &strings[rng.gen_range(0 .. strings.len())]; 321 | r.insert_at(pos, text.as_str()); 322 | len += text.chars().count(); 323 | } else { 324 | // Delete 325 | let pos = rng.gen_range(0 .. len); 326 | let dlen = min(rng.gen_range(0 .. 10), len - pos); 327 | len -= dlen; 328 | 329 | r.del_at(pos, dlen); 330 | } 331 | }); 332 | 333 | // Return something based on the computation to avoid it being optimized 334 | // out. Although right now the compiler isn't smart enough for that 335 | // anyway. 336 | // r.len() 337 | black_box(r.char_len()); 338 | } 339 | 340 | #[allow(unused)] 341 | fn bench_ins_append(c: &mut Criterion) { 342 | let mut group = c.benchmark_group("ins_append"); 343 | 344 | group.bench_function("jumprope", ins_append::); 345 | group.bench_function("ropey", ins_append::); 346 | // group.bench_function("anrope", ins_append::); 347 | group.bench_function("xirope", ins_append::); 348 | group.bench_function("jumprope_c", ins_append::); 349 | group.bench_function("raw_string", ins_append::); 350 | group.finish(); 351 | } 352 | 353 | #[allow(unused)] 354 | fn bench_ins_random(c: &mut Criterion) { 355 | let mut group = c.benchmark_group("ins_random"); 356 | 357 | group.bench_function("jumprope", ins_random::); 358 | group.bench_function("ropey", ins_random::); 359 | // group.bench_function("anrope", ins_random::); 360 | group.bench_function("xirope", ins_random::); 361 | group.bench_function("jumprope_c", ins_random::); 362 | group.bench_function("raw_string", ins_random::); 363 | group.finish(); 364 | } 365 | 
366 | #[allow(unused)] 367 | fn bench_stable_ins_del(c: &mut Criterion) { 368 | let mut group = c.benchmark_group("stable_ins_del"); 369 | 370 | for size in [1000, 10000, 100000, 1000000, 10000000].iter() { 371 | group.throughput(Throughput::Elements(*size)); 372 | group.bench_with_input(BenchmarkId::new("jumprope", size), size, stable_ins_del::); 373 | group.bench_with_input(BenchmarkId::new("ropey", size), size, stable_ins_del::); 374 | // group.bench_with_input(BenchmarkId::new("anrope", size), size, stable_ins_del::); 375 | group.bench_with_input(BenchmarkId::new("xirope", size), size, stable_ins_del::); 376 | group.bench_with_input(BenchmarkId::new("jumprope_c", size), size, stable_ins_del::); 377 | } 378 | group.finish(); 379 | } 380 | 381 | fn load_named_data(name: &str) -> TestData { 382 | let filename = format!("{}/../benchmark_data/{}.json.gz", env!("CARGO_MANIFEST_DIR"), name); // Datasets live in this repository's benchmark_data/ directory, one level above this crate. 383 | load_testing_data(&filename) 384 | } 385 | 386 | // const DATASETS: &[&str] = &["automerge-paper"]; 387 | const DATASETS: &[&str] = &["automerge-paper", "rustcode", "sveltecomponent", "seph-blog1"]; 388 | 389 | fn realworld(c: &mut Criterion) { 390 | for name in DATASETS { 391 | let mut group = c.benchmark_group("realworld"); 392 | let test_data_chars = load_named_data(name); 393 | group.throughput(Throughput::Elements(test_data_chars.len() as u64)); 394 | let test_data_bytes = test_data_chars.chars_to_bytes(); 395 | 396 | let mut all_ascii = true; 397 | for txn in &test_data_chars.txns { 398 | for TestPatch(_pos, _del, ins) in &txn.patches { 399 | if ins.chars().count() != ins.len() { all_ascii = false; } 400 | } 401 | } 402 | 403 | fn x(group: &mut BenchmarkGroup, name: &str, test_data: &TestData) { 404 | assert_eq!(R::EDITS_USE_BYTE_OFFSETS, test_data.using_byte_positions); 405 | 406 | group.bench_function(BenchmarkId::new(R::NAME, name), |b| { 407 | b.iter(|| { 408 | let mut r = R::new(); 409 | for txn in &test_data.txns { 410 | for TestPatch(pos, del, ins) in &txn.patches {
411 | r.edit_at(*pos, *del, ins); 412 | } 413 | } 414 | assert_eq!(r.char_len(), test_data.end_content.len()); 415 | black_box(r.char_len()); 416 | }) 417 | }); 418 | } 419 | 420 | x::(&mut group, name, &test_data_chars); 421 | x::(&mut group, name, &test_data_chars); 422 | x::(&mut group, name, &test_data_chars); 423 | x::(&mut group, name, &test_data_chars); 424 | x::(&mut group, name, &test_data_bytes); 425 | 426 | // These two crash on non-ascii characters for some reason. 427 | if all_ascii { 428 | // Extremely slow. 429 | x::(&mut group, name, &test_data_chars); 430 | 431 | // Crashes. 432 | // x::(&mut group, name, &test_data); 433 | } 434 | 435 | // This takes a long time to run. 436 | // x::(&mut group, name, &test_data); 437 | 438 | group.finish(); 439 | } 440 | } 441 | 442 | criterion_group!(benches, 443 | bench_ins_append, 444 | bench_ins_random, 445 | bench_stable_ins_del, 446 | realworld 447 | ); 448 | // criterion_group!(benches, bench_all); 449 | criterion_main!(benches); -------------------------------------------------------------------------------- /rope_benches/src/rope.rs: -------------------------------------------------------------------------------- 1 | // use std::ops::RangeBounds; 2 | 3 | // #[derive(Debug)] 4 | // pub enum RopeError { 5 | // PositionOutOfBounds, 6 | // } 7 | 8 | pub trait Rope: From { 9 | const NAME: &'static str; 10 | const EDITS_USE_BYTE_OFFSETS: bool = false; 11 | 12 | fn new() -> Self; 13 | 14 | fn insert_at(&mut self, pos: usize, contents: &str);// -> Result<(), RopeError>; 15 | fn del_at(&mut self, pos: usize, len: usize);// -> Result<(), RopeError>; 16 | fn edit_at(&mut self, pos: usize, del_len: usize, ins_content: &str) { 17 | if del_len > 0 { 18 | self.del_at(pos, del_len); 19 | } 20 | if !ins_content.is_empty() { 21 | self.insert_at(pos, ins_content); 22 | } 23 | } 24 | 25 | // fn del_at>(&mut self, range: R) -> Result<(), RopeError>; 26 | 27 | // fn slice(&self, pos: usize, len: usize) -> Result; 28 | 29 | fn 
to_string(&self) -> String; 30 | 31 | // fn len(&self) -> usize; // in bytes 32 | fn char_len(&self) -> usize; // in unicode values 33 | } -------------------------------------------------------------------------------- /rope_benches/table.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs') 2 | 3 | const datasets = ["automerge-paper", "rustcode", "sveltecomponent", "seph-blog1"] 4 | const algorithms = ['String', 'XiRope', 'Ropey', 'C-JumpRope', 'JumpRope'] 5 | 6 | console.log('| Dataset | Raw string | XiRope | Ropey | librope (C) | Jumprope |') 7 | console.log('|---------|------------|--------|-------|-------------|----------|') 8 | 9 | const roundN = n => Math.round(n * 100) / 100 10 | 11 | for (const ds of datasets) { 12 | const row = `${ds} | ` + algorithms.map(alg => { 13 | const filename = `../target/criterion/realworld/${alg}/${ds}/new/estimates.json` 14 | 15 | if (fs.existsSync(filename)) { 16 | const data = JSON.parse(fs.readFileSync(filename, 'utf8')).mean.point_estimate / 1e6 17 | return `${roundN(data)} ms` 18 | } else { 19 | return 'DNF' 20 | } 21 | }).join(' | ') 22 | 23 | console.log(row) 24 | } -------------------------------------------------------------------------------- /src/buffered.rs: -------------------------------------------------------------------------------- 1 | //! This module provides an optimized wrapper around a [`JumpRope`] struct which buffers incoming 2 | //! edits and applies them "all at once" when the rope is read. This makes access patterns involving 3 | //! replaying many small operations much faster (8x faster on some real world testing data). 4 | //! 5 | //! Using [`JumpRopeBuf`] instead of [`JumpRope`] directly is equivalent to using a 6 | //! [`BufWriter`](std::io::BufWriter) to write to a file / stream. 7 | //! 8 | //! This API should be almost identical with JumpRope, but I've probably forgotten a few methods. 9 | //! 
If you find some useful methods which are missing, please file issues and I can add them 10 | //! explicitly to the wrapper. You can also use `rope.borrow().read_method()` or 11 | //! `rope.as_mut().write_method()` as workarounds. 12 | //! 13 | //! Internally, JumpRopeBuf stores incoming writes in a write buffer before applying them. Adjacent 14 | //! edits can be merged before the skip list is edited, which reduces the need for (relatively) 15 | //! more expensive skip list lookups. 16 | //! 17 | //! ## Caveats: 18 | //! 19 | //! - [`JumpRopeBuf`] uses a RefCell internally. As a result, it does not expose a &JumpRope 20 | //! directly. 21 | //! - Use of the RefCell means JumpRopeBuf is [`Send`](std::marker::Send) but not [`Sync`](std::marker::Sync). 22 | 23 | 24 | #[derive(Debug, Clone, Copy)] 25 | enum Kind { Ins, Del } 26 | 27 | use std::cell::{Ref, RefCell}; 28 | use std::fmt::{Debug, Display, Formatter}; 29 | use std::ops::{Deref, DerefMut, Range}; 30 | use Op::*; 31 | use crate::fast_str_tools::{char_to_byte_idx, count_chars}; 32 | use crate::JumpRope; 33 | 34 | /// This struct provides an optimized wrapper around JumpRope which buffers adjacent incoming writes 35 | /// before forwarding them to the underlying JumpRope. 36 | /// 37 | /// Most of the overhead of writing to a rope comes from finding the edit location in the rope and 38 | /// bookkeeping. Because text editing operations are usually sequential, by aggregating adjacent 39 | /// editing operations together we can amortize the cost of updating the underlying data structure 40 | /// itself. This improves performance by about 10x compared to inserting and deleting individual 41 | /// characters. 42 | /// 43 | /// There is nothing jumprope-specific in this library. It could easily be adapted to wrap other 44 | /// rope libraries (like Ropey) too. 45 | /// 46 | /// This API is stable and enabled by default. The `buffered` feature flag is no longer needed and has no effect.
47 | pub struct JumpRopeBuf(RefCell<(JumpRope, BufferedOp)>); 48 | 49 | #[derive(Debug, Clone)] 50 | struct BufferedOp { 51 | kind: Kind, 52 | // Always empty for deletes. 53 | ins_content: String, 54 | range: Range, 55 | } 56 | 57 | #[derive(Debug, Clone, Copy)] 58 | enum Op<'a> { 59 | Ins(usize, &'a str), 60 | Del(usize, usize), // start, end. 61 | } 62 | 63 | impl BufferedOp { 64 | fn new() -> Self { 65 | Self { 66 | kind: Kind::Ins, 67 | ins_content: "".to_string(), 68 | range: Range::default(), 69 | } 70 | } 71 | 72 | fn is_empty(&self) -> bool { 73 | // self.len == 0 74 | self.range.is_empty() 75 | } 76 | 77 | /// Length of the inserted / deleted section 78 | fn len(&self) -> usize { 79 | self.range.len() 80 | } 81 | 82 | fn clear(&mut self) { 83 | // We don't care about the tag. 84 | self.ins_content.clear(); 85 | self.range = Range::default(); 86 | } 87 | 88 | fn try_append(&mut self, op: Op) -> Result<(), ()> { 89 | if self.is_empty() { 90 | // Just set to op. 91 | match op { 92 | // I'm setting fields individually here rather than implementing From or 93 | // BufferedOp so we can reuse the allocation in self.ins_content. 94 | Ins(pos, content) => { 95 | self.kind = Kind::Ins; 96 | self.ins_content.push_str(content); 97 | self.range.start = pos; 98 | self.range.end = pos + count_chars(content); 99 | } 100 | Del(start, end) => { 101 | self.kind = Kind::Del; 102 | debug_assert!(self.ins_content.is_empty()); 103 | self.range = start..end; 104 | } 105 | } 106 | Ok(()) 107 | } else { 108 | match (self.kind, op) { 109 | (Kind::Ins, Op::Ins(pos, content)) if pos == self.range.end => { 110 | // The new insert is at the end of the buffered op. 111 | self.ins_content.push_str(content); 112 | self.range.end += count_chars(content); 113 | Ok(()) 114 | } 115 | (Kind::Ins, Op::Del(start, end)) if end == self.range.end && start >= self.range.start => { 116 | // We can merge if the delete trims the end of the insert. 
There's more complex 117 | // trimming we could do here, but anything too complex and we may as well just 118 | // let the rope handle it. 119 | if start == self.range.start { 120 | // Discard our local insert. 121 | self.ins_content.clear(); 122 | self.range.end = self.range.start; 123 | Ok(()) 124 | } else { 125 | // Trim from the end. 126 | let char_offset = start - self.range.start; 127 | 128 | let byte_offset = if self.range.len() == self.ins_content.len() { 129 | // If its all ascii, char offset == byte offset. 130 | char_offset 131 | } else { 132 | // TODO: Come up with a better way to calculate this. 133 | char_to_byte_idx(self.ins_content.as_str(), char_offset) 134 | }; 135 | 136 | self.range.end = start; 137 | self.ins_content.truncate(byte_offset); 138 | Ok(()) 139 | } 140 | } 141 | (Kind::Del, Op::Del(start, end)) if start <= self.range.start && end >= self.range.start => { 142 | // We can merge if our delete is inside the operation. 143 | // let self_len = self.range.len(); 144 | // dbg!(&self.range, (start, end)); 145 | self.range.end += end - self.range.start; 146 | self.range.start = start; 147 | Ok(()) 148 | } 149 | (_, _) => Err(()), 150 | } 151 | } 152 | } 153 | } 154 | 155 | impl From for JumpRopeBuf { 156 | fn from(rope: JumpRope) -> Self { 157 | Self::with_rope(rope) 158 | } 159 | } 160 | 161 | impl JumpRopeBuf { 162 | pub fn with_rope(rope: JumpRope) -> Self { 163 | Self(RefCell::new((rope, BufferedOp::new()))) 164 | } 165 | 166 | pub fn new() -> Self { 167 | Self::with_rope(JumpRope::new()) 168 | } 169 | 170 | pub fn new_from_str(s: &str) -> Self { 171 | Self::with_rope(JumpRope::from(s)) 172 | } 173 | 174 | fn flush_mut(inner: &mut (JumpRope, BufferedOp)) { 175 | if !inner.1.is_empty() { 176 | match inner.1.kind { 177 | Kind::Ins => { 178 | inner.0.insert(inner.1.range.start, &inner.1.ins_content); 179 | }, 180 | Kind::Del => { 181 | inner.0.remove(inner.1.range.clone()); 182 | } 183 | } 184 | inner.1.clear(); 185 | } 186 | } 187 | 188 | // 
fn flush(&self) { 189 | // let mut inner = self.0.borrow_mut(); 190 | // Self::flush_mut(inner.deref_mut()); 191 | // } 192 | 193 | fn internal_push_op(&mut self, op: Op) { 194 | // let mut inner = self.0.borrow_mut(); 195 | let inner = self.0.get_mut(); 196 | match inner.1.try_append(op) { 197 | Ok(_) => {} 198 | Err(_) => { 199 | // Self::flush_mut(inner.deref_mut()); 200 | Self::flush_mut(inner); 201 | // inner.0.insert(pos, content); 202 | inner.1.try_append(op).unwrap(); 203 | } 204 | } 205 | } 206 | 207 | /// Insert new content into the rope at the specified position. This method is semantically 208 | /// equivalent to [`JumpRope::insert`](JumpRope::insert). The only difference is that here we 209 | /// buffer the incoming edit. 210 | pub fn insert(&mut self, pos: usize, content: &str) { 211 | self.internal_push_op(Op::Ins(pos, content)) 212 | } 213 | 214 | /// Remove content from the rope at the specified position. This method is semantically 215 | /// equivalent to [`JumpRope::remove`](JumpRope::insert). The only difference is that here we 216 | /// buffer the incoming remove operation. 217 | pub fn remove(&mut self, range: Range) { 218 | self.internal_push_op(Op::Del(range.start, range.end)) 219 | } 220 | 221 | // TODO: Replace! 222 | 223 | /// Return the length of the rope in unicode characters. Note this is not the same as either 224 | /// the number of bytes the characters take, or the number of grapheme clusters in the string. 225 | /// 226 | /// This method returns the length in constant-time (*O(1)*). 227 | pub fn len_chars(&self) -> usize { 228 | let borrow = self.0.borrow(); 229 | match borrow.1.kind { 230 | Kind::Ins => borrow.0.len_chars() + borrow.1.range.len(), 231 | Kind::Del => borrow.0.len_chars() - borrow.1.range.len() 232 | } 233 | } 234 | 235 | /// Get the number of bytes used for the UTF8 representation of the rope. This will always match 236 | /// the .len() property of the equivalent String. 
237 | pub fn len_bytes(&self) -> usize { 238 | let mut borrow = self.0.borrow_mut(); 239 | match borrow.1.kind { 240 | Kind::Ins => borrow.0.len_bytes() + borrow.1.ins_content.len(), 241 | Kind::Del => { 242 | // Unfortunately we have to flush to calculate byte length. 243 | Self::flush_mut(borrow.deref_mut()); 244 | borrow.0.len_bytes() 245 | } 246 | } 247 | } 248 | 249 | pub fn is_empty(&self) -> bool { 250 | let borrow = self.0.borrow(); 251 | let len_chars = borrow.0.len_chars(); 252 | match borrow.1.kind { 253 | Kind::Ins => len_chars == 0 && borrow.1.is_empty(), 254 | Kind::Del => len_chars - borrow.1.len() == 0, 255 | } 256 | } 257 | 258 | /// Consume the JumpRopeBuf, flush any buffered operations and return the contained JumpRope. 259 | pub fn into_inner(self) -> JumpRope { 260 | let mut contents = self.0.into_inner(); 261 | Self::flush_mut(&mut contents); 262 | contents.0 263 | } 264 | 265 | /// Flush changes into the rope and return a borrowed reference to the rope itself. This makes 266 | /// it easy to call any methods on the underlying rope which aren't already exposed through the 267 | /// buffered API. 268 | /// 269 | /// # Panics 270 | /// 271 | /// borrow panics if the value is currently borrowed already. 272 | pub fn borrow(&self) -> Ref<'_, JumpRope> { 273 | let mut borrow = self.0.borrow_mut(); 274 | Self::flush_mut(borrow.deref_mut()); 275 | drop(borrow); 276 | // This method could provide &mut access to the rope via the cell, but I think thats a bad 277 | // idea. 278 | Ref::map(self.0.borrow(), |(rope, _)| rope) 279 | } 280 | 281 | fn eq_str(&self, s: &str) -> bool { 282 | self.borrow().deref().eq(s) 283 | } 284 | } 285 | 286 | impl AsMut for JumpRopeBuf { 287 | /// Flush changes into the rope and mutably borrow the rope. 
288 | fn as_mut(&mut self) -> &mut JumpRope { 289 | let inner = self.0.get_mut(); 290 | Self::flush_mut(inner); 291 | &mut inner.0 292 | } 293 | } 294 | 295 | impl Default for JumpRopeBuf { 296 | fn default() -> Self { 297 | JumpRopeBuf::new() 298 | } 299 | } 300 | 301 | impl Debug for JumpRopeBuf { 302 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 303 | let inner = self.0.borrow(); 304 | f.debug_struct("BufferedRope") 305 | .field("op", &inner.1) 306 | .field("rope", &inner.0) 307 | .finish() 308 | } 309 | } 310 | 311 | impl Display for JumpRopeBuf { 312 | fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { 313 | for s in self.borrow().substrings() { 314 | f.write_str(s)?; 315 | } 316 | Ok(()) 317 | } 318 | } 319 | 320 | impl Clone for JumpRopeBuf { 321 | fn clone(&self) -> Self { 322 | let inner = self.0.borrow(); 323 | Self(RefCell::new((inner.0.clone(), inner.1.clone()))) 324 | } 325 | } 326 | 327 | impl> From for JumpRopeBuf { 328 | fn from(str: S) -> Self { 329 | JumpRopeBuf::new_from_str(str.as_ref()) 330 | } 331 | } 332 | 333 | impl> PartialEq for JumpRopeBuf { 334 | fn eq(&self, other: &T) -> bool { 335 | self.eq_str(other.as_ref()) 336 | } 337 | } 338 | 339 | // Needed for assert_eq!(&rope, "Hi there"); 340 | impl PartialEq for JumpRopeBuf { 341 | fn eq(&self, other: &str) -> bool { 342 | self.eq_str(other) 343 | } 344 | } 345 | 346 | // Needed for assert_eq!(&rope, String::from("Hi there")); 347 | impl PartialEq for &JumpRopeBuf { 348 | fn eq(&self, other: &String) -> bool { 349 | self.eq_str(other.as_str()) 350 | } 351 | } 352 | 353 | impl PartialEq for JumpRopeBuf { 354 | fn eq(&self, other: &JumpRope) -> bool { 355 | self.borrow().eq(other) 356 | } 357 | } 358 | 359 | impl PartialEq for JumpRopeBuf { 360 | fn eq(&self, other: &JumpRopeBuf) -> bool { 361 | // This check is important because we can't borrow the Cell twice at runtime. 
362 | std::ptr::eq(self as *const _, other as *const _) 363 | || self.borrow().eq(other.borrow().deref()) 364 | } 365 | } 366 | 367 | impl Eq for JumpRopeBuf {} 368 | 369 | #[cfg(test)] 370 | mod test { 371 | use crate::JumpRopeBuf; 372 | 373 | // TODO: This could probably use more specific tests. JumpRopeBuf is currently thoroughly 374 | // tested more deeply by a fuzzer, but it'd be good to have more tests here. 375 | 376 | #[test] 377 | fn is_empty() { 378 | let mut r = JumpRopeBuf::new(); 379 | assert!(r.is_empty()); 380 | 381 | r.insert(0, "hi"); 382 | assert!(!r.is_empty()); 383 | 384 | // Force the rope to be flushed. 385 | r.borrow(); 386 | 387 | r.remove(0..2); 388 | assert!(r.is_empty()); 389 | } 390 | 391 | #[test] 392 | fn eq_reflexive() { 393 | // This was a regression. 394 | let r = JumpRopeBuf::new(); 395 | assert_eq!(r, r); 396 | } 397 | } 398 | -------------------------------------------------------------------------------- /src/fast_str_tools.rs: -------------------------------------------------------------------------------- 1 | //! Utility functions for utf8 string slices. 2 | //! 3 | //! This file mostly defers to str_indicies but overrides some methods because the compiler is 4 | //! smart. 5 | 6 | /// Converts from byte-index to char-index in a string slice. 7 | /// 8 | /// If the byte is in the middle of a multi-byte char, returns the index of 9 | /// the char that the byte belongs to. 10 | /// 11 | /// Any past-the-end index will return the one-past-the-end char index. 12 | /// 13 | /// Runs in O(N) time. 14 | #[inline] 15 | #[allow(unused)] 16 | pub fn byte_to_char_idx(text: &str, byte_idx: usize) -> usize { 17 | let count = count_chars_in_bytes(&text.as_bytes()[0..(byte_idx + 1).min(text.len())]); 18 | if byte_idx < text.len() { 19 | count - 1 20 | } else { 21 | count 22 | } 23 | } 24 | 25 | /// Converts from char-index to byte-index in a string slice. 26 | /// 27 | /// Any past-the-end index will return the one-past-the-end byte index. 
28 | /// 29 | /// Runs in O(N) time. 30 | #[inline] 31 | pub fn char_to_byte_idx(text: &str, char_idx: usize) -> usize { 32 | if cfg!(not(miri)) { 33 | str_indices::chars::to_byte_idx(text, char_idx) 34 | } else { 35 | // Naive version. 36 | let mut byte_count = 0; 37 | let mut char_count = 0; 38 | 39 | let mut i = 0; 40 | let text = text.as_bytes(); 41 | while i < text.len() && char_count <= char_idx { 42 | char_count += ((text[i] & 0xC0) != 0x80) as usize; 43 | i += 1; 44 | } 45 | byte_count += i; 46 | 47 | if byte_count == text.len() && char_count <= char_idx { 48 | byte_count 49 | } else { 50 | byte_count - 1 51 | } 52 | } 53 | } 54 | 55 | // #[allow(unused)] 56 | // #[inline(always)] 57 | // fn char_to_byte_idx_naive(text: &[u8], char_idx: usize) -> usize { 58 | // let mut byte_count = 0; 59 | // let mut char_count = 0; 60 | // 61 | // let mut i = 0; 62 | // while i < text.len() && char_count <= char_idx { 63 | // char_count += ((text[i] & 0xC0) != 0x80) as usize; 64 | // i += 1; 65 | // } 66 | // byte_count += i; 67 | // 68 | // if byte_count == text.len() && char_count <= char_idx { 69 | // byte_count 70 | // } else { 71 | // byte_count - 1 72 | // } 73 | // } 74 | 75 | /// Counts the utf16 surrogate pairs that would be in `text` if it were encoded 76 | /// as utf16. 77 | #[inline] 78 | pub(crate) fn count_utf16_surrogates(text: &str) -> usize { 79 | unsafe { count_utf16_surrogates_in_bytes(text.as_bytes()) } 80 | } 81 | 82 | /// SAFETY: Passed text array must be a valid UTF8 string. This will not be checked at runtime. 
83 | #[inline] 84 | pub(crate) unsafe fn count_utf16_surrogates_in_bytes(text: &[u8]) -> usize { 85 | if cfg!(miri) { 86 | // Naive version 87 | let mut utf16_surrogate_count = 0; 88 | 89 | for byte in text.iter() { 90 | utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize; 91 | } 92 | 93 | utf16_surrogate_count 94 | } else { 95 | str_indices::utf16::count_surrogates(std::str::from_utf8_unchecked(text)) 96 | } 97 | } 98 | 99 | // This is an alternate naive method which may make sense later. 100 | // #[inline] 101 | // #[allow(unused)] 102 | // pub(crate) fn count_utf16_surrogates_in_bytes_naive(text: &[u8]) -> usize { 103 | // let mut utf16_surrogate_count = 0; 104 | // 105 | // for byte in text.iter() { 106 | // utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize; 107 | // } 108 | // 109 | // utf16_surrogate_count 110 | // } 111 | 112 | #[inline(always)] 113 | #[allow(unused)] 114 | pub(crate) fn byte_to_utf16_surrogate_idx(text: &str, byte_idx: usize) -> usize { 115 | count_utf16_surrogates(&text[..byte_idx]) 116 | } 117 | 118 | #[inline(always)] 119 | #[allow(unused)] 120 | pub(crate) fn utf16_code_unit_to_char_idx(text: &str, utf16_idx: usize) -> usize { 121 | // TODO: optimized version. This is pretty slow. It isn't expected to be 122 | // used in performance critical functionality, so this isn't urgent. But 123 | // might as well make it faster when we get the chance. 124 | let mut char_i = 0; 125 | let mut utf16_i = 0; 126 | for c in text.chars() { 127 | if utf16_idx <= utf16_i { 128 | break; 129 | } 130 | char_i += 1; 131 | utf16_i += c.len_utf16(); 132 | } 133 | 134 | if utf16_idx < utf16_i { 135 | char_i -= 1; 136 | } 137 | 138 | char_i 139 | } 140 | 141 | //=========================================================================== 142 | // Internal 143 | //=========================================================================== 144 | 145 | /// Uses bit-fiddling magic to count utf8 chars really quickly. 
146 | /// We actually count the number of non-starting utf8 bytes, since 147 | /// they have a consistent starting two-bit pattern. We then 148 | /// subtract from the byte length of the text to get the final 149 | /// count. 150 | #[inline] 151 | #[allow(unused)] 152 | pub(crate) fn count_chars(text: &str) -> usize { 153 | count_chars_in_bytes(text.as_bytes()) 154 | } 155 | 156 | #[inline] 157 | pub(crate) fn count_chars_in_bytes(text: &[u8]) -> usize { 158 | if text.len() <= 1 { text.len() } 159 | else if !cfg!(miri) { 160 | unsafe { str_indices::chars::count(std::str::from_utf8_unchecked(text)) } 161 | } else { 162 | let mut inv_count = 0; 163 | for byte in text.iter() { 164 | inv_count += ((byte & 0xC0) != 0x80) as usize; 165 | } 166 | inv_count 167 | } 168 | } 169 | 170 | #[cfg(test)] 171 | mod tests { 172 | use super::*; 173 | 174 | // 124 bytes, 100 chars, 4 lines 175 | const TEXT_LINES: &str = "Hello there! How're you doing?\nIt's \ 176 | a fine day, isn't it?\nAren't you glad \ 177 | we're alive?\nこんにちは、みんなさん!"; 178 | 179 | #[test] 180 | fn count_chars_01() { 181 | let text = "Hello せかい! Hello せかい! Hello せかい! Hello せかい! 
Hello せかい!"; 182 | 183 | assert_eq!(54, count_chars(text)); 184 | } 185 | 186 | #[test] 187 | fn count_chars_02() { 188 | assert_eq!(100, count_chars(TEXT_LINES)); 189 | } 190 | 191 | #[test] 192 | fn byte_to_char_idx_01() { 193 | let text = "Hello せかい!"; 194 | assert_eq!(0, byte_to_char_idx(text, 0)); 195 | assert_eq!(1, byte_to_char_idx(text, 1)); 196 | assert_eq!(6, byte_to_char_idx(text, 6)); 197 | assert_eq!(6, byte_to_char_idx(text, 7)); 198 | assert_eq!(6, byte_to_char_idx(text, 8)); 199 | assert_eq!(7, byte_to_char_idx(text, 9)); 200 | assert_eq!(7, byte_to_char_idx(text, 10)); 201 | assert_eq!(7, byte_to_char_idx(text, 11)); 202 | assert_eq!(8, byte_to_char_idx(text, 12)); 203 | assert_eq!(8, byte_to_char_idx(text, 13)); 204 | assert_eq!(8, byte_to_char_idx(text, 14)); 205 | assert_eq!(9, byte_to_char_idx(text, 15)); 206 | assert_eq!(10, byte_to_char_idx(text, 16)); 207 | assert_eq!(10, byte_to_char_idx(text, 17)); 208 | assert_eq!(10, byte_to_char_idx(text, 18)); 209 | assert_eq!(10, byte_to_char_idx(text, 19)); 210 | } 211 | 212 | #[test] 213 | fn byte_to_char_idx_02() { 214 | let text = ""; 215 | assert_eq!(0, byte_to_char_idx(text, 0)); 216 | assert_eq!(0, byte_to_char_idx(text, 1)); 217 | 218 | let text = "h"; 219 | assert_eq!(0, byte_to_char_idx(text, 0)); 220 | assert_eq!(1, byte_to_char_idx(text, 1)); 221 | assert_eq!(1, byte_to_char_idx(text, 2)); 222 | 223 | let text = "hi"; 224 | assert_eq!(0, byte_to_char_idx(text, 0)); 225 | assert_eq!(1, byte_to_char_idx(text, 1)); 226 | assert_eq!(2, byte_to_char_idx(text, 2)); 227 | assert_eq!(2, byte_to_char_idx(text, 3)); 228 | } 229 | 230 | #[test] 231 | fn byte_to_char_idx_03() { 232 | let text = "せかい"; 233 | assert_eq!(0, byte_to_char_idx(text, 0)); 234 | assert_eq!(0, byte_to_char_idx(text, 1)); 235 | assert_eq!(0, byte_to_char_idx(text, 2)); 236 | assert_eq!(1, byte_to_char_idx(text, 3)); 237 | assert_eq!(1, byte_to_char_idx(text, 4)); 238 | assert_eq!(1, byte_to_char_idx(text, 5)); 239 | 
assert_eq!(2, byte_to_char_idx(text, 6)); 240 | assert_eq!(2, byte_to_char_idx(text, 7)); 241 | assert_eq!(2, byte_to_char_idx(text, 8)); 242 | assert_eq!(3, byte_to_char_idx(text, 9)); 243 | assert_eq!(3, byte_to_char_idx(text, 10)); 244 | assert_eq!(3, byte_to_char_idx(text, 11)); 245 | assert_eq!(3, byte_to_char_idx(text, 12)); 246 | } 247 | 248 | #[test] 249 | fn byte_to_char_idx_04() { 250 | // Ascii range 251 | for i in 0..88 { 252 | assert_eq!(i, byte_to_char_idx(TEXT_LINES, i)); 253 | } 254 | 255 | // Hiragana characters 256 | for i in 88..125 { 257 | assert_eq!(88 + ((i - 88) / 3), byte_to_char_idx(TEXT_LINES, i)); 258 | } 259 | 260 | // Past the end 261 | for i in 125..130 { 262 | assert_eq!(100, byte_to_char_idx(TEXT_LINES, i)); 263 | } 264 | } 265 | 266 | #[test] 267 | fn char_to_byte_idx_01() { 268 | let text = "Hello せかい!"; 269 | assert_eq!(0, char_to_byte_idx(text, 0)); 270 | assert_eq!(1, char_to_byte_idx(text, 1)); 271 | assert_eq!(2, char_to_byte_idx(text, 2)); 272 | assert_eq!(5, char_to_byte_idx(text, 5)); 273 | assert_eq!(6, char_to_byte_idx(text, 6)); 274 | assert_eq!(12, char_to_byte_idx(text, 8)); 275 | assert_eq!(15, char_to_byte_idx(text, 9)); 276 | assert_eq!(16, char_to_byte_idx(text, 10)); 277 | } 278 | 279 | #[test] 280 | fn char_to_byte_idx_02() { 281 | let text = "せかい"; 282 | assert_eq!(0, char_to_byte_idx(text, 0)); 283 | assert_eq!(3, char_to_byte_idx(text, 1)); 284 | assert_eq!(6, char_to_byte_idx(text, 2)); 285 | assert_eq!(9, char_to_byte_idx(text, 3)); 286 | } 287 | 288 | #[test] 289 | fn char_to_byte_idx_03() { 290 | let text = "Hello world!"; 291 | assert_eq!(0, char_to_byte_idx(text, 0)); 292 | assert_eq!(1, char_to_byte_idx(text, 1)); 293 | assert_eq!(8, char_to_byte_idx(text, 8)); 294 | assert_eq!(11, char_to_byte_idx(text, 11)); 295 | assert_eq!(12, char_to_byte_idx(text, 12)); 296 | } 297 | 298 | #[test] 299 | fn char_to_byte_idx_04() { 300 | let text = "Hello world! Hello せかい! Hello world! Hello せかい! 
\ 301 | Hello world! Hello せかい! Hello world! Hello せかい! \ 302 | Hello world! Hello せかい! Hello world! Hello せかい! \ 303 | Hello world! Hello せかい! Hello world! Hello せかい!"; 304 | assert_eq!(0, char_to_byte_idx(text, 0)); 305 | assert_eq!(30, char_to_byte_idx(text, 24)); 306 | assert_eq!(60, char_to_byte_idx(text, 48)); 307 | assert_eq!(90, char_to_byte_idx(text, 72)); 308 | assert_eq!(115, char_to_byte_idx(text, 93)); 309 | assert_eq!(120, char_to_byte_idx(text, 96)); 310 | assert_eq!(150, char_to_byte_idx(text, 120)); 311 | assert_eq!(180, char_to_byte_idx(text, 144)); 312 | assert_eq!(210, char_to_byte_idx(text, 168)); 313 | assert_eq!(239, char_to_byte_idx(text, 191)); 314 | } 315 | 316 | #[test] 317 | fn char_to_byte_idx_05() { 318 | // Ascii range 319 | for i in 0..88 { 320 | assert_eq!(i, char_to_byte_idx(TEXT_LINES, i)); 321 | } 322 | 323 | // Hiragana characters 324 | for i in 88..100 { 325 | assert_eq!(88 + ((i - 88) * 3), char_to_byte_idx(TEXT_LINES, i)); 326 | } 327 | 328 | // Past the end 329 | for i in 100..110 { 330 | assert_eq!(124, char_to_byte_idx(TEXT_LINES, i)); 331 | } 332 | } 333 | } 334 | -------------------------------------------------------------------------------- /src/gapbuffer.rs: -------------------------------------------------------------------------------- 1 | use crate::fast_str_tools::*; 2 | #[cfg(feature = "line_conversion")] 3 | use crate::utils::count_lines; 4 | use crate::utils::str_chars_to_bytes_rev; 5 | 6 | #[derive(Debug, Clone, Eq)] 7 | pub struct GapBuffer { 8 | data: [u8; LEN], 9 | 10 | pub(crate) gap_start_bytes: u16, 11 | pub(crate) gap_start_chars: u16, 12 | 13 | /// The number of UTF16 surrogate pairs before the gap. 
14 | #[cfg(feature = "wchar_conversion")] 15 | pub(crate) gap_start_surrogate_pairs: u16, 16 | 17 | /// The number of lines before the gap 18 | #[cfg(feature = "line_conversion")] 19 | pub(crate) gap_start_lines: u16, 20 | 21 | pub(crate) gap_len: u16, 22 | all_ascii: bool, 23 | } 24 | 25 | #[inline] 26 | unsafe fn slice_to_str(arr: &[u8]) -> &str { 27 | if cfg!(debug_assertions) { 28 | std::str::from_utf8(arr).unwrap() 29 | } else { 30 | std::str::from_utf8_unchecked(arr) 31 | } 32 | } 33 | 34 | impl GapBuffer { 35 | pub fn new() -> Self { 36 | Self { 37 | data: [0; LEN], 38 | gap_start_bytes: 0, 39 | gap_start_chars: 0, 40 | #[cfg(feature = "wchar_conversion")] 41 | gap_start_surrogate_pairs: 0, 42 | #[cfg(feature = "line_conversion")] 43 | gap_start_lines: 0, 44 | gap_len: LEN as u16, 45 | all_ascii: true, 46 | } 47 | } 48 | 49 | pub fn new_from_str(s: &str) -> Self { 50 | let mut val = Self::new(); 51 | val.try_insert(0, s).unwrap(); 52 | val 53 | } 54 | 55 | // #[allow(unused)] 56 | // pub fn len_space(&self) -> usize { 57 | // self.gap_len as usize 58 | // } 59 | 60 | /// In bytes. 
pub fn len_bytes(&self) -> usize {
    // Everything in `data` that is not gap is content.
    LEN - self.gap_len as usize
}

// #[allow(unused)]
// pub fn char_len(&self) -> usize {
//     count_chars(self.start_as_str()) + count_chars(self.end_as_str())
// }

/// True when the gap spans the entire buffer, i.e. no bytes are stored.
pub fn is_empty(&self) -> bool {
    self.gap_len as usize == LEN
}

/// Count the chars in `s`, skipping the scan when the buffer is flagged as all-ascii
/// (then chars == bytes).
fn count_internal_chars(&self, s: &str) -> usize {
    if self.all_ascii { s.len() } else { count_chars(s) }
}

/// Count the utf16 surrogate pairs in `s`; zero without scanning when the buffer is flagged
/// as all-ascii.
#[cfg(feature = "wchar_conversion")]
fn int_count_surrogate_pairs(&self, s: &str) -> usize {
    if self.all_ascii { 0 } else { count_utf16_surrogates(s) }
}

/// Convert a char offset within `s` into a byte offset (identity when all-ascii).
fn int_str_get_byte_offset(&self, s: &str, char_pos: usize) -> usize {
    if self.all_ascii { char_pos } else { char_to_byte_idx(s, char_pos) }
}
/// Convert a char count into a byte count via `str_chars_to_bytes_rev` (identity when
/// all-ascii). Used by `remove_chars` to measure chars immediately before the gap.
fn int_chars_to_bytes_backwards(&self, s: &str, char_len: usize) -> usize {
    if self.all_ascii { char_len } else { str_chars_to_bytes_rev(s, char_len) }
}

/// Move the gap so that it starts at byte offset `new_start_bytes`, shifting the bytes in
/// between to the other side of the gap and updating the before-the-gap counters.
pub fn move_gap(&mut self, new_start_bytes: usize) {
    let current_start = self.gap_start_bytes as usize;

    if new_start_bytes != current_start {
        let len = self.gap_len as usize;
        // The gap cannot start beyond the end of the stored content.
        debug_assert!(new_start_bytes <= LEN-len);

        #[allow(clippy::comparison_chain)]
        if new_start_bytes < current_start {
            // move characters to the right (gap to the left)
            let moved_chars = new_start_bytes..current_start;
            let s = unsafe { slice_to_str(&self.data[moved_chars.clone()]) };
            let char_len = self.count_internal_chars(s);

            // The moved text ends up after the gap, so it no longer counts towards the
            // before-the-gap totals.
            #[cfg(feature = "wchar_conversion")] {
                let surrogate_pairs = self.int_count_surrogate_pairs(s);
                self.gap_start_surrogate_pairs -= surrogate_pairs as u16;
            }

            #[cfg(feature = "line_conversion")] {
                self.gap_start_lines -= count_lines(s) as u16;
            }

            self.gap_start_chars -= char_len as u16;

            self.data.copy_within(moved_chars, new_start_bytes + len);
        } else if current_start < new_start_bytes {
            // Move characters to the left (gap to the right)
            let moved_chars = current_start+len..new_start_bytes +len;
            let s = unsafe { slice_to_str(&self.data[moved_chars.clone()]) };
            let char_len = self.count_internal_chars(s);

            // The moved text ends up before the gap, so add it to the before-the-gap totals.
            #[cfg(feature = "wchar_conversion")] {
                let surrogate_pairs = self.int_count_surrogate_pairs(s);
                self.gap_start_surrogate_pairs += surrogate_pairs as u16;
            }

            #[cfg(feature = "line_conversion")] {
                self.gap_start_lines += count_lines(s) as u16;
            }

            self.gap_start_chars += char_len as u16;

            self.data.copy_within(moved_chars, current_start);
        }

        if cfg!(debug_assertions) {
            // This is unnecessary but tidy, and makes debugging easier.
            self.data[new_start_bytes..new_start_bytes +len].fill(0);
        }

        self.gap_start_bytes = new_start_bytes as u16;
    }
}

/// Panics if there's no room. This inserts at the start of the gap (and moves the gap after the
/// inserted text).
pub fn insert_in_gap(&mut self, s: &str) {
    let len = s.len();
    let char_len = count_chars(s);
    assert!(len <= self.gap_len as usize);

    let start = self.gap_start_bytes as usize;
    self.data[start..start+len].copy_from_slice(s.as_bytes());
    self.gap_start_bytes += len as u16;
    self.gap_start_chars += char_len as u16;
    self.gap_len -= len as u16;

    // Surrogate pairs can only occur when `s` contains multi-byte chars (bytes != chars).
    #[cfg(feature = "wchar_conversion")]
    if len != char_len {
        self.gap_start_surrogate_pairs += count_utf16_surrogates(s) as u16;
    }

    #[cfg(feature = "line_conversion")] {
        self.gap_start_lines += count_lines(s) as u16;
    }

    // Once a multi-byte char has been inserted the ascii fast paths are no longer valid.
    if len != char_len { self.all_ascii = false; }
}

/// Insert `s` at byte offset `byte_pos`. Returns `Err(())`, leaving the buffer untouched,
/// when `s` is longer than the gap.
pub fn try_insert(&mut self, byte_pos: usize, s: &str) -> Result<(), ()> {
    let len = s.len();
    if len > self.gap_len as usize {
        // No space in this node!
        Result::Err(())
    } else {
        self.move_gap(byte_pos);
        self.insert_in_gap(s);
        Result::Ok(())
    }
}

/// Remove chars after the gap (ie, at gap .. gap+del_len)
///
/// `del_bytes` is a byte count; the removal is done by growing the gap over those bytes.
pub fn remove_after_gap(&mut self, del_bytes: usize) {
    if cfg!(debug_assertions) {
        // Zero out the deleted bytes in debug mode.
        self.data[
            (self.gap_start_bytes +self.gap_len) as usize..(self.gap_start_bytes +self.gap_len) as usize + del_bytes
        ].fill(0);
    }
    self.gap_len += del_bytes as u16;
}

// Returns the number of items actually removed.
// `pos` and `del_len` are in bytes here; `del_len` is clamped to the content after `pos`.
#[allow(unused)]
pub fn remove(&mut self, pos: usize, del_len: usize) -> usize {
    let len = self.len_bytes();

    if pos >= len { return 0; }
    let del_len = del_len.min(len - pos);

    self.move_gap(pos);

    self.remove_after_gap(del_len);
    del_len
}

/// Remove `del_len` chars starting at char position `pos`.
///
/// Returns the number of bytes removed.
pub fn remove_chars(&mut self, pos: usize, mut del_len: usize) -> usize {
    // This function is longer than it needs to be; but having it be a bit longer makes the
    // code faster. I think the trade-off is worth it.
    // self.move_gap(self.count_bytes(pos));
    // let removed_bytes = str_get_byte_offset(s.end_as_str(), del_len);
    // self.remove_at_gap(removed_bytes);
    // removed_bytes

    if del_len == 0 { return 0; }
    // NOTE(review): this compares a char count against a byte length, so it is only a loose
    // upper bound when the content is not all ascii.
    debug_assert!(del_len <= self.len_bytes() - pos);
    // Bytes removed from immediately before the gap (stays 0 unless the range reaches back
    // before the gap).
    let mut rm_start_bytes = 0;

    let gap_chars = self.gap_start_chars as usize;
    #[cfg(any(feature = "wchar_conversion", feature = "line_conversion"))]
    let gap_start_bytes = self.gap_start_bytes as usize;
    // Does the deleted range touch or straddle the gap? If so the gap does not need to move.
    if pos <= gap_chars && pos+del_len >= gap_chars {
        if pos < gap_chars {
            // Delete the bit from pos..gap.
            // TODO: It would be better to count backwards here.
            // let pos_bytes = str_get_byte_offset(self.start_as_str(), pos) as u16;
            // rm_start_bytes = self.gap_start_bytes - pos_bytes;
            rm_start_bytes = self.int_chars_to_bytes_backwards(self.start_as_str(), gap_chars - pos);

            #[cfg(feature = "wchar_conversion")]
            if !self.all_ascii {
                self.gap_start_surrogate_pairs -= unsafe {
                    count_utf16_surrogates_in_bytes(&self.data[gap_start_bytes - rm_start_bytes..gap_start_bytes]) as u16
                }
            }

            #[cfg(feature = "line_conversion")] {
                unsafe {
                    let s = std::str::from_utf8_unchecked(&self.data[gap_start_bytes - rm_start_bytes..gap_start_bytes]);
                    self.gap_start_lines -= count_lines(s) as u16;
                }
            }

            // Grow the gap backwards over the removed bytes; what is left of `del_len` lies
            // after the gap.
            del_len -= self.gap_start_chars as usize - pos;
            let rm_start_bytes = rm_start_bytes as u16;
            self.gap_len += rm_start_bytes;
            self.gap_start_chars = pos as u16;
            self.gap_start_bytes -= rm_start_bytes;
            // self.gap_start_bytes = pos_bytes;
            if del_len == 0 { return rm_start_bytes as usize; }
        }

        debug_assert!(del_len > 0);
        debug_assert!(pos >= self.gap_start_chars as usize);
    } else {
        // This is equivalent to self.count_bytes() (below), but for some reason manually
        // inlining it here results in both faster and smaller executables.
        let gap_bytes = if pos < gap_chars {
            self.int_str_get_byte_offset(self.start_as_str(), pos)
        } else {
            self.int_str_get_byte_offset(self.end_as_str(), pos - gap_chars) + self.gap_start_bytes as usize
        };
        self.move_gap(gap_bytes);
    }

    // At this point the gap is guaranteed to be directly after pos.
    let rm_end_bytes = self.int_str_get_byte_offset(self.end_as_str(), del_len);
    self.remove_after_gap(rm_end_bytes);
    rm_start_bytes as usize + rm_end_bytes
}

/// The content stored before the gap.
pub fn start_as_str(&self) -> &str {
    unsafe {
        slice_to_str(&self.data[0..self.gap_start_bytes as usize])
    }
}
/// The content stored after the gap.
pub fn end_as_str(&self) -> &str {
    unsafe {
        slice_to_str(&self.data[(self.gap_start_bytes +self.gap_len) as usize..LEN])
    }
}

/// Convert a char position in the content into a byte position in the content (the gap is
/// not included in either count).
pub fn count_bytes(&self, char_pos: usize) -> usize {
    if self.all_ascii { return char_pos; }

    let gap_chars = self.gap_start_chars as usize;
    let gap_bytes = self.gap_start_bytes as usize;
    // Clippy complains about this but if I swap to a match expression, performance drops by 1%.
    #[allow(clippy::comparison_chain)]
    if char_pos == gap_chars {
        gap_bytes
    } else if char_pos < gap_chars {
        self.int_str_get_byte_offset(self.start_as_str(), char_pos)
    } else { // char_pos > start_char_len.
        gap_bytes + self.int_str_get_byte_offset(self.end_as_str(), char_pos - gap_chars)
    }
}

/// Convert a position counted in wchars (utf16 code units) into a position counted in chars.
#[cfg(feature = "wchar_conversion")]
pub(crate) fn count_chars_in_wchars(&self, wchar_pos: usize) -> usize {
    if self.all_ascii { wchar_pos }
    else {
        let gap_chars = self.gap_start_chars as usize;
        let gap_pairs = self.gap_start_surrogate_pairs as usize;
        // Each surrogate pair before the gap takes one extra utf16 code unit.
        let gap_wchars = gap_chars + gap_pairs;

        if wchar_pos == gap_wchars {
            gap_chars
        } else if wchar_pos < gap_wchars {
            // In start.
            if self.gap_start_surrogate_pairs == 0 { wchar_pos }
            else {
                utf16_code_unit_to_char_idx(self.start_as_str(), wchar_pos)
            }
        } else {
            // In end.
            gap_chars + utf16_code_unit_to_char_idx(self.end_as_str(), wchar_pos - gap_wchars)
        }
    }
}

/// Calculate & return the number of surrogate pairs in `[0..char_pos]`
#[cfg(feature = "wchar_conversion")]
pub(crate) fn count_surrogate_pairs(&self, char_pos: usize) -> usize {
    if self.all_ascii {
        0
    } else {
        let gap_chars = self.gap_start_chars as usize;
        if char_pos == gap_chars {
            self.gap_start_surrogate_pairs as usize
        } else if char_pos < gap_chars {
            if self.gap_start_surrogate_pairs == 0 { 0 }
            else {
                let bytes = self.int_str_get_byte_offset(self.start_as_str(), char_pos);
                unsafe { count_utf16_surrogates_in_bytes(&self.data[..bytes]) }
            }
        } else {
            // Right stuff.
            let bytes = self.int_str_get_byte_offset(self.end_as_str(), char_pos - gap_chars);
            let base = (self.gap_start_bytes + self.gap_len) as usize;
            let slice = &self.data[base..base + bytes];
            unsafe { self.gap_start_surrogate_pairs as usize + count_utf16_surrogates_in_bytes(slice) }
        }
    }
}

/// Take the remaining contents in the gap buffer. Mark them as deleted, but return them.
/// This will leave those items non-zero, but that doesn't matter.
pub fn take_rest(&mut self) -> &str {
    let last_idx = (self.gap_start_bytes + self.gap_len) as usize;
    // Extend the gap to the end of the buffer; the bytes themselves are left in place so
    // they can still be returned.
    self.gap_len = LEN as u16 - self.gap_start_bytes;
    unsafe { slice_to_str(&self.data[last_idx..LEN]) }
}

/// Verify that the cached before-the-gap counters match the actual content before the gap.
/// Panics on any mismatch.
pub(crate) fn check(&self) {
    let char_len = count_chars(self.start_as_str());
    assert_eq!(char_len, self.gap_start_chars as usize);

    #[cfg(feature = "wchar_conversion")] {
        let pairs = count_utf16_surrogates(self.start_as_str());
        assert_eq!(pairs, self.gap_start_surrogate_pairs as usize);
    }

    #[cfg(feature = "line_conversion")] {
        let lines = count_lines(self.start_as_str());
        assert_eq!(lines, self.gap_start_lines as usize);
    }

    if self.all_ascii {
        // In ascii-only content every char is one byte and there are no surrogate pairs.
        assert_eq!(self.gap_start_bytes, self.gap_start_chars);
        #[cfg(feature = "wchar_conversion")] {
            assert_eq!(self.gap_start_surrogate_pairs, 0);
        }
    }
}
}

impl<const LEN: usize> ToString for GapBuffer<LEN> {
    /// Concatenate the content before and after the gap.
    fn to_string(&self) -> String {
        let mut result = String::with_capacity(self.len_bytes());
        result.push_str(self.start_as_str());
        result.push_str(self.end_as_str());
        result
    }
}

impl<const LEN: usize> PartialEq for GapBuffer<LEN> {
    // Eq is interesting because we need to ignore where the gap is.
    fn eq(&self, other: &Self) -> bool {
        // Equal gap lengths <=> equal content lengths, since both buffers hold LEN bytes.
        if self.gap_len != other.gap_len { return false; }
        // There's 3 sections to check:
        // - Before our gap
        // - The inter-gap part
        // - The last, common part.
        let (a, b) = if self.gap_start_bytes < other.gap_start_bytes {
            (self, other)
        } else {
            (other, self)
        };
        // a has its gap first (or the gaps are at the same position).
        let a_start = a.gap_start_bytes as usize;
        let b_start = b.gap_start_bytes as usize;
        let gap_len = a.gap_len as usize;

        // Section before the gaps
        if a.data[0..a_start] != b.data[0..a_start] { return false; }

        // Gappy bit: in `a` this content sits after the gap, in `b` before it.
        if a.data[a_start+gap_len..b_start+gap_len] != b.data[a_start..b_start] { return false; }

        // Last bit
        let end_idx = b_start + gap_len;
        a.data[end_idx..LEN] == b.data[end_idx..LEN]
    }
}

#[cfg(test)]
mod test {
    use crate::gapbuffer::GapBuffer;

    fn check_eq<const LEN: usize>(b: &GapBuffer<LEN>, s: &str) {
        assert_eq!(b.to_string(), s);
        assert_eq!(b.len_bytes(), s.len());
        assert_eq!(s.is_empty(), b.is_empty());
    }

    #[test]
    fn smoke_test() {
        let mut b = GapBuffer::<5>::new();

        b.try_insert(0, "hi").unwrap();
        b.try_insert(0, "x").unwrap(); // 'xhi'
        // b.move_gap(2);
        b.try_insert(2, "x").unwrap(); // 'xhxi'
        check_eq(&b, "xhxi");
    }

    #[test]
    fn remove() {
        let mut b = GapBuffer::<5>::new_from_str("hi");
        assert_eq!(b.remove(2, 2), 0);
        check_eq(&b, "hi");

        assert_eq!(b.remove(0, 1), 1);
        check_eq(&b, "i");

        assert_eq!(b.remove(0, 1000), 1);
        check_eq(&b, "");
    }

    #[test]
    fn eq() {
        let hi = GapBuffer::<5>::new_from_str("hi");
        let yo = GapBuffer::<5>::new_from_str("yo");
        assert_ne!(hi, yo);
        assert_eq!(hi, hi);

        let mut hi2 = GapBuffer::<5>::new_from_str("hi");
        hi2.move_gap(1);
        assert_eq!(hi, hi2);

        hi2.move_gap(0);
        assert_eq!(hi, hi2);
    }
}
--------------------------------------------------------------------------------
/src/iter.rs:
--------------------------------------------------------------------------------
use std::ops::Range;
use crate::jumprope::*;
use crate::utils::str_chars_to_bytes;

/// An iterator over chunks (nodes) in the list.
pub(crate) struct NodeIter<'a>(Option<&'a Node>);

impl<'a> Iterator for NodeIter<'a> {
    type Item = &'a Node;

    /// Yield the current node and step to the node its `next_ptr()` points at (if any).
    fn next(&mut self) -> Option<&'a Node> {
        let current = self.0?;
        // `next_ptr()` is dereferenced via `as_ref()`, which yields `None` for a null pointer.
        self.0 = unsafe { current.next_ptr().as_ref() };
        Some(current)
    }
}

/// A content iterator iterates over the strings in the rope
pub struct ContentIter<'a> {
    next: Option<&'a Node>,
    /// Are we at the start or the end of the gap buffer?
    at_start: bool,
}

impl<'a> ContentIter<'a> {
    /// Iterate over just the string slices, dropping the char counts.
    pub fn substrings(self) -> Substrings<'a> {
        Substrings(self)
    }

    /// Iterate over the individual chars.
    pub fn chars(self) -> Chars<'a> {
        self.into()
    }
}

impl<'a> Iterator for ContentIter<'a> {
    /// A string slice together with its length in chars.
    type Item = (&'a str, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // Each node is visited twice: once for the text before its gap, once for the text
        // after it. Empty halves are skipped.
        while let Some(n) = self.next {
            let s = if self.at_start {
                self.at_start = false;
                (n.str.start_as_str(), n.str.gap_start_chars as usize)
            } else {
                self.next = unsafe { n.next_ptr().as_ref() };
                self.at_start = true;
                (n.str.end_as_str(), n.num_chars() - n.str.gap_start_chars as usize)
            };

            if s.1 > 0 {
                return Some(s);
            }
        }

        None
    }
}

/// Iterator over the substrings in some content. This is just a hand-written
/// `.map(|(s, _len)| s)` iterator to make it possible to embed a jumprope iterator inside
/// another iterator.
63 | pub struct Substrings<'a, I: Iterator = ContentIter<'a>>(I); 64 | 65 | impl<'a, I: Iterator> Substrings<'a, I> { 66 | /// Convert this content into a string 67 | pub fn into_string(self) -> String { 68 | self.collect::() 69 | } 70 | } 71 | 72 | impl<'a, I: Iterator> Iterator for Substrings<'a, I> { 73 | type Item = &'a str; 74 | 75 | fn next(&mut self) -> Option { 76 | self.0.next().map(|(s, _)| s) 77 | } 78 | } 79 | 80 | /// Iterator over the individual characters in a rope (or rope slice). 81 | pub struct Chars<'a, I: Iterator = ContentIter<'a>> { 82 | inner: I, 83 | current: std::str::Chars<'a>, 84 | } 85 | 86 | impl<'a, I: Iterator> From for Chars<'a, I> { 87 | fn from(inner: I) -> Self { 88 | Self { 89 | inner, 90 | current: "".chars() 91 | } 92 | } 93 | } 94 | 95 | impl<'a, I: Iterator> Iterator for Chars<'a, I> { 96 | type Item = char; 97 | 98 | fn next(&mut self) -> Option { 99 | self.current.next().or_else(|| { 100 | self.current = self.inner.next()?.0.chars(); 101 | let next = self.current.next(); 102 | // None of the items returned from our inner iterator should be empty. 103 | debug_assert!(next.is_some()); 104 | next 105 | }) 106 | } 107 | } 108 | 109 | /// Iterate over a sub-range of the rope. 
110 | pub struct SliceIter<'a> { 111 | inner: ContentIter<'a>, 112 | skip: usize, 113 | take_len: usize, 114 | } 115 | 116 | pub type SubstringsInRange<'a> = Substrings<'a, SliceIter<'a>>; 117 | pub type CharsInRange<'a> = Chars<'a, SliceIter<'a>>; 118 | 119 | impl<'a> SliceIter<'a> { 120 | pub fn substrings(self) -> SubstringsInRange<'a> { 121 | Substrings(self) 122 | } 123 | 124 | pub fn chars(self) -> CharsInRange<'a> { 125 | self.into() 126 | } 127 | } 128 | 129 | impl<'a> Iterator for SliceIter<'a> { 130 | type Item = (&'a str, usize); 131 | 132 | fn next(&mut self) -> Option { 133 | if self.take_len == 0 { return None; } 134 | 135 | self.inner.next().map(|(mut s, mut char_len)| { 136 | if self.skip > 0 { 137 | let byte = str_chars_to_bytes(s, self.skip); 138 | assert!(byte < s.len()); 139 | 140 | s = &s[byte..]; 141 | char_len -= self.skip; 142 | self.skip = 0; 143 | } 144 | 145 | if self.take_len < char_len { 146 | let byte = str_chars_to_bytes(s, self.take_len); 147 | s = &s[0..byte]; 148 | char_len = self.take_len; 149 | } 150 | 151 | self.take_len -= char_len; 152 | 153 | (s, char_len) 154 | }) 155 | } 156 | } 157 | 158 | impl JumpRope { 159 | pub(crate) fn node_iter_at_start(&self) -> NodeIter { NodeIter(Some(&self.head)) } 160 | 161 | /// Iterate over the rope, visiting each substring in [`str`] chunks. Whenever possible, this is 162 | /// the best way for a program to read back the contents of a rope, because it avoids allocating 163 | /// memory or copying the characters themselves (as you get with .to_string() or .chars()). 164 | /// 165 | /// ## Stability Warning 166 | /// 167 | /// This iterator will always return all the characters in document order, but the particular 168 | /// way characters are grouped together is based on internal implementation details. Thus it 169 | /// might change in arbitrary ways at any time. Your application should not depend on the 170 | /// specifics of this chunking. 
171 | /// 172 | /// # Example 173 | /// 174 | /// ``` 175 | /// # use jumprope::*; 176 | /// let rope = JumpRope::from("oh hai"); 177 | /// let mut string = String::new(); 178 | /// for str in rope.substrings() { 179 | /// string.push_str(str); 180 | /// } 181 | /// assert_eq!(string, "oh hai"); 182 | /// ``` 183 | pub fn substrings(&self) -> Substrings<'_> { 184 | self.substrings_with_len().substrings() 185 | } 186 | 187 | /// Iterate over all substrings in the rope, but also yield the unicode character length for 188 | /// each item. A caller could obviously recalculate these lengths from the provided &str 189 | /// objects, but since the unicode lengths are known this allows small optimizations. 190 | /// 191 | /// The iterator yields pairs of (str, char_len). 192 | /// 193 | /// ## Stability Warning 194 | /// 195 | /// This iterator will always return all the characters in document order, but the particular 196 | /// way characters are grouped together is based on internal implementation details. Thus it 197 | /// might change in arbitrary ways at any time. Your application should not depend on the 198 | /// specifics of this chunking. 199 | /// 200 | /// # Example 201 | /// 202 | /// ``` 203 | /// # use jumprope::*; 204 | /// let rope = JumpRope::from("oh hai"); 205 | /// let mut string = String::new(); 206 | /// for (str, char_len) in rope.substrings_with_len() { 207 | /// assert_eq!(str.chars().count(), char_len); 208 | /// string.push_str(str); 209 | /// } 210 | /// assert_eq!(string, "oh hai"); 211 | /// ``` 212 | pub fn substrings_with_len(&self) -> ContentIter { 213 | ContentIter { 214 | next: Some(&self.head), 215 | at_start: true 216 | } 217 | } 218 | 219 | /// Get an iterator over all characters in the rope. 220 | /// 221 | /// In most cases this will be less efficient than using [`substrings`](Self::substrings) to 222 | /// iterate over all &str items contained in the rope. 
223 | /// 224 | /// # Example 225 | /// 226 | /// ``` 227 | /// # use jumprope::*; 228 | /// let rope = JumpRope::from("oh hai"); 229 | /// assert_eq!("oh hai", rope.chars().collect::()); 230 | /// ``` 231 | pub fn chars(&self) -> Chars { 232 | self.substrings_with_len().chars() 233 | } 234 | 235 | 236 | 237 | /// Iterate through all the substrings within the specified unicode character range in the 238 | /// document. 239 | /// 240 | /// # Example 241 | /// 242 | /// ``` 243 | /// # use jumprope::*; 244 | /// let rope = JumpRope::from("xxxGreetings!xxx"); 245 | /// let mut string = String::new(); 246 | /// for s in rope.slice_substrings(3..rope.len_chars() - 3) { 247 | /// string.push_str(s); 248 | /// } 249 | /// assert_eq!(string, "Greetings!"); 250 | /// ``` 251 | pub fn slice_substrings(&self, range: Range) -> SubstringsInRange { 252 | self.slice_substrings_with_len(range).substrings() 253 | } 254 | 255 | /// Iterate through chunks across a character range in the document. 256 | /// 257 | /// # Example 258 | /// 259 | /// ``` 260 | /// # use jumprope::*; 261 | /// let rope = JumpRope::from("xxxGreetings!xxx"); 262 | /// let mut string = String::new(); 263 | /// for (str, char_len) in rope.slice_substrings_with_len(3..rope.len_chars() - 3) { 264 | /// assert_eq!(str.chars().count(), char_len); 265 | /// string.push_str(str); 266 | /// } 267 | /// assert_eq!(string, "Greetings!"); 268 | /// ``` 269 | /// 270 | /// Or more simply: 271 | /// 272 | /// ``` 273 | /// # use jumprope::*; 274 | /// let rope = JumpRope::from("xxxGreetings!xxx"); 275 | /// let string = rope.slice_substrings_with_len(3..13).map(|(str, _len)| str).collect::(); 276 | /// assert_eq!(string, "Greetings!"); 277 | /// ``` 278 | pub fn slice_substrings_with_len(&self, range: Range) -> SliceIter { 279 | let cursor = self.read_cursor_at_char(range.start, false); 280 | let node_gap_start = cursor.node.str.gap_start_chars as usize; 281 | let local_pos = cursor.offset_chars; 282 | 283 | let 
(at_start, skip) = if local_pos >= node_gap_start { 284 | (false, local_pos - node_gap_start) 285 | } else { 286 | (true, local_pos) 287 | }; 288 | 289 | SliceIter { 290 | inner: ContentIter { 291 | next: Some(cursor.node), at_start 292 | }, 293 | skip, 294 | take_len: range.end - range.start 295 | } 296 | } 297 | 298 | /// Iterate through characters in the rope within the specified range. The range is specified 299 | /// using unicode characters, not bytes. 300 | /// 301 | /// # Example 302 | /// 303 | /// ``` 304 | /// # use jumprope::*; 305 | /// let rope = JumpRope::from("xxxGreetings!xxx"); 306 | /// 307 | /// assert_eq!("Greetings!", 308 | /// rope.slice_chars(3..rope.len_chars() - 3).collect::() 309 | /// ); 310 | /// ``` 311 | pub fn slice_chars(&self, range: Range) -> CharsInRange { 312 | self.slice_substrings_with_len(range).chars() 313 | } 314 | 315 | // We also have a to_string implementation from Display, but that doesn't provide size hints. 316 | pub fn to_string(&self) -> String { 317 | let mut result = String::with_capacity(self.len_bytes()); 318 | for s in self.substrings() { 319 | result.push_str(s); 320 | } 321 | result 322 | } 323 | } 324 | 325 | #[cfg(test)] 326 | mod tests { 327 | use crate::fast_str_tools::*; 328 | use crate::JumpRope; 329 | use crate::jumprope::NODE_STR_SIZE; 330 | 331 | fn check(rope: &JumpRope) { 332 | for (s, len) in rope.substrings_with_len() { 333 | assert_eq!(count_chars(s), len); 334 | assert_ne!(len, 0); // Returned items may not be empty. 335 | } 336 | 337 | for (s, len) in rope.slice_substrings_with_len(0..rope.len_chars()) { 338 | assert_eq!(count_chars(s), len); 339 | assert_ne!(len, 0); // Returned items may not be empty. 
340 | } 341 | 342 | assert_eq!(rope.substrings_with_len().chars().collect::(), rope.to_string()); 343 | assert_eq!(rope.chars().collect::(), rope.to_string()); 344 | assert_eq!(rope.slice_chars(0..rope.len_chars()).collect::(), rope.to_string()); 345 | 346 | let s = rope.to_string(); 347 | for start in 0..=rope.len_chars() { 348 | let iter = rope.slice_chars(start..rope.len_chars()); 349 | let str = iter.collect::(); 350 | 351 | let byte_start = char_to_byte_idx(&s, start); 352 | assert_eq!(str, &s[byte_start..]); 353 | } 354 | } 355 | 356 | #[test] 357 | fn iter_smoke_tests() { 358 | check(&JumpRope::new()); 359 | check(&JumpRope::from("hi there")); 360 | 361 | let mut rope = JumpRope::from("aaaa"); 362 | rope.insert(2, "b"); // This will force a gap. 363 | assert_eq!(rope.substrings_with_len().count(), 2); 364 | check(&rope); 365 | 366 | // Long enough that in debugging mode we'll spill into multiple items. 367 | let s = "XXXaaaaaaaaaaaaaaaaaaaaaaaaaaXXX"; 368 | let rope = JumpRope::from(s); 369 | assert!(rope.substrings_with_len().count() > 1); 370 | check(&rope); 371 | 372 | assert_eq!( 373 | rope.slice_substrings_with_len(3..s.len() - 3).chars().collect::(), 374 | &s[3..s.len() - 3] 375 | ); 376 | } 377 | 378 | #[test] 379 | fn iter_non_ascii() { 380 | check(&JumpRope::from("κό𝕐𝕆😘σμε")); 381 | } 382 | 383 | #[test] 384 | fn iter_chars_tricky() { 385 | let mut rope = JumpRope::new(); 386 | rope.extend(std::iter::repeat("x").take(NODE_STR_SIZE * 2)); 387 | check(&rope); 388 | } 389 | } -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # JumpRope 2 | //! 3 | //! A small, fast rope library for rust built on a skip list of gap buffers 4 | //! 5 | //! This library enables super fast in-memory string editing, where an edit might insert, delete 6 | //! or modify text from anywhere in the string. 
Unlike inserting and deleting in a String directly, 7 | //! jumprope avoids expensive memcopy / memmove operations. All editing operations are O(log n) 8 | //! based on the size of the string. 9 | //! 10 | //! ## Example 11 | //! 12 | //! ``` 13 | //! use jumprope::JumpRope; 14 | //! 15 | //! let mut rope = JumpRope::from("Some large text document"); 16 | //! rope.insert(5, "really "); // "Some really large text document" 17 | //! rope.replace(0..4, "My rad"); // "My rad really large text document" 18 | //! assert_eq!(rope, "My rad really large text document"); 19 | //! 20 | //! // Extract to a string 21 | //! let s: String = rope.to_string(); 22 | //! assert_eq!(s, "My rad really large text document"); 23 | //! ``` 24 | //! 25 | //! See the [`JumpRope`] type for more usage details. 26 | //! 27 | //! # Random numbers, Determinism and DoS protection 28 | //! 29 | //! Jumprope is built on top of [skip lists](https://en.wikipedia.org/wiki/Skip_list), which are a 30 | //! probabilistic data structure. Each node in the list uses a random number generator to decide 31 | //! its "height". To do this well, skip lists depend on a random number generator for performance. 32 | //! If a pathologically bad RNG source was used, the skip list would degrade to a linked list (with 33 | //! `O(n)` performance). 34 | //! 35 | //! ## Security 36 | //! 37 | //! We have plenty of high quality RNGs available in rust. However, the bad news is that if a 38 | //! malicious actor can: 39 | //! 40 | //! - Predict the sequence of random numbers, and 41 | //! - Control a sequence of insert & removal operations in the rope 42 | //! 43 | //! Then they can *force* the rope to degrade to `O(n)` performance. 44 | //! 45 | //! The obvious protection against this is to use a good RNG, seeded with a good entropy source. 46 | //! This makes the random sequence impossible to predict. Luckily jumprope isn't sensitive to the 47 | //! performance of the RNG used. 
The only downside is that using a CSRNG + a good entropy source 48 | //! makes the compiled binary bigger. 49 | //! 50 | //! So there's a feature flag: `["ddos_protection"]`. This flag configures jumprope to use a larger 51 | //! CSRNG instead of a PRNG. To disable it (eg for WASM), you need to compile jumprope with default 52 | //! features turned off: 53 | //! 54 | //! ```toml 55 | //! jumprope = { default-features = false } 56 | //! ``` 57 | //! 58 | //! 59 | //! 60 | //! # A rant on character lengths 61 | //! 62 | //! There are 3 different, useful ways to measure string lengths. All of them are useful in certain 63 | //! situations: 64 | //! 65 | //! - The number of bytes needed to represent the string, in some specific encoding (eg UTF8) 66 | //! - The number of unicode characters contained within 67 | //! - The number of grapheme clusters in the string. This is the number of characters drawn to 68 | //! the screen. 69 | //! 70 | //! For example, the unicode polar bear ("🐻‍❄️") has a single grapheme cluster (only one 71 | //! character is drawn). It contains 4 unicode characters (Bear emoji + zero width joiner + snow 72 | //! emoji + variation selector). And it takes 13 bytes to store in UTF8. 73 | //! 74 | //! ``` 75 | //! # use jumprope::*; 76 | //! assert_eq!("🐻‍❄️".len(), 13); 77 | //! assert_eq!("🐻‍❄️".chars().count(), 4); 78 | //! 79 | //! let rope = JumpRope::from("🐻‍❄️"); // One grapheme cluster 80 | //! assert_eq!(rope.len_bytes(), 13); // 13 UTF8 bytes 81 | //! assert_eq!(rope.len_chars(), 4); // 4 unicode characters 82 | //! ``` 83 | //! 84 | //! Worse, many popular languages (including javascript and C#) use UCS2 internally and thus their 85 | //! `string.length` property doesn't give you a useful value for any application. Javascript reports 86 | //! the polar bear's length as 5 - which is useless: 87 | //! 88 | //! ```shell 89 | //! $ node 90 | //! Welcome to Node.js v16.6.1. 91 | //! > "🐻‍❄️".length 92 | //! 5 93 | //! ``` 94 | //! 95 | //! 
But there is no perfect "length" property for a string anyway: 96 | //! 97 | //! - The number of bytes is encoding-specific. The polar bear takes 13 bytes in UTF8, but only 10 98 | //! bytes in UTF16. 99 | //! - The number of grapheme clusters varies by device, font and software version. The conversion 100 | //! from characters to grapheme clusters is complex, and changes all the time. The polar bear 101 | //! icon was only added in May 2019. If your software is older than that (or uses a text library 102 | //! older than that), you will just see "🐻❄️". 103 | //! 104 | //! Most CRDTs and OT systems are slowly standardizing on counting unicode character positions as 105 | //! the default "length" property. The number of unicode characters isn't human-meaningful, but it 106 | //! has a number of useful properties: 107 | //! 108 | //! - It's simple and easy to define 109 | //! - It's stable across time (unlike grapheme clusters) 110 | //! - It's rarely convenient, but it's very portable across different programming languages, 111 | //! regardless of that language's character encoding system. 112 | //! 113 | //! Jumprope follows this approach, using unicode character positions everywhere internally: 114 | //! 115 | //! ``` 116 | //! # use jumprope::*; 117 | //! let mut rope = JumpRope::from("🐻‍❄️"); 118 | //! rope.remove(1..4); // Remove "polar" from our polar bear 119 | //! assert_eq!(rope, "🐻"); 120 | //! 
``` 121 | 122 | #![cfg_attr(doc_cfg, feature(doc_cfg))] 123 | 124 | mod jumprope; 125 | mod gapbuffer; 126 | mod utils; 127 | mod iter; 128 | mod fast_str_tools; 129 | 130 | pub use crate::jumprope::JumpRope; 131 | 132 | mod buffered; 133 | pub use crate::buffered::JumpRopeBuf; -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use crate::fast_str_tools::*; 2 | 3 | // Get the byte offset after char_pos utf8 characters 4 | pub(crate) fn str_chars_to_bytes(s: &str, char_pos: usize) -> usize { 5 | // s.char_indices().nth(char_pos).map_or_else( 6 | // || s.len(), 7 | // |(i, _)| i 8 | // ) 9 | 10 | char_to_byte_idx(s, char_pos) 11 | } 12 | 13 | // pub(crate) fn str_bytes_to_chars(s: &str, bytes: usize) -> usize { 14 | // byte_to_char_idx(s, bytes) 15 | // } 16 | // 17 | // pub(crate) fn count_chars(s: &str) -> usize { 18 | // str_bytes_to_chars(s, s.len()) 19 | // } 20 | 21 | pub(crate) fn str_chars_to_bytes_rev(s: &str, char_len: usize) -> usize { 22 | if char_len == 0 { return 0; } 23 | 24 | // Scan backwards, looking for utf8 start bytes (marked by 0b0xxxxxxx or 0b11xxxxxx - ie any byte which is not a 0b10xxxxxx continuation byte) 25 | let mut chars_remaining = char_len; 26 | for (i, byte) in s.as_bytes().iter().rev().enumerate() { 27 | if (*byte & 0b11_00_0000) != 0b10_00_0000 { 28 | chars_remaining -= 1; 29 | if chars_remaining == 0 { return i+1; } 30 | } 31 | } 32 | panic!("Insufficient characters in string"); 33 | } 34 | 35 | // #[cfg(feature = "wchar_conversion")] 36 | // pub(crate) fn count_wchars(s: &str) -> usize { 37 | // // TODO: There's a better way to write this. 38 | // s.chars() 39 | // .map(|c| c.len_utf16()) 40 | // .sum() 41 | // } 42 | // 43 | // #[cfg(feature = "wchar_conversion")] 44 | // pub(crate) fn str_chars_to_wchars(s: &str, char_len: usize) -> usize { 45 | // // TODO: There's a better way to write this. 46 | // // TODO: Compare this with char_len + filter + count. 
47 | // s.chars() 48 | // .take(char_len) 49 | // .map(|c| c.len_utf16()) 50 | // .sum() 51 | // } 52 | 53 | #[cfg(feature = "line_conversion")] 54 | pub(crate) fn count_lines(s: &str) -> usize { 55 | // I'm sure there's faster implementations of this but this will do for now. 56 | s.as_bytes().iter().filter(|b| **b == ('\n' as u8)).count() 57 | } 58 | 59 | #[cfg(test)] 60 | mod tests { 61 | use crate::utils::*; 62 | 63 | fn check_counts(s: &str) { 64 | let num_chars = s.chars().count(); 65 | assert_eq!(count_chars(s), num_chars); 66 | 67 | for i in 0..=num_chars { 68 | let byte_offset = str_chars_to_bytes(s, i); 69 | assert_eq!(count_chars(&s[..byte_offset]), i); 70 | 71 | let end_offset = str_chars_to_bytes_rev(s, num_chars - i); 72 | assert_eq!(end_offset, s.len() - byte_offset); 73 | } 74 | } 75 | 76 | #[test] 77 | fn backwards_smoke_tests() { 78 | check_counts("hi there"); 79 | check_counts("κό𝕐𝕆😘σμε"); 80 | } 81 | 82 | #[test] 83 | #[cfg(feature = "line_conversion")] 84 | fn count_lines_tests() { 85 | assert_eq!(count_lines(""), 0); 86 | assert_eq!(count_lines("\n"), 1); 87 | assert_eq!(count_lines("fop\n\n"), 2); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /tests/test.rs: -------------------------------------------------------------------------------- 1 | // These tests are also adapted from the C code tests here: 2 | // https://github.com/josephg/librope/blob/master/test/tests.c 3 | 4 | use rand::prelude::*; 5 | 6 | use std::cmp::min; 7 | use std::ops::Range; 8 | use std::ptr; 9 | use jumprope::JumpRope; 10 | use jumprope::JumpRopeBuf; 11 | 12 | const UNI_CHARS: [char; 24] = [ 13 | '\n', 'a', 'b', 'c', '1', '2', '3', ' ', '_', // ASCII. 
14 | '©', '¥', '½', // The Latin-1 suppliment (U+80 - U+ff) 15 | 'Ύ', 'Δ', 'δ', 'Ϡ', // Greek (U+0370 - U+03FF) 16 | '←', '↯', '↻', '⇈', // Arrows (U+2190 – U+21FF) 17 | '𐆐', '𐆔', '𐆘', '𐆚', // Ancient roman symbols (U+10190 – U+101CF) 18 | ]; 19 | 20 | fn random_unicode_string(len: usize, rng: &mut SmallRng) -> String { 21 | let mut s = String::new(); 22 | for _ in 0..len { 23 | s.push(UNI_CHARS[rng.gen_range(0 .. UNI_CHARS.len())] as char); 24 | } 25 | s 26 | } 27 | 28 | const ASCII_CHARS: &[u8; 83] = b" ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()[]{}<>?,./"; 29 | 30 | // Gross. Find a way to reuse the code from random_unicode_string. 31 | #[allow(unused)] 32 | fn random_ascii_string(len: usize, rng: &mut SmallRng) -> String { 33 | let mut s = String::new(); 34 | for _ in 0..len { 35 | s.push(ASCII_CHARS[rng.gen_range(0 .. ASCII_CHARS.len())] as char); 36 | } 37 | s 38 | } 39 | 40 | fn check(r: &JumpRope, expected: &str) { 41 | // println!("--- rope ---"); 42 | // r.print(); 43 | 44 | r.check(); 45 | assert_eq!(r.to_string(), expected); 46 | assert_eq!(r.len_bytes(), expected.len()); 47 | assert_eq!(r.len_chars(), expected.chars().count()); 48 | #[cfg(feature = "wchar_conversion")] { 49 | assert_eq!(r.len_wchars(), expected.chars().map(|c| c.len_utf16()).sum()); 50 | 51 | assert_eq!(r.chars_to_wchars(r.len_chars()), r.len_wchars()); 52 | assert_eq!(r.chars_to_wchars(0), 0); 53 | assert!(r.len_wchars() >= r.len_chars()); 54 | 55 | // And if we convert back, we should get the number of characters. 
56 | assert_eq!(r.wchars_to_chars(r.len_wchars()), r.len_chars()); 57 | } 58 | assert_eq!(*r, JumpRope::from(expected), "Rope comparison fails"); 59 | 60 | let clone = r.clone(); 61 | // println!("--- clone ---"); 62 | // clone.print(); 63 | clone.check(); 64 | assert_eq!(*r, clone, "Rope does not equal its clone"); 65 | } 66 | 67 | #[test] 68 | fn empty_rope_has_no_contents() { 69 | let mut r = JumpRope::new(); 70 | check(&r, ""); 71 | 72 | r.insert(0, ""); 73 | check(&r, ""); 74 | } 75 | 76 | #[test] 77 | fn from_str_and_string() { 78 | let r1 = JumpRope::from("hi"); 79 | check(&r1, "hi"); 80 | 81 | let r2 = JumpRope::from(String::from("hi")); 82 | check(&r2, "hi"); 83 | } 84 | 85 | #[test] 86 | fn insert_at_location() { 87 | let mut r = JumpRope::new(); 88 | 89 | r.insert(0, "AAA"); 90 | check(&r, "AAA"); 91 | 92 | r.insert(0, "BBB"); 93 | check(&r, "BBBAAA"); 94 | 95 | r.insert(6, "CCC"); 96 | check(&r, "BBBAAACCC"); 97 | 98 | r.insert(5, "DDD"); 99 | check(&r, "BBBAADDDACCC"); 100 | } 101 | 102 | #[test] 103 | fn new_string_has_content() { 104 | let r = JumpRope::from("hi there"); 105 | check(&r, "hi there"); 106 | 107 | let mut r = JumpRope::from("κόσμε"); 108 | check(&r, "κόσμε"); 109 | r.insert(2, "𝕐𝕆😘"); 110 | check(&r, "κό𝕐𝕆😘σμε"); 111 | } 112 | 113 | #[test] 114 | fn del_at_location() { 115 | let mut r = JumpRope::from("012345678"); 116 | check(&r, "012345678"); 117 | 118 | r.remove(8..9); 119 | check(&r, "01234567"); 120 | 121 | r.remove(0..1); 122 | check(&r, "1234567"); 123 | 124 | r.remove(5..6); 125 | check(&r, "123457"); 126 | 127 | r.remove(5..6); 128 | check(&r, "12345"); 129 | 130 | r.remove(0..5); 131 | check(&r, ""); 132 | } 133 | 134 | #[test] 135 | fn del_past_end_of_string() { 136 | let mut r = JumpRope::new(); 137 | 138 | r.remove(0..100); 139 | check(&r, ""); 140 | 141 | r.insert(0, "hi there"); 142 | r.remove(3..13); 143 | check(&r, "hi "); 144 | } 145 | 146 | #[test] 147 | fn really_long_ascii_string() { 148 | let mut rng = 
SmallRng::seed_from_u64(1234); 149 | let len = 2000; 150 | let s = random_ascii_string(len, &mut rng); 151 | // let s = random_unicode_string(len, &mut rng); 152 | 153 | let mut r = JumpRope::from(s.as_str()); 154 | check(&r, s.as_str()); 155 | 156 | // Delete everything but the first and last characters 157 | r.remove(1..len - 1); 158 | let expect = format!("{}{}", s.chars().next().unwrap(), s.chars().rev().next().unwrap()); 159 | check(&r, expect.as_str()); 160 | } 161 | 162 | fn string_insert_at(s: &mut String, char_pos: usize, contents: &str) { 163 | // If you try to write past the end of the string for now I'll just write at the end. 164 | // Panicing might be a better policy. 165 | let byte_pos = s.char_indices().skip(char_pos).next() 166 | .map(|(p, _)| p).unwrap_or(s.len()); 167 | 168 | let old_len = s.len(); 169 | let new_bytes = contents.len(); 170 | 171 | // This didn't work because it didn't change the string's length 172 | //s.reserve(new_bytes); 173 | 174 | // This is sort of ugly but its fine. 
175 | for _ in 0..new_bytes { s.push('\0'); } 176 | 177 | //println!("new bytes {} {} {}", new_bytes, byte_pos, s.len() - byte_pos); 178 | unsafe { 179 | let bytes = s.as_mut_vec().as_mut_ptr(); 180 | ptr::copy( 181 | bytes.offset(byte_pos as isize), 182 | bytes.offset((byte_pos + new_bytes) as isize), 183 | old_len - byte_pos 184 | ); 185 | ptr::copy_nonoverlapping( 186 | contents.as_ptr(), 187 | bytes.offset(byte_pos as isize), 188 | new_bytes 189 | ); 190 | } 191 | } 192 | 193 | fn char_range_to_byte_range(s: &String, range: Range) -> Range { 194 | let mut iter = s.char_indices().map(|(p, _)| p).skip(range.start).peekable(); 195 | 196 | let start = iter.peek().map_or_else(|| s.len(), |&p| p); 197 | let mut iter = iter.skip(range.end - range.start).peekable(); 198 | let end = iter.peek().map_or_else(|| s.len(), |&p| p); 199 | 200 | start..end 201 | } 202 | 203 | fn string_del_at(s: &mut String, pos: usize, length: usize) { 204 | let byte_range = char_range_to_byte_range(s, pos..pos+length); 205 | 206 | s.drain(byte_range); 207 | } 208 | 209 | fn random_edits(seed: u64, verbose: bool) { 210 | let mut r = JumpRope::new(); 211 | let mut s = String::new(); 212 | 213 | // let mut rng = rand::thread_rng(); 214 | let mut rng = SmallRng::seed_from_u64(seed); 215 | 216 | for _i in 0..400 { 217 | if verbose { println!("{_i} s: '{s}'"); } 218 | // r.print(); 219 | 220 | let len = s.chars().count(); 221 | 222 | // if _i == 1 { 223 | // println!("haaayyy"); 224 | // } 225 | // println!("i {}: {}", i, len); 226 | 227 | if len == 0 || (len < 1000 && rng.gen::() < 0.5) { 228 | // Insert. 229 | let pos = rng.gen_range(0..len+1); 230 | // Sometimes generate strings longer than a single node to stress everything. 
231 | let text = random_unicode_string(rng.gen_range(0..20), &mut rng); 232 | if verbose { 233 | println!("Inserting '{text}' at char {pos} (Byte length: {}, char len: {}, wchar len: {})", 234 | text.len(), text.chars().count(), 235 | text.chars().map(|c| c.len_utf16()).sum::() 236 | ); 237 | } 238 | 239 | r.insert(pos, text.as_str()); 240 | string_insert_at(&mut s, pos, text.as_str()); 241 | } else { 242 | // Delete 243 | let pos = rng.gen_range(0..len); 244 | let dlen = min(rng.gen_range(0..10), len - pos); 245 | if verbose { 246 | println!("Removing {dlen} characters at {pos}"); 247 | } 248 | 249 | r.remove(pos..pos+dlen); 250 | string_del_at(&mut s, pos, dlen); 251 | } 252 | 253 | // Calling check() is super slow with miri, and it doesn't matter much so long as we test 254 | // for correctness normally. 255 | if !cfg!(miri) { 256 | check(&r, s.as_str()); 257 | } 258 | } 259 | 260 | if cfg!(miri) { 261 | check(&r, s.as_str()); 262 | } 263 | } 264 | 265 | #[test] 266 | fn fuzz_once() { 267 | random_edits(10, false); 268 | } 269 | 270 | // Run with: 271 | // cargo test --release fuzz_forever -- --ignored --nocapture 272 | #[test] 273 | #[ignore] 274 | fn fuzz_forever() { 275 | for seed in 0.. { 276 | if seed % 100 == 0 { println!("seed: {seed}"); } 277 | random_edits(seed, false); 278 | } 279 | } 280 | 281 | #[cfg(feature = "wchar_conversion")] 282 | fn random_edits_wchar(seed: u64, verbose: bool) { 283 | let mut r = JumpRope::new(); 284 | let mut s = String::new(); 285 | 286 | // let mut rng = rand::thread_rng(); 287 | let mut rng = SmallRng::seed_from_u64(seed); 288 | 289 | for _i in 0..400 { 290 | if verbose { println!("{_i} s: '{s}'"); } 291 | // r.print(); 292 | let len_chars = s.chars().count(); 293 | 294 | // println!("i {}: {}", i, len); 295 | 296 | if len_chars == 0 || (len_chars < 1000 && rng.gen::() < 0.5) { 297 | // Insert. 
298 | let pos_chars = rng.gen_range(0..len_chars + 1); 299 | // Convert pos to wchars 300 | let pos_wchar = s 301 | .chars() 302 | .take(pos_chars) 303 | .map(|c| c.len_utf16()) 304 | .sum(); 305 | // Sometimes generate strings longer than a single node to stress everything. 306 | let text = random_unicode_string(rng.gen_range(0..20), &mut rng); 307 | if verbose { 308 | println!("Inserting '{text}' at char {pos_chars} / wchar {pos_wchar}"); 309 | println!("Byte length {} char len {} / wchar len {}", 310 | text.len(), text.chars().count(), text.chars().map(|c| c.len_utf16()).sum::()); 311 | } 312 | r.insert_at_wchar(pos_wchar, text.as_str()); 313 | // r.print(); 314 | string_insert_at(&mut s, pos_chars, text.as_str()); 315 | } else { 316 | // Delete 317 | let pos_chars = rng.gen_range(0..len_chars); 318 | let dlen_chars = min(rng.gen_range(0..10), len_chars - pos_chars); 319 | let char_range = pos_chars..pos_chars+dlen_chars; 320 | let byte_range = char_range_to_byte_range(&s, char_range.clone()); 321 | // Now convert it to a wchar range :p 322 | let start_wchar = s[..byte_range.start].chars().map(|c| c.len_utf16()).sum::(); 323 | let len_wchar = s[byte_range.clone()].chars().map(|c| c.len_utf16()).sum::(); 324 | let wchar_range = start_wchar..start_wchar + len_wchar; 325 | 326 | if verbose { 327 | println!("Removing {}..{} (wchar {}..{})", 328 | char_range.start, char_range.end, 329 | wchar_range.start, wchar_range.end 330 | ); 331 | } 332 | 333 | // r.remove(pos_chars..pos_chars + dlen_chars); 334 | r.remove_at_wchar(wchar_range); 335 | // r.print(); 336 | // string_del_at(&mut s, pos_chars, dlen_chars); 337 | s.drain(byte_range); 338 | } 339 | 340 | if !cfg!(miri) { 341 | check(&r, s.as_str()); 342 | } 343 | } 344 | } 345 | 346 | #[cfg(feature = "wchar_conversion")] 347 | #[test] 348 | fn fuzz_wchar_once() { 349 | random_edits_wchar(22, false); 350 | } 351 | 352 | // Run with: 353 | // cargo test --release fuzz_forever -- --ignored --nocapture 354 | #[cfg(feature 
= "wchar_conversion")] 355 | #[test] 356 | #[ignore] 357 | fn fuzz_wchar_forever() { 358 | for seed in 0.. { 359 | if seed % 100 == 0 { println!("seed: {seed}"); } 360 | random_edits_wchar(seed, false); 361 | } 362 | } 363 | 364 | fn random_edits_buffered(seed: u64, verbose: bool) { 365 | let mut r = JumpRopeBuf::new(); 366 | let mut s = String::new(); 367 | 368 | // let mut rng = rand::thread_rng(); 369 | let mut rng = SmallRng::seed_from_u64(seed); 370 | 371 | for _i in 0..400 { 372 | // for _i in 0..19 { 373 | if verbose { println!("{_i} s: '{s}'"); } 374 | // r.print(); 375 | 376 | let len = s.chars().count(); 377 | 378 | // if _i == 1 { 379 | // println!("haaayyy"); 380 | // } 381 | // println!("i {}: {}", i, len); 382 | 383 | if len == 0 || (len < 1000 && rng.gen::() < 0.5) { 384 | // Insert. 385 | let pos = rng.gen_range(0..len+1); 386 | // Sometimes generate strings longer than a single node to stress everything. 387 | let text = random_unicode_string(rng.gen_range(0..20), &mut rng); 388 | if verbose { 389 | println!("Inserting '{text}' at char {pos} (Byte length: {}, char len: {}, wchar len: {})", 390 | text.len(), text.chars().count(), 391 | text.chars().map(|c| c.len_utf16()).sum::() 392 | ); 393 | } 394 | 395 | r.insert(pos, text.as_str()); 396 | string_insert_at(&mut s, pos, text.as_str()); 397 | } else { 398 | // Delete 399 | let pos = rng.gen_range(0..len); 400 | let dlen = min(rng.gen_range(0..10), len - pos); 401 | if verbose { 402 | println!("Removing {dlen} characters at {pos}"); 403 | } 404 | 405 | r.remove(pos..pos+dlen); 406 | string_del_at(&mut s, pos, dlen); 407 | } 408 | // dbg!(&r); 409 | 410 | assert_eq!(r.is_empty(), s.is_empty()); 411 | 412 | // Checking the length flushes the buffered op - which is a useful test, but if we do it 413 | // every time, the buffer won't build up and the test won't have the right coverage. 
414 | if rng.gen_bool(0.05) { 415 | assert_eq!(r.len_chars(), s.chars().count()); 416 | } 417 | } 418 | 419 | let rope = r.into_inner(); 420 | check(&rope, s.as_str()); 421 | } 422 | 423 | #[test] 424 | fn fuzz_buffered_once() { 425 | random_edits_buffered(0, false); 426 | } 427 | 428 | #[test] 429 | #[ignore] 430 | fn fuzz_buffered_forever() { 431 | for seed in 0.. { 432 | if seed % 1000 == 0 { println!("seed: {seed}"); } 433 | random_edits_buffered(seed, false); 434 | } 435 | } 436 | 437 | #[test] 438 | fn eq_variants() { 439 | let rope = JumpRope::from("Hi there"); 440 | 441 | assert_eq!(rope.clone(), "Hi there"); 442 | assert_eq!(rope.clone(), String::from("Hi there")); 443 | assert_eq!(rope.clone(), &String::from("Hi there")); 444 | 445 | assert_eq!(&rope, "Hi there"); 446 | assert_eq!(&rope, String::from("Hi there")); 447 | assert_eq!(&rope, &String::from("Hi there")); 448 | } 449 | 450 | #[test] 451 | fn buffered_eq_variants() { 452 | let rope = JumpRopeBuf::from("Hi there"); 453 | 454 | assert_eq!(rope.clone(), "Hi there"); 455 | assert_eq!(rope.clone(), String::from("Hi there")); 456 | assert_eq!(rope.clone(), &String::from("Hi there")); 457 | 458 | assert_eq!(&rope, "Hi there"); 459 | assert_eq!(&rope, String::from("Hi there")); 460 | assert_eq!(&rope, &String::from("Hi there")); 461 | } --------------------------------------------------------------------------------