├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── afl ├── README.md ├── afl_harness.c ├── afl_testcases │ └── simple └── run_afl ├── librope.sln ├── librope.vcxproj ├── librope.xcodeproj └── project.pbxproj ├── rope.c ├── rope.h └── test ├── benchmark.c ├── slowstring.c ├── slowstring.h ├── tests.c └── tests.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.bc 3 | librope.a 4 | tests 5 | *.swp 6 | .DS_Store 7 | Debug 8 | Release 9 | librope.suo 10 | librope.sdf 11 | librope.vcxproj.* 12 | Build 13 | *.dSYM 14 | 15 | afl/afl 16 | afl/afl_findings 17 | 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Licensed under the standard MIT license: 2 | 3 | Copyright 2011 Joseph Gentle. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean 2 | 3 | CFLAGS=-O2 -Wall -I. -std=c99 4 | 5 | UNAME := $(shell uname) 6 | 7 | ifeq ($(UNAME), Darwin) 8 | CFLAGS := $(CFLAGS) -arch x86_64 9 | endif 10 | 11 | all: librope.a 12 | 13 | clean: 14 | rm -f librope.a *.bc *.o tests 15 | 16 | # You can add -emit-llvm here if you're using clang. 17 | rope.o: rope.c rope.h 18 | $(CC) $(CFLAGS) $< -c -o $@ 19 | 20 | librope.a: rope.o 21 | ar rcs $@ $+ 22 | 23 | # Only need corefoundation to run the tests on mac 24 | tests: test/tests.c test/benchmark.c test/slowstring.c librope.a 25 | $(CC) $(CFLAGS) $+ -o $@ 26 | 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | librope 2 | ======= 3 | 4 | This is a little C library for heavyweight utf-8 strings (rope). Unlike regular C strings, ropes can do substring insertion and deletion in O(log n) time. 5 | 6 | librope is implemented using skip lists, which have the same big-O time complexity as trees but don't require rebalancing. 7 | 8 | librope is _fast_. It will happily perform [~15 million edit operations per second](https://home.seph.codes/public/rope_bench/realworld/C-JumpRope/automerge-paper/report/index.html) on a modern CPU. Inserts and deletes in librope outperform straight C strings for any document longer than a few hundred bytes. 9 | 10 | ## Support 11 | 12 | This library works (C code never dies). But I'm moving to rust for my newer projects. This library has been rewritten in rust as [Jumprope](https://crates.io/crates/jumprope). Jumprope is another 2-3x faster than this library on real world editing traces. It's obnoxiously fast. 13 | 14 | Usage 15 | ----- 16 | 17 | Just add `rope.c` and `rope.h` to your project. 
18 | Be sure to add `rope.c` to your compile line as well. 19 | 20 | ```c 21 | // Import rope library into project 22 | #include "rope.h" 23 | 24 | // Make a new empty rope 25 | rope *r = rope_new(); 26 | 27 | // Put some content in it (at position 0) 28 | rope_insert(r, 0, "Hi there!"); 29 | 30 | // Delete 6 characters at position 2 31 | rope_del(r, 2, 6); 32 | 33 | // Get the whole string back out of the rope 34 | uint8_t *str = rope_create_cstr(r); 35 | 36 | // str now contains "Hi!"! Test it out!: 37 | _rope_print(r); 38 | 39 | // Done with the rope? 40 | rope_free(r); 41 | ``` 42 | 43 | Wide Character String Compatibility 44 | ----------------------------------- 45 | 46 | String insertion / deletion positions in Javascript, Objective-C (NSString), Java, C# and others are **wrong sometimes**!!! 47 | 48 | These languages store strings as `wchar` arrays (arrays of two byte characters). Some characters in the unicode character set require more than two bytes. These languages encode such characters using multiple wchars as per UTF-16. This works most of the time. However, insertion and deletion positions in these strings still refer to offsets in the underlying array. So unicode characters which take up 4 bytes in UTF-16 count as two characters for the purpose of deletion ranges, insertion positions and string length. 49 | 50 | Even though these characters are exceptionally rare, I don't want my editor to go all funky if people start getting creative. About a quarter of librope's code is dedicated to fixing this mismatch. However, bookkeeping isn't free - librope performance drops by 35% when wchar conversion support is enabled. 51 | 52 | For more information, read my [blog post about it](https://josephg.com/blog/string-length-lies). 53 | 54 | Long story short, if you need to interoperate with strings from any of these dodgy languages, here's what you do: 55 | 56 | - Compile with `-DROPE_WCHAR=1`. This macro enables the expensive wchar bookkeeping. 
57 | - Use the alternate insert & delete functions `rope_insert_at_wchar(...)` and `rope_del_at_wchar(...)` when your index / size is specified in UTF-16 offsets. 58 | 59 | Take a look at the header file for documentation. 60 | 61 | #### Beware: 62 | 63 | - When using `rope_insert_at_wchar` you still need to convert the string you're inserting into UTF-8 before you pass it into librope. 64 | - The API lets you try to delete or insert halfway through a large character. You probably don't want to do that. 65 | - librope is 100% faithful when it comes to the characters you're inserting. If your string has byte order marks, you might want to remove them before passing the string into librope. 66 | 67 | -------------------------------------------------------------------------------- /afl/README.md: -------------------------------------------------------------------------------- 1 | This is a little harness & set of tools for testing librope with 2 | [american fuzzy lop](http://lcamtuf.coredump.cx/afl/). 3 | 4 | To get started, have a read through my [blog post on 5 | AFL](https://josephg.com/blog/bug-hunting-with-american-fuzzy-lop/). 6 | -------------------------------------------------------------------------------- /afl/afl_harness.c: -------------------------------------------------------------------------------- 1 | // 2 | // afl.c 3 | // librope 4 | // 5 | // Created by Joseph Gentle on 11/12/2014. 6 | // Copyright (c) 2014 Joseph Gentle. All rights reserved. 
7 | // 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "rope.h" 14 | 15 | int main() { 16 | printf("AFL test harness\n"); 17 | rope *r = rope_new(); 18 | 19 | //FILE *stream = fopen("/Users/josephg/src/librope/death1", "r"); 20 | FILE *stream = stdin; 21 | 22 | char *buffer = NULL; 23 | size_t buf_cap = 0; 24 | while (true) { 25 | // First read the position we're editing the rope 26 | ssize_t bytes_read = getline(&buffer, &buf_cap, stream); 27 | if (bytes_read == -1) break; 28 | 29 | int pos = atoi(buffer); 30 | int length = (int)rope_char_count(r); 31 | pos = pos < 0 ? 0 : pos > length ? length : pos; 32 | 33 | // Now read the characters to insert 34 | bytes_read = getline(&buffer, &buf_cap, stream); 35 | if (bytes_read == -1) break; 36 | 37 | if (bytes_read > 0 && buffer[0] == '-') { 38 | // Delete some characters 39 | int to_del = atoi(&buffer[1]); 40 | rope_del(r, pos, to_del); 41 | } else { 42 | // Delete the newline. 43 | if (bytes_read > 0) buffer[bytes_read - 1] = '\0'; 44 | ROPE_RESULT result = rope_insert(r, pos, (uint8_t *)buffer); 45 | if (result == ROPE_INVALID_UTF8) { 46 | fprintf(stderr, "invalid utf8 - insert ignored\n"); 47 | } 48 | } 49 | } 50 | 51 | _rope_check(r); 52 | printf("Final length: %zu\n", rope_char_count(r)); 53 | rope_free(r); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /afl/afl_testcases/simple: -------------------------------------------------------------------------------- 1 | 0 2 | omg hi 3 | 3 4 | -3 5 | 6 | -------------------------------------------------------------------------------- /afl/run_afl: -------------------------------------------------------------------------------- 1 | rm -rf afl_findings/* 2 | afl-clang -O2 -Wall -I.. 
-std=c99 -arch x86_64 ../rope.c afl_harness.c -o afl 3 | afl-fuzz -i afl_testcases -o afl_findings ./afl 4 | 5 | -------------------------------------------------------------------------------- /librope.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "librope", "librope.vcxproj", "{8BC2F5A9-0E22-C440-4A06-6EA14611419A}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Release|Win32 = Release|Win32 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {8BC2F5A9-0E22-C440-4A06-6EA14611419A}.Debug|Win32.ActiveCfg = Debug|Win32 13 | {8BC2F5A9-0E22-C440-4A06-6EA14611419A}.Debug|Win32.Build.0 = Debug|Win32 14 | {8BC2F5A9-0E22-C440-4A06-6EA14611419A}.Release|Win32.ActiveCfg = Release|Win32 15 | {8BC2F5A9-0E22-C440-4A06-6EA14611419A}.Release|Win32.Build.0 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /librope.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | Win32Proj 15 | 16 | 17 | 18 | StaticLibrary 19 | true 20 | 21 | 22 | StaticLibrary 23 | false 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | true 37 | 38 | 39 | true 40 | 41 | 42 | 43 | WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 44 | MultiThreadedDebugDLL 45 | Level3 46 | ProgramDatabase 47 | Disabled 48 | CompileAsCpp 49 | 50 | 51 | MachineX86 52 | true 53 | Console 54 | 55 | 56 | 57 | 58 | WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 59 | MultiThreadedDLL 60 | Level3 61 | 
ProgramDatabase 62 | CompileAsCpp 63 | 64 | 65 | MachineX86 66 | true 67 | Console 68 | true 69 | true 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /librope.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 46; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | FD5D7C3315E1DC4A00F847DF /* rope.c in Sources */ = {isa = PBXBuildFile; fileRef = FD5D7C3215E1DC4A00F847DF /* rope.c */; }; 11 | FD90DBF315EF60900045B2C2 /* librope.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FD5D7C2515E1DBAD00F847DF /* librope.a */; }; 12 | FD967CC215ECBF8600B36CA1 /* benchmark.c in Sources */ = {isa = PBXBuildFile; fileRef = FD967CBD15ECBF8600B36CA1 /* benchmark.c */; }; 13 | FD967CC315ECBF8600B36CA1 /* slowstring.c in Sources */ = {isa = PBXBuildFile; fileRef = FD967CBE15ECBF8600B36CA1 /* slowstring.c */; }; 14 | FD967CC415ECBF8600B36CA1 /* tests.c in Sources */ = {isa = PBXBuildFile; fileRef = FD967CC015ECBF8600B36CA1 /* tests.c */; }; 15 | FD967CC515ECBF9400B36CA1 /* rope.h in Headers */ = {isa = PBXBuildFile; fileRef = FD5D7C3515E1DC5300F847DF /* rope.h */; settings = {ATTRIBUTES = (Public, ); }; }; 16 | /* End PBXBuildFile section */ 17 | 18 | /* Begin PBXCopyFilesBuildPhase section */ 19 | FD5D7C3A15E1DCA100F847DF /* CopyFiles */ = { 20 | isa = PBXCopyFilesBuildPhase; 21 | buildActionMask = 2147483647; 22 | dstPath = /usr/share/man/man1/; 23 | dstSubfolderSpec = 0; 24 | files = ( 25 | ); 26 | runOnlyForDeploymentPostprocessing = 1; 27 | }; 28 | /* End PBXCopyFilesBuildPhase section */ 29 | 30 | /* Begin PBXFileReference section */ 31 | FD5D7C2515E1DBAD00F847DF /* librope.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = librope.a; sourceTree = BUILT_PRODUCTS_DIR; }; 32 | 
FD5D7C3215E1DC4A00F847DF /* rope.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; lineEnding = 0; path = rope.c; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.c; }; 33 | FD5D7C3515E1DC5300F847DF /* rope.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; lineEnding = 0; path = rope.h; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.objcpp; }; 34 | FD5D7C3C15E1DCA100F847DF /* tests */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = tests; sourceTree = BUILT_PRODUCTS_DIR; }; 35 | FD967CBD15ECBF8600B36CA1 /* benchmark.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = benchmark.c; sourceTree = ""; }; 36 | FD967CBE15ECBF8600B36CA1 /* slowstring.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = slowstring.c; sourceTree = ""; }; 37 | FD967CBF15ECBF8600B36CA1 /* slowstring.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slowstring.h; sourceTree = ""; }; 38 | FD967CC015ECBF8600B36CA1 /* tests.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; lineEnding = 0; path = tests.c; sourceTree = ""; xcLanguageSpecificationIdentifier = xcode.lang.c; }; 39 | FD967CC115ECBF8600B36CA1 /* tests.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = tests.h; sourceTree = ""; }; 40 | /* End PBXFileReference section */ 41 | 42 | /* Begin PBXFrameworksBuildPhase section */ 43 | FD5D7C2215E1DBAD00F847DF /* Frameworks */ = { 44 | isa = PBXFrameworksBuildPhase; 45 | buildActionMask = 2147483647; 46 | files = ( 47 | ); 48 | runOnlyForDeploymentPostprocessing = 0; 49 | }; 50 | FD5D7C3915E1DCA100F847DF /* Frameworks */ = { 51 | isa = PBXFrameworksBuildPhase; 52 | buildActionMask = 2147483647; 53 | files = ( 54 | FD90DBF315EF60900045B2C2 /* librope.a in 
Frameworks */, 55 | ); 56 | runOnlyForDeploymentPostprocessing = 0; 57 | }; 58 | /* End PBXFrameworksBuildPhase section */ 59 | 60 | /* Begin PBXGroup section */ 61 | FD5D7C1A15E1DBAD00F847DF = { 62 | isa = PBXGroup; 63 | children = ( 64 | FD5D7C3215E1DC4A00F847DF /* rope.c */, 65 | FD5D7C3515E1DC5300F847DF /* rope.h */, 66 | FD967CBC15ECBF8600B36CA1 /* test */, 67 | FD5D7C2615E1DBAD00F847DF /* Products */, 68 | ); 69 | sourceTree = ""; 70 | }; 71 | FD5D7C2615E1DBAD00F847DF /* Products */ = { 72 | isa = PBXGroup; 73 | children = ( 74 | FD5D7C2515E1DBAD00F847DF /* librope.a */, 75 | FD5D7C3C15E1DCA100F847DF /* tests */, 76 | ); 77 | name = Products; 78 | sourceTree = ""; 79 | }; 80 | FD967CBC15ECBF8600B36CA1 /* test */ = { 81 | isa = PBXGroup; 82 | children = ( 83 | FD967CC015ECBF8600B36CA1 /* tests.c */, 84 | FD967CC115ECBF8600B36CA1 /* tests.h */, 85 | FD967CBD15ECBF8600B36CA1 /* benchmark.c */, 86 | FD967CBE15ECBF8600B36CA1 /* slowstring.c */, 87 | FD967CBF15ECBF8600B36CA1 /* slowstring.h */, 88 | ); 89 | path = test; 90 | sourceTree = ""; 91 | }; 92 | /* End PBXGroup section */ 93 | 94 | /* Begin PBXHeadersBuildPhase section */ 95 | FD5D7C2315E1DBAD00F847DF /* Headers */ = { 96 | isa = PBXHeadersBuildPhase; 97 | buildActionMask = 2147483647; 98 | files = ( 99 | FD967CC515ECBF9400B36CA1 /* rope.h in Headers */, 100 | ); 101 | runOnlyForDeploymentPostprocessing = 0; 102 | }; 103 | /* End PBXHeadersBuildPhase section */ 104 | 105 | /* Begin PBXNativeTarget section */ 106 | FD5D7C2415E1DBAD00F847DF /* rope */ = { 107 | isa = PBXNativeTarget; 108 | buildConfigurationList = FD5D7C2915E1DBAD00F847DF /* Build configuration list for PBXNativeTarget "rope" */; 109 | buildPhases = ( 110 | FD5D7C2115E1DBAD00F847DF /* Sources */, 111 | FD5D7C2215E1DBAD00F847DF /* Frameworks */, 112 | FD5D7C2315E1DBAD00F847DF /* Headers */, 113 | ); 114 | buildRules = ( 115 | ); 116 | dependencies = ( 117 | ); 118 | name = rope; 119 | productName = librope; 120 | productReference = 
FD5D7C2515E1DBAD00F847DF /* librope.a */; 121 | productType = "com.apple.product-type.library.static"; 122 | }; 123 | FD5D7C3B15E1DCA100F847DF /* tests */ = { 124 | isa = PBXNativeTarget; 125 | buildConfigurationList = FD5D7C4315E1DCA100F847DF /* Build configuration list for PBXNativeTarget "tests" */; 126 | buildPhases = ( 127 | FD5D7C3815E1DCA100F847DF /* Sources */, 128 | FD5D7C3915E1DCA100F847DF /* Frameworks */, 129 | FD5D7C3A15E1DCA100F847DF /* CopyFiles */, 130 | ); 131 | buildRules = ( 132 | ); 133 | dependencies = ( 134 | ); 135 | name = tests; 136 | productName = tests; 137 | productReference = FD5D7C3C15E1DCA100F847DF /* tests */; 138 | productType = "com.apple.product-type.tool"; 139 | }; 140 | /* End PBXNativeTarget section */ 141 | 142 | /* Begin PBXProject section */ 143 | FD5D7C1C15E1DBAD00F847DF /* Project object */ = { 144 | isa = PBXProject; 145 | attributes = { 146 | LastUpgradeCheck = 0510; 147 | ORGANIZATIONNAME = "Joseph Gentle"; 148 | }; 149 | buildConfigurationList = FD5D7C1F15E1DBAD00F847DF /* Build configuration list for PBXProject "librope" */; 150 | compatibilityVersion = "Xcode 3.2"; 151 | developmentRegion = English; 152 | hasScannedForEncodings = 0; 153 | knownRegions = ( 154 | en, 155 | ); 156 | mainGroup = FD5D7C1A15E1DBAD00F847DF; 157 | productRefGroup = FD5D7C2615E1DBAD00F847DF /* Products */; 158 | projectDirPath = ""; 159 | projectRoot = ""; 160 | targets = ( 161 | FD5D7C2415E1DBAD00F847DF /* rope */, 162 | FD5D7C3B15E1DCA100F847DF /* tests */, 163 | ); 164 | }; 165 | /* End PBXProject section */ 166 | 167 | /* Begin PBXSourcesBuildPhase section */ 168 | FD5D7C2115E1DBAD00F847DF /* Sources */ = { 169 | isa = PBXSourcesBuildPhase; 170 | buildActionMask = 2147483647; 171 | files = ( 172 | FD5D7C3315E1DC4A00F847DF /* rope.c in Sources */, 173 | ); 174 | runOnlyForDeploymentPostprocessing = 0; 175 | }; 176 | FD5D7C3815E1DCA100F847DF /* Sources */ = { 177 | isa = PBXSourcesBuildPhase; 178 | buildActionMask = 2147483647; 179 | files 
= ( 180 | FD967CC215ECBF8600B36CA1 /* benchmark.c in Sources */, 181 | FD967CC315ECBF8600B36CA1 /* slowstring.c in Sources */, 182 | FD967CC415ECBF8600B36CA1 /* tests.c in Sources */, 183 | ); 184 | runOnlyForDeploymentPostprocessing = 0; 185 | }; 186 | /* End PBXSourcesBuildPhase section */ 187 | 188 | /* Begin XCBuildConfiguration section */ 189 | FD5D7C2715E1DBAD00F847DF /* Debug */ = { 190 | isa = XCBuildConfiguration; 191 | buildSettings = { 192 | ALWAYS_SEARCH_USER_PATHS = NO; 193 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 194 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 195 | COPY_PHASE_STRIP = NO; 196 | GCC_C_LANGUAGE_STANDARD = gnu11; 197 | GCC_DYNAMIC_NO_PIC = NO; 198 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 199 | GCC_OPTIMIZATION_LEVEL = 0; 200 | GCC_PREPROCESSOR_DEFINITIONS = ( 201 | "DEBUG=1", 202 | "$(inherited)", 203 | ); 204 | GCC_SYMBOLS_PRIVATE_EXTERN = NO; 205 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 206 | GCC_WARN_ABOUT_RETURN_TYPE = YES; 207 | GCC_WARN_UNINITIALIZED_AUTOS = YES; 208 | GCC_WARN_UNUSED_VARIABLE = YES; 209 | MACOSX_DEPLOYMENT_TARGET = ""; 210 | ONLY_ACTIVE_ARCH = YES; 211 | SDKROOT = macosx; 212 | }; 213 | name = Debug; 214 | }; 215 | FD5D7C2815E1DBAD00F847DF /* Release */ = { 216 | isa = XCBuildConfiguration; 217 | buildSettings = { 218 | ALWAYS_SEARCH_USER_PATHS = NO; 219 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 220 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 221 | COPY_PHASE_STRIP = YES; 222 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 223 | GCC_C_LANGUAGE_STANDARD = gnu11; 224 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 225 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 226 | GCC_WARN_ABOUT_RETURN_TYPE = YES; 227 | GCC_WARN_UNINITIALIZED_AUTOS = YES; 228 | GCC_WARN_UNUSED_VARIABLE = YES; 229 | LLVM_LTO = YES; 230 | MACOSX_DEPLOYMENT_TARGET = ""; 231 | SDKROOT = macosx; 232 | }; 233 | name = Release; 234 | }; 235 | FD5D7C2A15E1DBAD00F847DF /* Debug */ = { 236 | isa = XCBuildConfiguration; 237 | buildSettings = { 238 | COMBINE_HIDPI_IMAGES = 
YES; 239 | EXECUTABLE_PREFIX = lib; 240 | PRODUCT_NAME = "$(TARGET_NAME)"; 241 | }; 242 | name = Debug; 243 | }; 244 | FD5D7C2B15E1DBAD00F847DF /* Release */ = { 245 | isa = XCBuildConfiguration; 246 | buildSettings = { 247 | COMBINE_HIDPI_IMAGES = YES; 248 | EXECUTABLE_PREFIX = lib; 249 | PRODUCT_NAME = "$(TARGET_NAME)"; 250 | }; 251 | name = Release; 252 | }; 253 | FD5D7C4415E1DCA100F847DF /* Debug */ = { 254 | isa = XCBuildConfiguration; 255 | buildSettings = { 256 | PRODUCT_NAME = "$(TARGET_NAME)"; 257 | }; 258 | name = Debug; 259 | }; 260 | FD5D7C4515E1DCA100F847DF /* Release */ = { 261 | isa = XCBuildConfiguration; 262 | buildSettings = { 263 | CLANG_USE_OPTIMIZATION_PROFILE = YES; 264 | PRODUCT_NAME = "$(TARGET_NAME)"; 265 | }; 266 | name = Release; 267 | }; 268 | /* End XCBuildConfiguration section */ 269 | 270 | /* Begin XCConfigurationList section */ 271 | FD5D7C1F15E1DBAD00F847DF /* Build configuration list for PBXProject "librope" */ = { 272 | isa = XCConfigurationList; 273 | buildConfigurations = ( 274 | FD5D7C2715E1DBAD00F847DF /* Debug */, 275 | FD5D7C2815E1DBAD00F847DF /* Release */, 276 | ); 277 | defaultConfigurationIsVisible = 0; 278 | defaultConfigurationName = Release; 279 | }; 280 | FD5D7C2915E1DBAD00F847DF /* Build configuration list for PBXNativeTarget "rope" */ = { 281 | isa = XCConfigurationList; 282 | buildConfigurations = ( 283 | FD5D7C2A15E1DBAD00F847DF /* Debug */, 284 | FD5D7C2B15E1DBAD00F847DF /* Release */, 285 | ); 286 | defaultConfigurationIsVisible = 0; 287 | defaultConfigurationName = Release; 288 | }; 289 | FD5D7C4315E1DCA100F847DF /* Build configuration list for PBXNativeTarget "tests" */ = { 290 | isa = XCConfigurationList; 291 | buildConfigurations = ( 292 | FD5D7C4415E1DCA100F847DF /* Debug */, 293 | FD5D7C4515E1DCA100F847DF /* Release */, 294 | ); 295 | defaultConfigurationIsVisible = 0; 296 | defaultConfigurationName = Release; 297 | }; 298 | /* End XCConfigurationList section */ 299 | }; 300 | rootObject = 
FD5D7C1C15E1DBAD00F847DF /* Project object */; 301 | } 302 | -------------------------------------------------------------------------------- /rope.c: -------------------------------------------------------------------------------- 1 | // Implementation for rope library. 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | // Needed for VC++, which always compiles in C++ mode and doesn't have stdbool. 8 | #ifndef __cplusplus 9 | #include 10 | #endif 11 | 12 | #include 13 | #include "rope.h" 14 | 15 | // The number of bytes the rope head structure takes up 16 | static const size_t ROPE_SIZE = sizeof(rope) + sizeof(rope_node) * ROPE_MAX_HEIGHT; 17 | 18 | // Create a new rope with no contents 19 | rope *rope_new2(void *(*alloc)(size_t bytes), 20 | void *(*realloc)(void *ptr, size_t newsize), 21 | void (*free)(void *ptr)) { 22 | rope *r = (rope *)alloc(ROPE_SIZE); 23 | r->num_chars = r->num_bytes = 0; 24 | 25 | r->alloc = alloc; 26 | r->realloc = realloc; 27 | r->free = free; 28 | 29 | r->head.height = 1; 30 | r->head.num_bytes = 0; 31 | r->head.nexts[0].node = NULL; 32 | r->head.nexts[0].skip_size = 0; 33 | #if ROPE_WCHAR 34 | r->head.nexts[0].wchar_size = 0; 35 | #endif 36 | return r; 37 | } 38 | 39 | rope *rope_new() { 40 | return rope_new2(malloc, realloc, free); 41 | } 42 | 43 | // Create a new rope containing the specified string 44 | rope *rope_new_with_utf8(const uint8_t *str) { 45 | rope *r = rope_new(); 46 | ROPE_RESULT result = rope_insert(r, 0, str); 47 | 48 | if (result != ROPE_OK) { 49 | rope_free(r); 50 | return NULL; 51 | } else { 52 | return r; 53 | } 54 | } 55 | 56 | rope *rope_copy(const rope *other) { 57 | rope *r = (rope *)other->alloc(ROPE_SIZE); 58 | 59 | // Just copy most of the head's data. Note this won't copy the nexts list in head. 60 | *r = *other; 61 | 62 | rope_node *nodes[ROPE_MAX_HEIGHT]; 63 | 64 | for (int i = 0; i < other->head.height; i++) { 65 | nodes[i] = &r->head; 66 | // non-NULL next pointers will be rewritten below. 
67 | r->head.nexts[i] = other->head.nexts[i]; 68 | } 69 | 70 | for (rope_node *n = other->head.nexts[0].node; n != NULL; n = n->nexts[0].node) { 71 | // I wonder if it would be faster if we took this opportunity to rebalance the node list..? 72 | size_t h = n->height; 73 | rope_node *n2 = (rope_node *)r->alloc(sizeof(rope_node) + h * sizeof(rope_skip_node)); 74 | 75 | // Would it be faster to just *n2 = *n; ? 76 | n2->num_bytes = n->num_bytes; 77 | n2->height = h; 78 | memcpy(n2->str, n->str, n->num_bytes); 79 | memcpy(n2->nexts, n->nexts, h * sizeof(rope_skip_node)); 80 | 81 | for (int i = 0; i < h; i++) { 82 | nodes[i]->nexts[i].node = n2; 83 | nodes[i] = n2; 84 | } 85 | } 86 | 87 | return r; 88 | } 89 | 90 | // Free the specified rope 91 | void rope_free(rope *r) { 92 | assert(r); 93 | rope_node *next; 94 | 95 | for (rope_node *n = r->head.nexts[0].node; n != NULL; n = next) { 96 | next = n->nexts[0].node; 97 | r->free(n); 98 | } 99 | 100 | r->free(r); 101 | } 102 | 103 | // Get the number of characters in a rope 104 | size_t rope_char_count(const rope *r) { 105 | assert(r); 106 | return r->num_chars; 107 | } 108 | 109 | // Get the number of bytes which the rope would take up if stored as a utf8 110 | // string 111 | size_t rope_byte_count(const rope *r) { 112 | assert(r); 113 | return r->num_bytes; 114 | } 115 | 116 | // Copies the rope's contents into a utf8 encoded C string. Also copies a trailing '\0' character. 117 | // Returns the number of bytes written, which is rope_byte_count(r) + 1. 
118 | size_t rope_write_cstr(rope *r, uint8_t *dest) { 119 | size_t num_bytes = rope_byte_count(r); 120 | dest[num_bytes] = '\0'; 121 | 122 | if (num_bytes) { 123 | uint8_t *p = dest; 124 | for (rope_node* restrict n = &r->head; n != NULL; n = n->nexts[0].node) { 125 | memcpy(p, n->str, n->num_bytes); 126 | p += n->num_bytes; 127 | } 128 | 129 | assert(p == &dest[num_bytes]); 130 | } 131 | return num_bytes + 1; 132 | } 133 | 134 | // Create a new C string which contains the rope. The string will contain 135 | // the rope encoded as utf8. 136 | uint8_t *rope_create_cstr(rope *r) { 137 | uint8_t *bytes = (uint8_t *)r->alloc(rope_byte_count(r) + 1); // Room for a zero. 138 | rope_write_cstr(r, bytes); 139 | return bytes; 140 | } 141 | 142 | #if ROPE_WCHAR 143 | size_t rope_wchar_count(rope *r) { 144 | assert(r); 145 | return r->head.nexts[r->head.height - 1].wchar_size; 146 | } 147 | #endif 148 | 149 | #define MIN(x,y) ((x) > (y) ? (y) : (x)) 150 | #define MAX(x,y) ((x) > (y) ? (x) : (y)) 151 | 152 | #ifdef _WIN32 153 | inline static long random() { 154 | return rand(); 155 | } 156 | #endif 157 | 158 | static uint8_t random_height() { 159 | // This function is horribly inefficient. I'm throwing away heaps of entropy, and 160 | // the mod could be replaced by some clever shifting. 161 | // 162 | // However, random_height barely appears in the profiler output - so its probably 163 | // not worth investing the time to optimise. 164 | 165 | uint8_t height = 1; 166 | 167 | // The root node's height is the height of the largest node + 1, so the largest 168 | // node can only have ROPE_MAX_HEIGHT - 1. 169 | while(height < (ROPE_MAX_HEIGHT - 1) && (random() % 100) < ROPE_BIAS) { 170 | height++; 171 | } 172 | 173 | return height; 174 | } 175 | 176 | // Figure out how many bytes to allocate for a node with the specified height. 
177 | static size_t node_size(uint8_t height) { 178 | return sizeof(rope_node) + height * sizeof(rope_skip_node); 179 | } 180 | 181 | // Allocate and return a new node. The new node will be full of junk, except 182 | // for its height. 183 | // This function should be replaced at some point with an object pool based version. 184 | static rope_node *alloc_node(rope *r, uint8_t height) { 185 | rope_node *node = (rope_node *)r->alloc(node_size(height)); 186 | node->height = height; 187 | return node; 188 | } 189 | 190 | // Find out how many bytes the unicode character which starts with the specified byte 191 | // will occupy in memory. 192 | // Returns the number of bytes, or SIZE_MAX if the byte is invalid. 193 | static inline size_t codepoint_size(uint8_t byte) { 194 | if (byte == 0) { return SIZE_MAX; } // NULL byte. 195 | else if (byte <= 0x7f) { return 1; } // 0x74 = 0111 1111 196 | else if (byte <= 0xbf) { return SIZE_MAX; } // 1011 1111. Invalid for a starting byte. 197 | else if (byte <= 0xdf) { return 2; } // 1101 1111 198 | else if (byte <= 0xef) { return 3; } // 1110 1111 199 | else if (byte <= 0xf7) { return 4; } // 1111 0111 200 | else if (byte <= 0xfb) { return 5; } // 1111 1011 201 | else if (byte <= 0xfd) { return 6; } // 1111 1101 202 | else { return SIZE_MAX; } 203 | } 204 | 205 | // This little function counts how many bytes a certain number of characters take up. 
static size_t count_bytes_in_utf8(const uint8_t *str, size_t num_chars) {
  const uint8_t *p = str;
  // The counter is a size_t so it cannot wrap before reaching num_chars.
  for (size_t i = 0; i < num_chars; i++) {
    p += codepoint_size(*p);
  }
  return p - str;
}

#if ROPE_WCHAR

// True when the lead byte's top four bits are all set (0xf0 and above); such
// a character is counted as two wchars.
#define NEEDS_TWO_WCHARS(x) (((x) & 0xf0) == 0xf0)

// Count how many wchars the first num_chars characters of str take up.
static size_t count_wchars_in_utf8(const uint8_t *str, size_t num_chars) {
  size_t wchars = 0;
  for (size_t i = 0; i < num_chars; i++) {
    wchars += 1 + NEEDS_TWO_WCHARS(*str);
    str += codepoint_size(*str);
  }
  return wchars;
}

// Count how many characters the first num_wchars wchars of str cover.
static size_t count_utf8_in_wchars(const uint8_t *str, size_t num_wchars) {
  size_t chars = num_wchars;
  for (size_t i = 0; i < num_wchars; i++) {
    if (NEEDS_TWO_WCHARS(*str)) {
      // This character used up two of the wchars: one fewer character
      // overall, and skip the second wchar it accounts for.
      chars--;
      i++;
    }
    str += codepoint_size(*str);
  }
  return chars;
}
#endif

// Count the number of characters in a NUL-terminated string.
static size_t strlen_utf8(const uint8_t *str) {
  const uint8_t *p = str;
  size_t i = 0;
  while (*p) {
    p += codepoint_size(*p);
    i++;
  }
  return i;
}

// Checks if a UTF8 string is ok. Returns the number of bytes in the string if
// it is ok, otherwise returns -1.
253 | static ssize_t bytelen_and_check_utf8(const uint8_t *str) { 254 | const uint8_t *p = str; 255 | while (*p != '\0') { 256 | size_t size = codepoint_size(*p); 257 | if (size == SIZE_MAX) return -1; 258 | p++; size--; 259 | while (size > 0) { 260 | // Check that any middle bytes are of the form 0x10xx xxxx 261 | if ((*p & 0xc0) != 0x80) 262 | return -1; 263 | p++; size--; 264 | } 265 | } 266 | 267 | #ifdef DEBUG 268 | size_t num = p - str; 269 | assert(num == strlen((char *)str)); 270 | #endif 271 | 272 | return p - str; 273 | } 274 | 275 | typedef struct { 276 | // This stores the previous node at each height, and the number of characters from the start of 277 | // the previous node to the current iterator position. 278 | rope_skip_node s[ROPE_MAX_HEIGHT]; 279 | } rope_iter; 280 | 281 | // Internal function for navigating to a particular character offset in the rope. 282 | // The function returns the list of nodes which point past the position, as well as 283 | // offsets of how far into their character lists the specified characters are. 284 | static rope_node *iter_at_char_pos(rope *r, size_t char_pos, rope_iter *iter) { 285 | assert(char_pos <= r->num_chars); 286 | 287 | rope_node *e = &r->head; 288 | int height = r->head.height - 1; 289 | 290 | // Offset stores how many characters we still need to skip in the current node. 291 | size_t offset = char_pos; 292 | size_t skip; 293 | #if ROPE_WCHAR 294 | size_t wchar_pos = 0; // Current wchar pos from the start of the rope. 295 | #endif 296 | 297 | while (true) { 298 | skip = e->nexts[height].skip_size; 299 | if (offset > skip) { 300 | // Go right. 301 | assert(e == &r->head || e->num_bytes); 302 | 303 | offset -= skip; 304 | #if ROPE_WCHAR 305 | wchar_pos += e->nexts[height].wchar_size; 306 | #endif 307 | e = e->nexts[height].node; 308 | } else { 309 | // Go down. 
310 | iter->s[height].skip_size = offset; 311 | iter->s[height].node = e; 312 | #if ROPE_WCHAR 313 | iter->s[height].wchar_size = wchar_pos; 314 | #endif 315 | 316 | if (height == 0) { 317 | break; 318 | } else { 319 | height--; 320 | } 321 | } 322 | } 323 | 324 | #if ROPE_WCHAR 325 | // For some reason, this is _REALLY SLOW_. Like, 5.5Mops/s -> 4Mops/s from this block of code. 326 | wchar_pos += count_wchars_in_utf8(e->str, offset); 327 | 328 | // The iterator has the wchar pos from the start of the whole string. 329 | for (int i = 0; i < r->head.height; i++) { 330 | iter->s[i].wchar_size = wchar_pos - iter->s[i].wchar_size; 331 | } 332 | #endif 333 | 334 | assert(offset <= ROPE_NODE_STR_SIZE); 335 | assert(iter->s[0].node == e); 336 | return e; 337 | } 338 | 339 | #if ROPE_WCHAR 340 | // Equivalent of iter_at_char_pos, but for wchar positions instead. 341 | static rope_node *iter_at_wchar_pos(rope *r, size_t wchar_pos, rope_iter *iter) { 342 | int height = r->head.height - 1; 343 | assert(wchar_pos <= r->head.nexts[height].wchar_size); 344 | 345 | rope_node *e = &r->head; 346 | 347 | // Offset stores how many wchar characters we still need to skip in the current node. 348 | size_t offset = wchar_pos; 349 | size_t skip; 350 | size_t char_pos = 0; // Current char pos from the start of the rope. 351 | 352 | while (true) { 353 | skip = e->nexts[height].wchar_size; 354 | if (offset > skip) { 355 | // Go right. 356 | offset -= skip; 357 | char_pos += e->nexts[height].skip_size; 358 | e = e->nexts[height].node; 359 | } else { 360 | // Go down. 361 | iter->s[height].skip_size = char_pos; 362 | iter->s[height].node = e; 363 | iter->s[height].wchar_size = offset; 364 | 365 | if (height == 0) { 366 | break; 367 | } else { 368 | height--; 369 | } 370 | } 371 | } 372 | 373 | char_pos += count_utf8_in_wchars(e->str, offset); 374 | 375 | // The iterator has character positions from the start of the rope to the start of the node. 
376 | for (int i = 0; i < r->head.height; i++) { 377 | iter->s[i].skip_size = char_pos - iter->s[i].skip_size; 378 | } 379 | assert(e == iter->s[0].node); 380 | return e; 381 | } 382 | #endif 383 | 384 | #if ROPE_WCHAR 385 | static void update_offset_list(rope *r, rope_iter *iter, size_t num_chars, size_t num_wchars) { 386 | for (int i = 0; i < r->head.height; i++) { 387 | iter->s[i].node->nexts[i].skip_size += num_chars; 388 | iter->s[i].node->nexts[i].wchar_size += num_wchars; 389 | } 390 | } 391 | #else 392 | static void update_offset_list(rope *r, rope_iter *iter, size_t num_chars) { 393 | for (int i = 0; i < r->head.height; i++) { 394 | iter->s[i].node->nexts[i].skip_size += num_chars; 395 | } 396 | } 397 | #endif 398 | 399 | 400 | // Internal method of rope_insert. 401 | // This function creates a new node in the rope at the specified position and fills it with the 402 | // passed string. 403 | static void insert_at(rope *r, rope_iter *iter, 404 | const uint8_t *str, size_t num_bytes, size_t num_chars) { 405 | #if ROPE_WCHAR 406 | size_t num_wchars = count_wchars_in_utf8(str, num_chars); 407 | #endif 408 | 409 | // This describes how many levels of the iter are filled in. 410 | uint8_t max_height = r->head.height; 411 | uint8_t new_height = random_height(); 412 | rope_node *new_node = alloc_node(r, new_height); 413 | new_node->num_bytes = num_bytes; 414 | memcpy(new_node->str, str, num_bytes); 415 | 416 | assert(new_height < ROPE_MAX_HEIGHT); 417 | 418 | // Max height (the rope's head's height) must be 1+ the height of the largest node. 419 | while (max_height <= new_height) { 420 | r->head.height++; 421 | r->head.nexts[max_height] = r->head.nexts[max_height - 1]; 422 | 423 | // This is the position (offset from the start) of the rope. 424 | iter->s[max_height] = iter->s[max_height - 1]; 425 | max_height++; 426 | } 427 | 428 | // Fill in the new node's nexts array. 
429 | int i; 430 | for (i = 0; i < new_height; i++) { 431 | rope_skip_node *prev_skip = &iter->s[i].node->nexts[i]; 432 | new_node->nexts[i].node = prev_skip->node; 433 | new_node->nexts[i].skip_size = num_chars + prev_skip->skip_size - iter->s[i].skip_size; 434 | 435 | 436 | prev_skip->node = new_node; 437 | prev_skip->skip_size = iter->s[i].skip_size; 438 | 439 | // & move the iterator to the end of the newly inserted node. 440 | iter->s[i].node = new_node; 441 | iter->s[i].skip_size = num_chars; 442 | #if ROPE_WCHAR 443 | new_node->nexts[i].wchar_size = num_wchars + prev_skip->wchar_size - iter->s[i].wchar_size; 444 | prev_skip->wchar_size = iter->s[i].wchar_size; 445 | iter->s[i].wchar_size = num_wchars; 446 | #endif 447 | } 448 | 449 | for (; i < max_height; i++) { 450 | iter->s[i].node->nexts[i].skip_size += num_chars; 451 | iter->s[i].skip_size += num_chars; 452 | #if ROPE_WCHAR 453 | iter->s[i].node->nexts[i].wchar_size += num_wchars; 454 | iter->s[i].wchar_size += num_wchars; 455 | #endif 456 | } 457 | 458 | r->num_chars += num_chars; 459 | r->num_bytes += num_bytes; 460 | } 461 | 462 | // Insert the given utf8 string into the rope at the specified position. 463 | static ROPE_RESULT rope_insert_at_iter(rope *r, rope_node *e, rope_iter *iter, const uint8_t *str) { 464 | // iter.offset contains how far (in characters) into the current element to skip. 465 | // Figure out how much that is in bytes. 466 | size_t offset_bytes = 0; 467 | // The insertion offset into the destination node. 468 | size_t offset = iter->s[0].skip_size; 469 | if (offset) { 470 | assert(offset <= e->nexts[0].skip_size); 471 | offset_bytes = count_bytes_in_utf8(e->str, offset); 472 | } 473 | 474 | // We might be able to insert the new data into the current node, depending on 475 | // how big it is. We'll count the bytes, and also check that its valid utf8. 
476 | ssize_t num_inserted_bytes = bytelen_and_check_utf8(str); 477 | if (num_inserted_bytes == -1) return ROPE_INVALID_UTF8; 478 | 479 | // Can we insert into the current node? 480 | bool insert_here = e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE; 481 | 482 | // Can we insert into the subsequent node? 483 | rope_node *next = NULL; 484 | if (!insert_here && offset_bytes == e->num_bytes) { 485 | next = e->nexts[0].node; 486 | // We can insert into the subsequent node if: 487 | // - We can't insert into the current node 488 | // - There _is_ a next node to insert into 489 | // - The insert would be at the start of the next node 490 | // - There's room in the next node 491 | if (next && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE) { 492 | offset = offset_bytes = 0; 493 | for (int i = 0; i < next->height; i++) { 494 | iter->s[i].node = next; 495 | // tree offset nodes will not be used. 496 | } 497 | e = next; 498 | 499 | insert_here = true; 500 | } 501 | } 502 | 503 | if (insert_here) { 504 | // First move the current bytes later on in the string. 505 | if (offset_bytes < e->num_bytes) { 506 | memmove(&e->str[offset_bytes + num_inserted_bytes], 507 | &e->str[offset_bytes], 508 | e->num_bytes - offset_bytes); 509 | } 510 | 511 | // Then copy in the string bytes 512 | memcpy(&e->str[offset_bytes], str, num_inserted_bytes); 513 | e->num_bytes += num_inserted_bytes; 514 | 515 | r->num_bytes += num_inserted_bytes; 516 | size_t num_inserted_chars = strlen_utf8(str); 517 | r->num_chars += num_inserted_chars; 518 | 519 | // .... aaaand update all the offset amounts. 520 | #if ROPE_WCHAR 521 | size_t num_inserted_wchars = count_wchars_in_utf8(str, num_inserted_chars); 522 | update_offset_list(r, iter, num_inserted_chars, num_inserted_wchars); 523 | #else 524 | update_offset_list(r, iter, num_inserted_chars); 525 | #endif 526 | 527 | } else { 528 | // There isn't room. We'll need to add at least one new node to the rope. 
529 | 530 | // If we're not at the end of the current node, we'll need to remove 531 | // the end of the current node's data and reinsert it later. 532 | size_t num_end_chars, num_end_bytes = e->num_bytes - offset_bytes; 533 | if (num_end_bytes) { 534 | // We'll pretend like the character have been deleted from the node, while leaving 535 | // the bytes themselves there (for later). 536 | e->num_bytes = offset_bytes; 537 | num_end_chars = e->nexts[0].skip_size - offset; 538 | #if ROPE_WCHAR 539 | size_t num_end_wchars = count_wchars_in_utf8(&e->str[offset_bytes], num_end_chars); 540 | update_offset_list(r, iter, -num_end_chars, -num_end_wchars); 541 | #else 542 | update_offset_list(r, iter, -num_end_chars); 543 | #endif 544 | 545 | r->num_chars -= num_end_chars; 546 | r->num_bytes -= num_end_bytes; 547 | } 548 | 549 | // Now we insert new nodes containing the new character data. The data must be broken into 550 | // pieces of with a maximum size of ROPE_NODE_STR_SIZE. Node boundaries must not occur in the 551 | // middle of a utf8 codepoint. 
552 | size_t str_offset = 0; 553 | while (str_offset < num_inserted_bytes) { 554 | size_t new_node_bytes = 0; 555 | size_t new_node_chars = 0; 556 | 557 | while (str_offset + new_node_bytes < num_inserted_bytes) { 558 | size_t cs = codepoint_size(str[str_offset + new_node_bytes]); 559 | if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) { 560 | break; 561 | } else { 562 | new_node_bytes += cs; 563 | new_node_chars++; 564 | } 565 | } 566 | 567 | insert_at(r, iter, &str[str_offset], new_node_bytes, new_node_chars); 568 | str_offset += new_node_bytes; 569 | } 570 | 571 | if (num_end_bytes) { 572 | insert_at(r, iter, &e->str[offset_bytes], num_end_bytes, num_end_chars); 573 | } 574 | } 575 | 576 | return ROPE_OK; 577 | } 578 | 579 | ROPE_RESULT rope_insert(rope *r, size_t pos, const uint8_t *str) { 580 | assert(r); 581 | assert(str); 582 | #ifdef DEBUG 583 | _rope_check(r); 584 | #endif 585 | pos = MIN(pos, r->num_chars); 586 | 587 | rope_iter iter; 588 | // First we need to search for the node where we'll insert the string. 589 | rope_node *e = iter_at_char_pos(r, pos, &iter); 590 | 591 | ROPE_RESULT result = rope_insert_at_iter(r, e, &iter, str); 592 | 593 | #ifdef DEBUG 594 | _rope_check(r); 595 | #endif 596 | 597 | return result; 598 | } 599 | 600 | #if ROPE_WCHAR 601 | // Insert the given utf8 string into the rope at the specified position. 602 | size_t rope_insert_at_wchar(rope *r, size_t wchar_pos, const uint8_t *str) { 603 | assert(r); 604 | assert(str); 605 | #ifdef DEBUG 606 | _rope_check(r); 607 | #endif 608 | wchar_pos = MIN(wchar_pos, rope_wchar_count(r)); 609 | 610 | rope_iter iter; 611 | // First we need to search for the node where we'll insert the string. 
612 | rope_node *e = iter_at_wchar_pos(r, wchar_pos, &iter); 613 | size_t pos = iter.s[r->head.height - 1].skip_size; 614 | rope_insert_at_iter(r, e, &iter, str); 615 | 616 | #ifdef DEBUG 617 | _rope_check(r); 618 | #endif 619 | return pos; 620 | } 621 | 622 | #endif 623 | 624 | // Delete num characters at position pos. Deleting past the end of the string 625 | // has no effect. 626 | static void rope_del_at_iter(rope *r, rope_node *e, rope_iter *iter, size_t length) { 627 | r->num_chars -= length; 628 | size_t offset = iter->s[0].skip_size; 629 | while (length) { 630 | if (offset == e->nexts[0].skip_size) { 631 | // End of the current node. Skip to the start of the next one. 632 | e = iter->s[0].node->nexts[0].node; 633 | offset = 0; 634 | } 635 | 636 | size_t num_chars = e->nexts[0].skip_size; 637 | size_t removed = MIN(length, num_chars - offset); 638 | #if ROPE_WCHAR 639 | size_t removed_wchars; 640 | #endif 641 | 642 | int i; 643 | if (removed < num_chars || e == &r->head) { 644 | // Just trim this node down to size. 
645 | size_t leading_bytes = count_bytes_in_utf8(e->str, offset); 646 | size_t removed_bytes = count_bytes_in_utf8(&e->str[leading_bytes], removed); 647 | size_t trailing_bytes = e->num_bytes - leading_bytes - removed_bytes; 648 | #if ROPE_WCHAR 649 | removed_wchars = count_wchars_in_utf8(&e->str[leading_bytes], removed); 650 | #endif 651 | if (trailing_bytes) { 652 | memmove(&e->str[leading_bytes], &e->str[leading_bytes + removed_bytes], trailing_bytes); 653 | } 654 | e->num_bytes -= removed_bytes; 655 | r->num_bytes -= removed_bytes; 656 | 657 | for (i = 0; i < e->height; i++) { 658 | e->nexts[i].skip_size -= removed; 659 | #if ROPE_WCHAR 660 | e->nexts[i].wchar_size -= removed_wchars; 661 | #endif 662 | } 663 | } else { 664 | // Remove the node from the list 665 | #if ROPE_WCHAR 666 | removed_wchars = e->nexts[0].wchar_size; 667 | #endif 668 | for (i = 0; i < e->height; i++) { 669 | iter->s[i].node->nexts[i].node = e->nexts[i].node; 670 | iter->s[i].node->nexts[i].skip_size += e->nexts[i].skip_size - removed; 671 | #if ROPE_WCHAR 672 | iter->s[i].node->nexts[i].wchar_size += e->nexts[i].wchar_size - removed_wchars; 673 | #endif 674 | } 675 | 676 | r->num_bytes -= e->num_bytes; 677 | // TODO: Recycle e. 678 | rope_node *next = e->nexts[0].node; 679 | r->free(e); 680 | e = next; 681 | } 682 | 683 | for (; i < r->head.height; i++) { 684 | iter->s[i].node->nexts[i].skip_size -= removed; 685 | #if ROPE_WCHAR 686 | iter->s[i].node->nexts[i].wchar_size -= removed_wchars; 687 | #endif 688 | } 689 | 690 | length -= removed; 691 | } 692 | } 693 | 694 | void rope_del(rope *r, size_t pos, size_t length) { 695 | #ifdef DEBUG 696 | _rope_check(r); 697 | #endif 698 | 699 | assert(r); 700 | pos = MIN(pos, r->num_chars); 701 | length = MIN(length, r->num_chars - pos); 702 | 703 | rope_iter iter; 704 | 705 | // Search for the node where we'll insert the string. 
706 | rope_node *e = iter_at_char_pos(r, pos, &iter); 707 | 708 | rope_del_at_iter(r, e, &iter, length); 709 | 710 | #ifdef DEBUG 711 | _rope_check(r); 712 | #endif 713 | } 714 | 715 | #if ROPE_WCHAR 716 | size_t rope_del_at_wchar(rope *r, size_t wchar_pos, size_t wchar_num, size_t *char_len_out) { 717 | #ifdef DEBUG 718 | _rope_check(r); 719 | #endif 720 | 721 | assert(r); 722 | size_t wchar_total = rope_wchar_count(r); 723 | wchar_pos = MIN(wchar_pos, wchar_total); 724 | wchar_num = MIN(wchar_num, wchar_total - wchar_pos); 725 | 726 | rope_iter iter; 727 | 728 | // Search for the node where we'll insert the string. 729 | rope_node *start = iter_at_wchar_pos(r, wchar_pos, &iter); 730 | size_t char_pos = iter.s[r->head.height - 1].skip_size; 731 | 732 | rope_iter end_iter; 733 | int h = r->head.height - 1; 734 | iter_at_wchar_pos(r, iter.s[h].wchar_size + wchar_num, &end_iter); 735 | 736 | size_t char_length = end_iter.s[h].skip_size - iter.s[h].skip_size; 737 | rope_del_at_iter(r, start, &iter, char_length); 738 | 739 | #ifdef DEBUG 740 | _rope_check(r); 741 | #endif 742 | if (char_len_out) { 743 | *char_len_out = char_length; 744 | } 745 | return char_pos; 746 | } 747 | #endif 748 | 749 | void _rope_check(rope *r) { 750 | assert(r->head.height); // Even empty ropes have a height of 1. 751 | assert(r->num_bytes >= r->num_chars); 752 | 753 | rope_skip_node skip_over = r->head.nexts[r->head.height - 1]; 754 | assert(skip_over.skip_size == r->num_chars); 755 | assert(skip_over.node == NULL); 756 | 757 | size_t num_bytes = 0; 758 | size_t num_chars = 0; 759 | #if ROPE_WCHAR 760 | size_t num_wchar = 0; 761 | #endif 762 | 763 | // The offsets here are used to store the total distance travelled from the start 764 | // of the rope. 
765 | rope_iter iter = {}; 766 | for (int i = 0; i < r->head.height; i++) { 767 | iter.s[i].node = &r->head; 768 | } 769 | 770 | for (rope_node *n = &r->head; n != NULL; n = n->nexts[0].node) { 771 | assert(n == &r->head || n->num_bytes); 772 | assert(n->height <= ROPE_MAX_HEIGHT); 773 | assert(count_bytes_in_utf8(n->str, n->nexts[0].skip_size) == n->num_bytes); 774 | #if ROPE_WCHAR 775 | assert(count_wchars_in_utf8(n->str, n->nexts[0].skip_size) == n->nexts[0].wchar_size); 776 | #endif 777 | for (int i = 0; i < n->height; i++) { 778 | assert(iter.s[i].node == n); 779 | assert(iter.s[i].skip_size == num_chars); 780 | iter.s[i].node = n->nexts[i].node; 781 | iter.s[i].skip_size += n->nexts[i].skip_size; 782 | #if ROPE_WCHAR 783 | assert(iter.s[i].wchar_size == num_wchar); 784 | iter.s[i].wchar_size += n->nexts[i].wchar_size; 785 | #endif 786 | } 787 | 788 | num_bytes += n->num_bytes; 789 | num_chars += n->nexts[0].skip_size; 790 | #if ROPE_WCHAR 791 | num_wchar += n->nexts[0].wchar_size; 792 | #endif 793 | } 794 | 795 | for (int i = 0; i < r->head.height; i++) { 796 | assert(iter.s[i].node == NULL); 797 | assert(iter.s[i].skip_size == num_chars); 798 | #if ROPE_WCHAR 799 | assert(iter.s[i].wchar_size == num_wchar); 800 | #endif 801 | } 802 | 803 | assert(r->num_bytes == num_bytes); 804 | assert(r->num_chars == num_chars); 805 | #if ROPE_WCHAR 806 | assert(skip_over.wchar_size == num_wchar); 807 | #endif 808 | } 809 | 810 | // For debugging. 
811 | #include 812 | void _rope_print(rope *r) { 813 | printf("chars: %zd\tbytes: %zd\theight: %d\n", r->num_chars, r->num_bytes, r->head.height); 814 | 815 | printf("HEAD"); 816 | for (int i = 0; i < r->head.height; i++) { 817 | printf(" |%3zd ", r->head.nexts[i].skip_size); 818 | } 819 | printf("\n"); 820 | 821 | int num = 0; 822 | for (rope_node *n = &r->head; n != NULL; n = n->nexts[0].node) { 823 | printf("%3d:", num++); 824 | for (int i = 0; i < n->height; i++) { 825 | printf(" |%3zd ", n->nexts[i].skip_size); 826 | } 827 | printf(" : \""); 828 | fwrite(n->str, n->num_bytes, 1, stdout); 829 | printf("\"\n"); 830 | } 831 | } 832 | -------------------------------------------------------------------------------- /rope.h: -------------------------------------------------------------------------------- 1 | /* UTF-8 Rope implementation by Joseph Gentle 2 | * 3 | * This library implements a heavyweight utf8 string type with fast 4 | * insert-at-position and delete-at-position operations. 5 | * 6 | * It uses skip lists instead of trees. Trees might be faster - who knows? 7 | * 8 | * Ropes are not syncronized. Do not access the same rope from multiple threads 9 | * simultaneously. 10 | */ 11 | 12 | #ifndef librope_rope_h 13 | #define librope_rope_h 14 | 15 | #include 16 | #include 17 | 18 | // Whether or not the rope should support converting UTF-8 character offsets to 19 | // wchar array positions. This is useful when interoperating with strings in 20 | // JS, Objective-C and many other languages. See 21 | // http://josephg.com/post/31707645955/string-length-lies 22 | // 23 | // Adding wchar conversion support decreases performance by about 30%. 24 | #ifndef ROPE_WCHAR 25 | #define ROPE_WCHAR 0 26 | #endif 27 | 28 | // These two magic values seem to be approximately optimal given the benchmark 29 | // in tests.c which does lots of small inserts. 30 | 31 | // Must be <= UINT16_MAX. 
// Benchmarking says this is pretty close to optimal
// (tested on a mac using clang 4.0 and x86_64).
#ifndef ROPE_NODE_STR_SIZE
#if ROPE_WCHAR
#define ROPE_NODE_STR_SIZE 64
#else
#define ROPE_NODE_STR_SIZE 136
#endif
#endif

// The likelihood (%) a node will have height (n+1) instead of n
#ifndef ROPE_BIAS
#define ROPE_BIAS 25
#endif

// The rope will stop being efficient after the string is 2 ^ ROPE_MAX_HEIGHT
// nodes.
#ifndef ROPE_MAX_HEIGHT
#define ROPE_MAX_HEIGHT 60
#endif

struct rope_node_t;

// One forward link of the skip list, plus the distance that link covers.
// The number of characters in str can be read out of nexts[0].skip_size.
typedef struct {
  // The number of _characters_ between the start of the current node
  // and the start of next.
  size_t skip_size;

  // For some reason, librope runs about 1% faster when this next pointer is
  // exactly _here_ in the struct.
  struct rope_node_t *node;

#if ROPE_WCHAR
  // The number of wide characters covered by this link (the wchar
  // counterpart of skip_size).
  size_t wchar_size;
#endif
} rope_skip_node;

typedef struct rope_node_t {
  // Raw utf8 bytes. Only the first num_bytes are in use.
  uint8_t str[ROPE_NODE_STR_SIZE];

  // The number of bytes in str in use
  uint16_t num_bytes;

  // This is the number of elements allocated in nexts.
  // Each height is 1/2 as likely as the height before. The minimum height is 1.
  // NOTE(review): ROPE_BIAS defaults to 25 (%), which does not match the
  // "1/2" above - confirm against the height generator in rope.c.
  uint8_t height;

  // Flexible array member: one entry per level, `height` entries in total.
  rope_skip_node nexts[];
} rope_node;

typedef struct {
  // The total number of characters in the rope.
  size_t num_chars;

  // The total number of bytes which the characters in the rope take up.
  size_t num_bytes;

  // Allocator callbacks, supplied through rope_new2().
  void *(*alloc)(size_t bytes);
  void *(*realloc)(void *ptr, size_t newsize);
  void (*free)(void *ptr);

  // The first node exists inline in the rope structure itself.
  rope_node head;
} rope;

#ifdef __cplusplus
extern "C" {
#endif

// Create a new rope with no contents
rope *rope_new();

// Create a new rope using custom allocators.
rope *rope_new2(void *(*alloc)(size_t bytes),
    void *(*realloc)(void *ptr, size_t newsize),
    void (*free)(void *ptr));

// Create a new rope containing a copy of the given string. Shorthand for
// r = rope_new(); rope_insert(r, 0, str);
rope *rope_new_with_utf8(const uint8_t *str);

// Make a copy of an existing rope
rope *rope_copy(const rope *r);

// Free the specified rope
void rope_free(rope *r);

// Get the number of characters in a rope
size_t rope_char_count(const rope *r);

// Get the number of bytes which the rope would take up if stored as a utf8
// string
size_t rope_byte_count(const rope *r);

// Copies the rope's contents into a utf8 encoded C string. Also copies a
// trailing '\0' character.
// Returns the number of bytes written, which is rope_byte_count(r) + 1.
size_t rope_write_cstr(rope *r, uint8_t *dest);

// Create a new C string which contains the rope. The string will contain
// the rope encoded as utf8, followed by a trailing '\0'.
// Use rope_byte_count(r) to get the length of the returned string.
uint8_t *rope_create_cstr(rope *r);

// If you try to insert data into the rope with an invalid UTF8 encoding,
// nothing will happen and we'll return ROPE_INVALID_UTF8.
typedef enum { ROPE_OK, ROPE_INVALID_UTF8 } ROPE_RESULT;

// Insert the given utf8 string into the rope at the specified position.
ROPE_RESULT rope_insert(rope *r, size_t pos, const uint8_t *str);

// Delete num characters at position pos. Deleting past the end of the string
// has no effect.
void rope_del(rope *r, size_t pos, size_t num);

// This macro expands to a for() loop header which loops over the segments in a
// rope. The first segment visited is the rope's inline head node.
//
// Node data is NOT '\0'-terminated, so always pair rope_node_data() with
// rope_node_num_bytes().
//
// Eg:
//  rope *r = rope_new_with_utf8(str);
//  ROPE_FOREACH(r, iter) {
//    fwrite(rope_node_data(iter), 1, rope_node_num_bytes(iter), stdout);
//  }
#define ROPE_FOREACH(rope, iter) \
  for (rope_node *iter = &(rope)->head; iter != NULL; iter = iter->nexts[0].node)

// Get the actual data inside a rope node. Only the first
// rope_node_num_bytes(n) bytes are valid.
static inline uint8_t *rope_node_data(rope_node *n) {
  return n->str;
}

// Get the number of bytes inside a rope node. This is useful when you're
// looping through a rope.
static inline size_t rope_node_num_bytes(rope_node *n) {
  return n->num_bytes;
}

// Get the number of characters inside a rope node.
static inline size_t rope_node_chars(rope_node *n) {
  return n->nexts[0].skip_size;
}

#if ROPE_WCHAR
// Get the number of wchar characters in the rope
size_t rope_wchar_count(rope *r);

// Insert the given utf8 string into the rope at the specified wchar position.
// This is compatible with NSString, Javascript, etc. The string still needs to
// be passed in using UTF-8.
//
// Returns the insertion position in characters.
size_t rope_insert_at_wchar(rope *r, size_t wchar_pos, const uint8_t *utf8_str);

// Delete wchar_num wide characters at the specified wchar position offset.
// If the range is inside character boundaries, behaviour is undefined.
//
// Returns the deletion position in characters. *char_len_out is set to the
// deletion length, in chars, if it is not null.
size_t rope_del_at_wchar(rope *r, size_t wchar_pos, size_t wchar_num, size_t *char_len_out);

// Get the number of wchars inside a rope node. This is useful when you're
// looping through a rope.
195 | static inline size_t rope_node_wchars(rope_node *n) { 196 | return n->nexts[0].wchar_size; 197 | } 198 | #endif 199 | 200 | 201 | 202 | // For debugging. 203 | void _rope_check(rope *r); 204 | void _rope_print(rope *r); 205 | 206 | #ifdef __cplusplus 207 | } 208 | #endif 209 | 210 | #endif 211 | -------------------------------------------------------------------------------- /test/benchmark.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #include "rope.h" 9 | #include "tests.h" 10 | 11 | #include "slowstring.h" 12 | 13 | #ifdef __cplusplus 14 | #include 15 | #endif 16 | 17 | // Wrapper for rope 18 | static void *_rope_create() { 19 | return (void *)rope_new(); 20 | } 21 | 22 | static void _rope_insert(void *r, size_t pos, const uint8_t *str) { 23 | rope_insert((rope *)r, pos, str); 24 | } 25 | static void _rope_del(void *r, size_t pos, size_t len) { 26 | rope_del((rope *)r, pos, len); 27 | } 28 | static void _rope_destroy(void *r) { 29 | rope_free((rope *)r); 30 | } 31 | 32 | static size_t _rope_num_chars(void *r) { 33 | return rope_char_count((rope *)r); 34 | } 35 | 36 | // Wrapper for a vector-based string 37 | 38 | static void *_str_create() { 39 | return (void *)str_create(); 40 | } 41 | 42 | static void _str_insert(void *r, size_t pos, const uint8_t *str) { 43 | str_insert((_string *)r, pos, str); 44 | } 45 | 46 | static void _str_del(void *r, size_t pos, size_t len) { 47 | str_del((_string *)r, pos, len); 48 | } 49 | 50 | static void _str_destroy(void *r) { 51 | str_destroy((_string *)r); 52 | } 53 | 54 | static size_t _str_num_chars(void *r) { 55 | return str_num_chars((_string *)r); 56 | } 57 | 58 | // SGI C++ rope. To enable these benchmarks, compile this file using a C++ compiler. There's a 59 | // bug with some versions of clang and the rope library - you might have to switch to gcc. 
60 | #ifdef __cplusplus 61 | static void *_sgi_create() { 62 | return new __gnu_cxx::crope(); 63 | } 64 | 65 | static void _sgi_insert(void *r, size_t pos, const uint8_t *str) { 66 | __gnu_cxx::crope *rope = (__gnu_cxx::crope *)r; 67 | rope->insert(pos, (const char *)str); 68 | } 69 | static void _sgi_del(void *r, size_t pos, size_t len) { 70 | __gnu_cxx::crope *rope = (__gnu_cxx::crope *)r; 71 | rope->erase(pos, len); 72 | } 73 | static void _sgi_destroy(void *r) { 74 | __gnu_cxx::crope *rope = (__gnu_cxx::crope *)r; 75 | delete rope; 76 | } 77 | 78 | static size_t _sgi_num_chars(void *r) { 79 | __gnu_cxx::crope *rope = (__gnu_cxx::crope *)r; 80 | return rope->size(); 81 | } 82 | #endif 83 | 84 | 85 | struct rope_implementation { 86 | const char *name; 87 | void* (*create)(); 88 | void (*insert)(void *r, size_t pos, const uint8_t *str); 89 | void (*del)(void *r, size_t pos, size_t len); 90 | void (*destroy)(void *r); 91 | size_t (*num_chars)(void *r); 92 | } types[] = { 93 | { "librope", &_rope_create, &_rope_insert, &_rope_del, &_rope_destroy, &_rope_num_chars }, 94 | #ifdef __cplusplus 95 | { "sgirope", &_sgi_create, &_sgi_insert, &_sgi_del, &_sgi_destroy, &_sgi_num_chars }, 96 | #endif 97 | { "c string", &_str_create, &_str_insert, &_str_del, &_str_destroy, &_str_num_chars }, 98 | }; 99 | 100 | void benchmark() { 101 | printf("Benchmarking... (node size = %d, wchar support = %d)\n", 102 | ROPE_NODE_STR_SIZE, ROPE_WCHAR); 103 | 104 | long iterations = 20000000; 105 | // long iterations = 1000000; 106 | struct timeval start, end; 107 | 108 | // Make the test stable 109 | srandom(1234); 110 | 111 | uint8_t *strings[100]; 112 | for (int i = 0; i < 100; i++) { 113 | size_t len = 1 + random() % 2;//i * i + 1; 114 | strings[i] = (uint8_t *)calloc(1, len + 1); 115 | random_ascii_string(strings[i], len + 1); 116 | // random_unicode_string(strings[i], len + 1); 117 | } 118 | 119 | // We should pick the same random sequence each benchmark run. 
120 | unsigned long *rvals = (unsigned long *)malloc(sizeof(unsigned long) * iterations); 121 | for (int i = 0; i < iterations; i++) { 122 | rvals[i] = random(); 123 | } 124 | 125 | // for (int t = 0; t < sizeof(types) / sizeof(types[0]); t++) { 126 | for (int t = 0; t < 1; t++) { 127 | for (int i = 0; i < 5; i++) { 128 | printf("benchmarking %s\n", types[t].name); 129 | void *r = types[t].create(); 130 | 131 | gettimeofday(&start, NULL); 132 | 133 | for (long i = 0; i < iterations; i++) { 134 | if (types[t].num_chars(r) == 0 || i % 20 > 0) { 135 | // insert. (Inserts are way more common in practice than deletes.) 136 | uint8_t *str = strings[i % 100]; 137 | types[t].insert(r, rvals[i] % (types[t].num_chars(r) + 1), str); 138 | } else { 139 | size_t pos = rvals[i] % types[t].num_chars(r); 140 | size_t length = MIN(types[t].num_chars(r) - pos, 1 + (~rvals[i]) % 53); 141 | types[t].del(r, pos, length); 142 | } 143 | 144 | //printf("%s\n", rope_createcstr(r, NULL)); 145 | } 146 | 147 | gettimeofday(&end, NULL); 148 | 149 | double elapsedTime = end.tv_sec - start.tv_sec; 150 | elapsedTime += (end.tv_usec - start.tv_usec) / 1e6; 151 | printf("did %ld iterations in %f ms: %f Miter/sec\n", 152 | iterations, elapsedTime * 1000, iterations / elapsedTime / 1000000); 153 | printf("final string length: %zi\n", types[t].num_chars(r)); 154 | 155 | types[t].destroy(r); 156 | } 157 | } 158 | 159 | for (int i = 0; i < 100; i++) { 160 | free(strings[i]); 161 | } 162 | } 163 | 164 | -------------------------------------------------------------------------------- /test/slowstring.c: -------------------------------------------------------------------------------- 1 | // 2 | // slowstring.c 3 | // librope 4 | // 5 | // Created by Joseph Gentle on 28/08/12. 6 | // Copyright (c) 2012 Joseph Gentle. All rights reserved. 7 | // 8 | 9 | #include 10 | #include 11 | 12 | #include "slowstring.h" 13 | 14 | // Private rope methods, stolen for utf8 support in the string. 
15 | static size_t codepoint_size(uint8_t byte) { 16 | if (byte <= 0x7f) { return 1; } 17 | else if (byte <= 0xdf) { return 2; } 18 | else if (byte <= 0xef) { return 3; } 19 | else if (byte <= 0xf7) { return 4; } 20 | else if (byte <= 0xfb) { return 5; } 21 | else if (byte <= 0xfd) { return 6; } 22 | else { 23 | // The codepoint is invalid... what do? 24 | //assert(0); 25 | return 1; 26 | } 27 | } 28 | 29 | // This little function counts how many bytes the some characters take up. 30 | static size_t count_bytes_in_chars(const uint8_t *str, size_t num_chars) { 31 | const uint8_t *p = str; 32 | for (int i = 0; i < num_chars; i++) { 33 | p += codepoint_size(*p); 34 | } 35 | return p - str; 36 | } 37 | 38 | static size_t strlen_utf8(const uint8_t *str) { 39 | const uint8_t *p = str; 40 | size_t i = 0; 41 | while (*p) { 42 | p += codepoint_size(*p); 43 | i++; 44 | } 45 | return i; 46 | } 47 | 48 | _string *str_create() { 49 | _string *s = (_string *)malloc(sizeof(_string)); 50 | s->capacity = 64; // A reasonable capacity considering... 51 | s->mem = (uint8_t *)malloc(s->capacity); 52 | s->mem[0] = '\0'; 53 | s->len = 0; 54 | s->num_chars = 0; 55 | return s; 56 | } 57 | 58 | void str_insert(_string *s, size_t pos, const uint8_t *str) { 59 | size_t num_inserted_bytes = strlen((char *)str); 60 | // Offset to insert at in the string. 61 | size_t offset = count_bytes_in_chars(s->mem, pos); 62 | size_t end_size = s->len - offset; 63 | 64 | // Resize if needed. 65 | s->len += num_inserted_bytes; 66 | if (s->len >= s->capacity) { 67 | while (s->len >= s->capacity) { 68 | s->capacity *= 2; 69 | } 70 | s->mem = (uint8_t *)realloc(s->mem, s->capacity); 71 | } 72 | s->num_chars += strlen_utf8(str); 73 | 74 | memmove(&s->mem[offset + num_inserted_bytes], &s->mem[offset], end_size); 75 | memcpy(&s->mem[offset], str, num_inserted_bytes); 76 | s->mem[s->len] = '\0'; 77 | } 78 | 79 | void str_del(_string *s, size_t pos, size_t len) { 80 | // Offset to delete at in the string. 
81 | size_t offset = count_bytes_in_chars(s->mem, pos); 82 | size_t num_bytes = count_bytes_in_chars(s->mem + offset, len); 83 | size_t end_size = s->len - offset - num_bytes; 84 | 85 | if (end_size > 0) { 86 | memmove(&s->mem[offset], &s->mem[offset + num_bytes], end_size); 87 | } 88 | s->len -= num_bytes; 89 | s->num_chars -= len; 90 | s->mem[s->len] = '\0'; 91 | } 92 | 93 | void str_destroy(_string *s) { 94 | free(s->mem); 95 | free(s); 96 | } 97 | 98 | size_t str_num_chars(const _string *s) { 99 | return s->num_chars; 100 | } 101 | -------------------------------------------------------------------------------- /test/slowstring.h: -------------------------------------------------------------------------------- 1 | // This is a copy of the rope API using simple C strings. 2 | // 3 | // Its used for testing and benchmarking. 4 | 5 | #ifndef librope_slowstring_h 6 | #define librope_slowstring_h 7 | 8 | #include 9 | 10 | typedef struct { 11 | uint8_t *mem; 12 | size_t capacity; 13 | size_t len; 14 | size_t num_chars; 15 | } _string; 16 | 17 | _string *str_create(); 18 | 19 | void str_insert(_string *s, size_t pos, const uint8_t *str); 20 | 21 | void str_del(_string *s, size_t pos, size_t len); 22 | 23 | void str_destroy(_string *s); 24 | 25 | size_t str_num_chars(const _string *s); 26 | 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /test/tests.c: -------------------------------------------------------------------------------- 1 | // Tests for librope. 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "tests.h" 9 | #include "slowstring.h" 10 | #include "rope.h" 11 | 12 | static float rand_float() { 13 | return (float)random() / INT32_MAX; 14 | } 15 | 16 | // A selection of different unicode characters to pick from. 17 | // As far as I can tell, there are no unicode characters assigned which 18 | // take up more than 4 bytes in utf-8. 
19 | static const char *UCHARS[] = { 20 | "a", "b", "c", "1", "2", "3", " ", "\n", // ASCII 21 | "©", "¥", "½", // The Latin-1 suppliment (U+80 - U+ff) 22 | "Ύ", "Δ", "δ", "Ϡ", // Greek (U+0370 - U+03FF) 23 | "←", "↯", "↻", "⇈", // Arrows (U+2190 – U+21FF) 24 | "𐆐", "𐆔", "𐆘", "𐆚", // Ancient roman symbols (U+10190 – U+101CF) 25 | }; 26 | 27 | // s is the size of the buffer, including the \0. This function might use 28 | // fewer bytes than that. 29 | void random_unicode_string(uint8_t *buffer, size_t s) { 30 | if (s == 0) { return; } 31 | uint8_t *pos = buffer; 32 | 33 | while(1) { 34 | uint8_t *c = (uint8_t *)UCHARS[random() % (sizeof(UCHARS) / sizeof(UCHARS[0]))]; 35 | 36 | size_t bytes = strlen((char *)c); 37 | 38 | size_t remaining_space = buffer + s - pos - 1; 39 | 40 | if (remaining_space < bytes) { 41 | break; 42 | } 43 | 44 | memcpy(pos, c, bytes); 45 | pos += bytes; 46 | } 47 | 48 | *pos = '\0'; 49 | } 50 | 51 | static const char CHARS[] = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" 52 | "0123456789!@#$%^&*()[]{}<>?,./"; 53 | void random_ascii_string(uint8_t *buffer, size_t len) { 54 | assert(len); 55 | for (int i = 0; i < len - 1; i++) { 56 | buffer[i] = CHARS[random() % (sizeof(CHARS) - 1)]; 57 | } 58 | buffer[len - 1] = '\0'; 59 | } 60 | 61 | static size_t strlen_utf8(uint8_t *data) { 62 | size_t numchars = 0; 63 | 64 | while (*data) { 65 | if ((*data++ & 0xC0) != 0x80) { 66 | ++numchars; 67 | } 68 | } 69 | 70 | return numchars; 71 | } 72 | 73 | #if ROPE_WCHAR 74 | // Count the number of wchars this string would take up if it was encoded using utf16. 75 | static size_t wchar_size_count(uint8_t *data) { 76 | size_t num = 0; 77 | 78 | while (*data) { 79 | if ((*data & 0xC0) != 0x80) { 80 | ++num; 81 | if ((*data & 0xf0) == 0xf0) { 82 | // It'll take up 2 wchars, not just one. 
83 | ++num; 84 | } 85 | } 86 | 87 | ++data; 88 | } 89 | 90 | return num; 91 | } 92 | 93 | static size_t count_wchars_in_utf8(const uint8_t *str, size_t num_chars) { 94 | size_t wchars = num_chars; 95 | while (num_chars) { 96 | if ((*str & 0xf0) == 0xf0) { 97 | wchars++; 98 | } 99 | if ((*str & 0xc0) != 0x80) { 100 | num_chars--; 101 | } 102 | ++str; 103 | } 104 | return wchars; 105 | } 106 | #endif 107 | 108 | void test(int cond) { 109 | if (!cond) { 110 | fprintf(stderr, "Test failed\n"); 111 | assert(0); 112 | } 113 | } 114 | 115 | void check(rope *rope, char *expected) { 116 | // Rope will be null when the inserted data is invalid. 117 | assert((rope == NULL) == (expected == NULL)); 118 | 119 | if (rope) { 120 | _rope_check(rope); 121 | test(rope_byte_count(rope) == strlen(expected)); 122 | uint8_t *cstr = rope_create_cstr(rope); 123 | test(strcmp((char *)cstr, expected) == 0); 124 | free(cstr); 125 | } 126 | } 127 | 128 | static void test_empty_rope_has_no_content() { 129 | rope *r = rope_new(); 130 | check(r, ""); 131 | test(rope_char_count(r) == 0); 132 | 133 | uint8_t *bytes = rope_create_cstr(r); 134 | test(bytes[0] == '\0'); 135 | free(bytes); 136 | 137 | rope_free(r); 138 | } 139 | 140 | static void checked_insert(rope *r, size_t pos, char *str) { 141 | ROPE_RESULT result = rope_insert(r, pos, (uint8_t *)str); 142 | assert(result == ROPE_OK); 143 | } 144 | 145 | static void test_insert_at_location() { 146 | rope *r = rope_new(); 147 | 148 | checked_insert(r, 0, "AAA"); 149 | check(r, "AAA"); 150 | 151 | checked_insert(r, 0, "BBB"); 152 | check(r, "BBBAAA"); 153 | 154 | checked_insert(r, 6, "CCC"); 155 | check(r, "BBBAAACCC"); 156 | 157 | checked_insert(r, 5, "DDD"); 158 | check(r, "BBBAADDDACCC"); 159 | 160 | test(rope_char_count(r) == 12); 161 | 162 | rope_free(r); 163 | } 164 | 165 | static void check_invalid(char *err_str) { 166 | rope *r = rope_new(); 167 | ROPE_RESULT result = rope_insert(r, 0, (uint8_t *)err_str); 168 | assert(result == 
ROPE_INVALID_UTF8); 169 | 170 | // And check that nothing happened. 171 | assert(0 == rope_char_count(r)); 172 | assert(0 == rope_byte_count(r)); 173 | rope_free(r); 174 | } 175 | 176 | static void test_invalid_utf8_rejected() { 177 | check_invalid((char[]){0xb0,0}); // trailing middle byte 178 | check_invalid((char[]){0xc0,0}); // half of 2 byte sequence 179 | check_invalid((char[]){0xc0,0xb0,0xb0,0}); 180 | check_invalid((char[]){0xc0,0xc0,0xb0,0}); 181 | check_invalid((char[]){0xe0,0xb0,0}); // 2/3 in 3 byte sequence 182 | check_invalid((char[]){0xe0,0xb0,0xb0,0xb0,0}); 183 | check_invalid((char[]){0xe0,0xc0,0xb0,0}); 184 | check_invalid((char[]){0xe0,0xc0,0xb0,0xb0,0}); 185 | } 186 | 187 | // A rope initialized with a string has that string as its content 188 | static void test_new_string_has_content() { 189 | rope *r = rope_new_with_utf8((uint8_t *)"Hi there"); 190 | check(r, "Hi there"); 191 | test(rope_char_count(r) == strlen("Hi there")); 192 | rope_free(r); 193 | 194 | // If need be, this could be rewritten as an array of bytes... 
195 | r = rope_new_with_utf8((uint8_t *)"κόσμε"); 196 | check(r, "κόσμε"); 197 | test(rope_char_count(r) == 5); 198 | 199 | rope_insert(r, 2, (uint8_t *)"𝕐𝕆𝌀"); 200 | check(r, "κό𝕐𝕆𝌀σμε"); 201 | test(rope_char_count(r) == 8); 202 | rope_free(r); 203 | } 204 | 205 | static void test_delete_at_location() { 206 | rope *r = rope_new_with_utf8((uint8_t *)"012345678"); 207 | 208 | rope_del(r, 8, 1); 209 | check(r, "01234567"); 210 | 211 | rope_del(r, 0, 1); 212 | check(r, "1234567"); 213 | 214 | rope_del(r, 5, 1); 215 | check(r, "123457"); 216 | 217 | rope_del(r, 5, 1); 218 | check(r, "12345"); 219 | 220 | rope_del(r, 0, 5); 221 | check(r, ""); 222 | 223 | test(rope_char_count(r) == 0); 224 | 225 | rope_free(r); 226 | } 227 | 228 | static void test_delete_past_end_of_string() { 229 | rope *r = rope_new(); 230 | 231 | rope_del(r, 0, 100); 232 | check(r, ""); 233 | 234 | rope_insert(r, 0, (uint8_t *)"hi there"); 235 | rope_del(r, 3, 10); 236 | check(r, "hi "); 237 | 238 | test(rope_char_count(r) == 3); 239 | 240 | rope_free(r); 241 | } 242 | 243 | static void test_wchar() { 244 | #if ROPE_WCHAR 245 | rope *r = rope_new_with_utf8((uint8_t *)"𐆔𐆚𐆔"); 246 | test(rope_wchar_count(r) == 6); 247 | 248 | size_t len; 249 | size_t pos = rope_del_at_wchar(r, 2, 2, &len); 250 | check(r, "𐆔𐆔"); 251 | test(pos == 1); 252 | test(len == 1); 253 | 254 | pos = rope_insert_at_wchar(r, 2, (uint8_t *)"abcde"); 255 | check(r, "𐆔abcde𐆔"); 256 | test(pos == 1); 257 | 258 | pos = rope_insert_at_wchar(r, 5, (uint8_t *)"𐆚"); 259 | check(r, "𐆔abc𐆚de𐆔"); 260 | test(pos == 4); 261 | 262 | rope_free(r); 263 | #else 264 | printf("Skipping wchar tests - wchar conversion support disabled.\n"); 265 | #endif 266 | } 267 | 268 | static void test_really_long_ascii_string() { 269 | size_t len = 2000; 270 | uint8_t *str = malloc(len + 1); 271 | random_ascii_string(str, len + 1); 272 | 273 | rope *r = rope_new_with_utf8((uint8_t *)str); 274 | test(rope_char_count(r) == len); 275 | check(r, (char *)str); 276 | 277 
| // Iterate through all the characters using the loop macros and make sure it all works. 278 | size_t pos = 0; 279 | ROPE_FOREACH(r, n) { 280 | test(memcmp(rope_node_data(n), &str[pos], rope_node_num_bytes(n)) == 0); 281 | pos += rope_node_num_bytes(n); 282 | } 283 | test(pos == r->num_bytes); 284 | 285 | // Delete everything but the first and last characters. 286 | rope_del(r, 1, len - 2); 287 | assert(r->num_bytes == 2); 288 | assert(r->num_chars == 2); 289 | char *contents = (char *)rope_create_cstr(r); 290 | _rope_check(r); 291 | test(contents[0] == str[0]); 292 | test(contents[1] == str[len - 1]); 293 | free(contents); 294 | 295 | rope_free(r); 296 | } 297 | 298 | static int alloced_regions = 0; 299 | 300 | void *_alloc(size_t size) { 301 | alloced_regions++; 302 | return malloc(size); 303 | } 304 | 305 | void _free(void *mem) { 306 | alloced_regions--; 307 | free(mem); 308 | } 309 | 310 | static void test_custom_allocator() { 311 | // Its really hard to test that malloc is never called, but I can make sure 312 | // custom frees match custom allocs. 313 | rope *r = rope_new2(_alloc, realloc, _free); 314 | for (int i = 0; i < 100; i++) { 315 | rope_insert(r, random() % (rope_char_count(r) + 1), 316 | (uint8_t *)"Whoa super happy fun times!\n"); 317 | } 318 | 319 | rope_free(r); 320 | 321 | test(alloced_regions == 0); 322 | } 323 | 324 | static void test_copy() { 325 | // Copy an empty string. 326 | rope *r1 = rope_new(); 327 | rope *r2 = rope_copy(r1); 328 | check(r2, ""); 329 | rope_free(r2); 330 | 331 | // Insert some text (less than one node worth) 332 | rope_insert(r1, 0, (uint8_t *)"Eureka!"); 333 | r2 = rope_copy(r1); 334 | check(r2, "Eureka!"); 335 | 336 | rope_free(r1); 337 | rope_free(r2); 338 | } 339 | 340 | static void test_random_edits() { 341 | // This string should always have the same content as the rope. 
342 | _string *str = str_create(); 343 | rope *r = rope_new(); 344 | 345 | const size_t max_stringsize = 1000; 346 | uint8_t strbuffer[max_stringsize + 1]; 347 | 348 | for (int i = 0; i < 1000; i++) { 349 | // First, some sanity checks. 350 | check(r, (char *)str->mem); 351 | 352 | rope *r2 = rope_copy(r); 353 | check(r2, (char *)str->mem); 354 | rope_free(r2); 355 | 356 | // printf("String contains '%s'\n", str->mem); 357 | test(rope_byte_count(r) == str->len); 358 | size_t len = strlen_utf8(str->mem); 359 | test(rope_char_count(r) == len); 360 | test(str_num_chars(str) == len); 361 | 362 | if (len == 0 || rand_float() < 0.5f) { 363 | // Insert. 364 | random_unicode_string(strbuffer, 1 + random() % max_stringsize); 365 | size_t pos = random() % (len + 1); 366 | 367 | // printf("inserting %s at %zd\n", strbuffer, pos); 368 | rope_insert(r, pos, strbuffer); 369 | str_insert(str, pos, strbuffer); 370 | } else { 371 | // Delete 372 | size_t pos = random() % len; 373 | 374 | size_t dellen = random() % 10; 375 | dellen = MIN(len - pos, dellen); 376 | 377 | // printf("deleting %zd chars at %zd\n", dellen, pos); 378 | rope_del(r, pos, dellen); 379 | str_del(str, pos, dellen); 380 | } 381 | } 382 | 383 | rope_free(r); 384 | str_destroy(str); 385 | } 386 | 387 | static void test_random_wchar_edits() { 388 | #if ROPE_WCHAR 389 | // This string should always have the same content as the rope. 390 | // Both are stored using UTF-8, but we'll make edits using the wchar functions. 
391 | _string *str = str_create(); 392 | rope *r = rope_new(); 393 | 394 | const size_t max_stringsize = 1000; 395 | uint8_t strbuffer[max_stringsize + 1]; 396 | 397 | for (int i = 0; i < 1000; i++) { 398 | check(r, (char *)str->mem); 399 | 400 | // printf("String contains '%s'\n", str->mem); 401 | test(rope_byte_count(r) == str->len); 402 | size_t len = strlen_utf8(str->mem); 403 | test(rope_char_count(r) == len); 404 | test(str_num_chars(str) == len); 405 | test(rope_wchar_count(r) == wchar_size_count(str->mem)); 406 | 407 | if (len == 0 || rand_float() < 0.5f) { 408 | // Insert. 409 | random_unicode_string(strbuffer, 1 + random() % max_stringsize); 410 | size_t pos = random() % (len + 1); 411 | 412 | // We need to convert pos to the wchar offset. There's a private function in rope.c for this 413 | // but ... 414 | size_t wchar_pos = count_wchars_in_utf8(str->mem, pos); 415 | 416 | // printf("inserting '%s' at %zd\n", strbuffer, pos); 417 | rope_insert_at_wchar(r, wchar_pos, strbuffer); 418 | str_insert(str, pos, strbuffer); 419 | } else { 420 | // Delete 421 | size_t pos = random() % len; 422 | 423 | size_t dellen = random() % 10; 424 | dellen = MIN(len - pos, dellen); 425 | 426 | size_t wchar_pos = count_wchars_in_utf8(str->mem, pos); 427 | size_t wchar_len = count_wchars_in_utf8(str->mem, pos + dellen) - wchar_pos; 428 | // printf("deleting %zd (%zd) chars at %zd (%zd)\n", dellen, wchar_len, pos, wchar_pos); 429 | rope_del_at_wchar(r, wchar_pos, wchar_len, NULL); 430 | str_del(str, pos, dellen); 431 | } 432 | } 433 | 434 | rope_free(r); 435 | str_destroy(str); 436 | #endif 437 | } 438 | 439 | 440 | void test_all() { 441 | printf("Running tests...\n"); 442 | test_empty_rope_has_no_content(); 443 | test_insert_at_location(); 444 | test_new_string_has_content(); 445 | test_invalid_utf8_rejected(); 446 | test_delete_at_location(); 447 | test_delete_past_end_of_string(); 448 | test_wchar(); 449 | test_really_long_ascii_string(); 450 | test_custom_allocator(); 451 | 
test_copy(); 452 | printf("Normal tests passed. Running randomizers...\n"); 453 | test_random_edits(); 454 | test_random_wchar_edits(); 455 | printf("Done!\n"); 456 | } 457 | 458 | int main(int argc, const char * argv[]) { 459 | test_all(); 460 | 461 | if (argc > 1 && strcmp(argv[1], "-b") == 0) { 462 | benchmark(); 463 | } 464 | 465 | return 0; 466 | } 467 | 468 | -------------------------------------------------------------------------------- /test/tests.h: -------------------------------------------------------------------------------- 1 | #ifndef librope_test_h 2 | #define librope_test_h 3 | 4 | #include 5 | 6 | #define MIN(x,y) ((x) > (y) ? (y) : (x)) 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | 12 | void benchmark(); 13 | 14 | // len is approximate. Might use fewer bytes than that. 15 | void random_unicode_string(uint8_t *buffer, size_t len); 16 | 17 | // len includes \0. 18 | void random_ascii_string(uint8_t *buffer, size_t len); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | --------------------------------------------------------------------------------