├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── EXAMPLES.md ├── EXAMPLES.objc.md ├── LICENSE ├── README.md ├── build-mac ├── kvdb.xcodeproj │ └── project.pbxproj └── kvdbtest │ └── main.c ├── objc ├── KVDatabase.h ├── KVDatabase.m ├── KVIndexer.h ├── KVIndexer.m ├── KVOrderedDatabase.h └── KVOrderedDatabase.m └── src ├── CMakeLists.txt ├── ConvertUTF.c ├── ConvertUTF.h ├── ConvertUTFNamespace.h ├── kvassert.c ├── kvassert.h ├── kvblock.c ├── kvblock.h ├── kvbloom.h ├── kvdb.c ├── kvdb.h ├── kvdbo.cpp ├── kvdbo.h ├── kvendian.h ├── kvmurmurhash.h ├── kvpaddingutils.h ├── kvprime.c ├── kvprime.h ├── kvserialization.cpp ├── kvserialization.h ├── kvtable.c ├── kvtable.h ├── kvtypes.h ├── kvunicode.c ├── kvunicode.h ├── sfts.cpp └── sfts.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | 4 | # Libraries 5 | *.lib 6 | *.a 7 | 8 | # Shared objects (inc. Windows DLLs) 9 | *.dll 10 | *.so 11 | *.so.* 12 | *.dylib 13 | 14 | # Executables 15 | *.exe 16 | *.out 17 | *.app 18 | 19 | .DS_Store 20 | xcuserdata 21 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/lz4"] 2 | path = third-party/lz4 3 | url = https://github.com/Cyan4973/lz4 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.6) 2 | project (kvdb) 3 | 4 | add_subdirectory (src) 5 | -------------------------------------------------------------------------------- /EXAMPLES.md: -------------------------------------------------------------------------------- 1 | kvdb 2 | ==== 3 | 4 | A Lightweight Key-Value Database. 
5 | 6 | - Use only one file 7 | - Low memory usage 8 | - Good performance 9 | 10 | Example: 11 | 12 | ```c 13 | #include 14 | #include 15 | 16 | int main(int argc, char ** argv) 17 | { 18 | struct kvdb * db; 19 | db = kvdb_new("kvdb-test.kvdb"); 20 | kvdb_open(db); 21 | 22 | int r; 23 | 24 | char * key = "some key"; 25 | char * value = "some value"; 26 | r = kvdb_set(db, key, strlen(key), value, strlen(value)); 27 | switch (r) { 28 | case 0: 29 | fprintf(stderr, "value stored\n"); 30 | break; 31 | 32 | case -2: 33 | fprintf(stderr, "I/O error\n"); 34 | break; 35 | } 36 | 37 | key = "some other key"; 38 | char * read_value = NULL; 39 | size_t read_value_size = 0; 40 | r = kvdb_get(db, key, strlen(key), &read_value, &read_value_size); 41 | switch (r) { 42 | case 0: 43 | fprintf(stderr, "value: %.*s\n", (int) read_value_size, read_value); 44 | free(read_value); 45 | break; 46 | 47 | case -1: 48 | fprintf(stderr, "not found\n"); 49 | break; 50 | 51 | case -2: 52 | fprintf(stderr, "I/O error\n"); 53 | break; 54 | } 55 | 56 | key = "yet another key"; 57 | r = kvdb_delete(db, key, strlen(key)); 58 | switch (r) { 59 | case 0: 60 | fprintf(stderr, "value removed\n"); 61 | break; 62 | 63 | case -2: 64 | fprintf(stderr, "I/O error\n"); 65 | break; 66 | } 67 | 68 | kvdb_close(db); 69 | kvdb_free(db); 70 | exit(EXIT_SUCCESS); 71 | } 72 | ``` 73 | 74 | kvdbo 75 | ===== 76 | 77 | A Lightweight ordered Key-Value Database. 
78 | 79 | - Use only one file 80 | - Low memory usage 81 | - Good performance 82 | - Iterable 83 | 84 | Example: 85 | 86 | ```c 87 | #include 88 | #include 89 | 90 | int main(int argc, char ** argv) 91 | { 92 | struct kvdbo * db; 93 | db = kvdbo_new("kvdb-test.kvdbo"); 94 | kvdbo_open(db); 95 | 96 | int r; 97 | 98 | char * key = "some key"; 99 | char * value = "some value"; 100 | r = kvdbo_set(db, key, strlen(key), value, strlen(value)); 101 | switch (r) { 102 | case 0: 103 | fprintf(stderr, "value stored\n"); 104 | break; 105 | 106 | case -2: 107 | fprintf(stderr, "I/O error\n"); 108 | break; 109 | } 110 | 111 | key = "some other key"; 112 | char * read_value = NULL; 113 | size_t read_value_size = 0; 114 | r = kvdbo_get(db, key, strlen(key), &read_value, &read_value_size); 115 | switch (r) { 116 | case 0: 117 | fprintf(stderr, "value: %.*s\n", (int) read_value_size, read_value); 118 | free(read_value); 119 | break; 120 | 121 | case -1: 122 | fprintf(stderr, "not found\n"); 123 | break; 124 | 125 | case -2: 126 | fprintf(stderr, "I/O error\n"); 127 | break; 128 | } 129 | 130 | key = "yet another key"; 131 | r = kvdbo_delete(db, key, strlen(key)); 132 | switch (r) { 133 | case 0: 134 | fprintf(stderr, "value removed\n"); 135 | break; 136 | 137 | case -2: 138 | fprintf(stderr, "I/O error\n"); 139 | break; 140 | } 141 | 142 | struct kvdbo_iterator * iterator = kvdbo_iterator_new(db); 143 | kvdbo_iterator_seek_first(iterator); 144 | while (kvdbo_iterator_is_valid(iterator)) { 145 | const char * key; 146 | size_t size; 147 | kvdbo_iterator_get_key(iterator, &key, &size); 148 | printf("key: %.*s\n", (int) size, key); kvdbo_iterator_next(iterator); 149 | } 150 | kvdbo_iterator_free(iterator); 151 | 152 | kvdbo_close(db); 153 | kvdbo_free(db); 154 | exit(EXIT_SUCCESS); 155 | } 156 | ``` 157 | 158 | sfts 159 | ==== 160 | 161 | A Simple Full Text Search. 
162 | 163 | - Use only one file 164 | - Low memory usage 165 | - Good performance 166 | - Unicode support 167 | 168 | Example: 169 | 170 | ```c 171 | #include 172 | #include 173 | 174 | int main(int argc, char ** argv) 175 | { 176 | sfts * indexer; 177 | int r; 178 | uint64_t * result; 179 | size_t result_count; 180 | 181 | // Opens the index. 182 | indexer = sfts_new(); 183 | sfts_open(indexer, "index.sfts"); 184 | 185 | // Adds data to the index. 186 | sfts_set(indexer, 0, "George Washington"); 187 | sfts_set(indexer, 1, "John Adams"); 188 | sfts_set(indexer, 2, "Thomas Jefferson"); 189 | sfts_set(indexer, 3, "George Michael"); 190 | sfts_set(indexer, 4, "George Méliès"); 191 | 192 | // Search "geor". 193 | printf("searching geor\n"); 194 | sfts_search(indexer, "geor", sfts_search_kind_prefix, &result, &result_count); 195 | for(size_t i = 0 ; i < result_count ; i ++) { 196 | printf("found: %llu\n", (unsigned long long) result[i]); 197 | } 198 | // returns 0, 3 and 4. 199 | free(result); 200 | 201 | // Search "mel". 202 | printf("searching mel\n"); 203 | sfts_search(indexer, "mel", sfts_search_kind_prefix, &result, &result_count); 204 | for(size_t i = 0 ; i < result_count ; i ++) { 205 | printf("found: %llu\n", (unsigned long long) result[i]); 206 | } 207 | // returns 4. 208 | free(result); 209 | 210 | sfts_close(indexer); 211 | sfts_free(indexer); 212 | } 213 | ``` 214 | -------------------------------------------------------------------------------- /EXAMPLES.objc.md: -------------------------------------------------------------------------------- 1 | KVDatabase 2 | ========== 3 | 4 | A Lightweight Key-Value Database. 
5 | 6 | - Use only one file 7 | - Low memory usage 8 | - Good performance 9 | 10 | Example: 11 | 12 | ```objc 13 | #include 14 | #include 15 | 16 | int main(int argc, char ** argv) 17 | { 18 | KVDatabase * db; 19 | db = [[KVDatabase alloc] initWithPath:@"kvdb-test.kvdb"]; 20 | [db open]; 21 | 22 | [db setData:[NSData dataWithBytes:"some value" length:10] forKey:@"some key"]; 23 | NSData * data = [db dataForKey:@"some other key"]; 24 | NSLog(@"value: %@", data); 25 | 26 | [db removeDataForKey:@"yet another key"]; 27 | 28 | [db close]; 29 | exit(EXIT_SUCCESS); 30 | } 31 | ``` 32 | 33 | KVOrderedDatabase 34 | ================= 35 | 36 | A Lightweight ordered Key-Value Database. 37 | 38 | - Use only one file 39 | - Low memory usage 40 | - Good performance 41 | - Iterable 42 | 43 | Example: 44 | 45 | ```objc 46 | #include 47 | #include 48 | 49 | int main(int argc, char ** argv) 50 | { 51 | KVOrderedDatabase * db; 52 | db = [[KVOrderedDatabase alloc] initWithPath:@"kvdb-test.kvdb"]; 53 | [db open]; 54 | 55 | [db setData:[NSData dataWithBytes:"some value" length:10] forKey:@"some key"]; 56 | NSData * data = [db dataForKey:@"some other key"]; 57 | NSLog(@"value: %@", data); 58 | 59 | [db removeDataForKey:@"yet another key"]; 60 | 61 | KVOrderedDatabaseIterator * iterator = [db keyIterator]; 62 | [iterator seekToFirstKey]; 63 | while ([iterator isValid]) { 64 | NSLog(@"key: %@", [iterator currentKey]); 65 | [iterator next]; 66 | } 67 | 68 | [db close]; 69 | exit(EXIT_SUCCESS); 70 | } 71 | ``` 72 | 73 | KVIndexer 74 | ========= 75 | 76 | A Simple Full Text Search. 77 | 78 | - Use only one file 79 | - Low memory usage 80 | - Good performance 81 | - Unicode support 82 | 83 | Example: 84 | 85 | ```objc 86 | #include 87 | #include 88 | 89 | int main(int argc, char ** argv) 90 | { 91 | KVIndexer * indexer; 92 | 93 | // Opens the index. 94 | indexer = [[KVIndexer alloc] initWithPath:@"index.sfts"]; 95 | 96 | // Adds data to the index. 
97 | [indexer setString:@"George Washington" forDocID:0]; 98 | [indexer setString:@"John Adams" forDocID:1]; 99 | [indexer setString:@"Thomas Jefferson" forDocID:2]; 100 | [indexer setString:@"George Michael" forDocID:3]; 101 | [indexer setString:@"George Méliès" forDocID:4]; 102 | 103 | // Search "geor". 104 | NSLog(@"searching geor"); 105 | NSArray * result = [indexer search:@"geor" kind:KVIndexerSearchKindPrefix]; 106 | NSLog(@"found: %@", result); 107 | 108 | // Search "mel". 109 | NSLog(@"searching mel"); 110 | result = [indexer search:@"mel" kind:KVIndexerSearchKindPrefix]; 111 | NSLog(@"found: %@", result); 112 | 113 | [indexer close]; 114 | } 115 | ``` 116 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | KVDB 2 | 3 | Copyright (C) 2001 - 2013 - Hoà V. Dinh 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 3. Neither the name of the KVDB project nor the names of its 15 | contributors may be used to endorse or promote products derived 16 | from this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 | SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | kvdb 2 | ==== 3 | 4 | This library implements: 5 | - a Key Value Store, 6 | - an ordered Key Value Store, 7 | - a Full Text Search Index. 8 | 9 | It targets embedded platforms where there are memory, disk and file descriptors constraints. 10 | The API are available in C and Objective-C. 11 | 12 | kvdb 13 | ==== 14 | 15 | A Key-Value Database. 16 | 17 | - Use only one file 18 | - Low memory usage 19 | - Good performance 20 | 21 | kvdbo 22 | ===== 23 | 24 | An ordered Key-Value Database. 25 | 26 | - Use only one file 27 | - Low memory usage 28 | - Good performance 29 | - Keys can be iterated in lexicographical order 30 | 31 | sfts 32 | ==== 33 | 34 | A Simple Full Text Search. 35 | 36 | - Use only one file 37 | - Low memory usage 38 | - Good performance 39 | - Unicode support 40 | 41 | Examples 42 | ======== 43 | 44 | - [Examples for C](EXAMPLES.md) 45 | - [Examples for Objective-C](EXAMPLES.objc.md) 46 | -------------------------------------------------------------------------------- /build-mac/kvdb.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 
2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 46; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | BD1E7C841AAA47DD0030673D /* kvdbo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BD1E7C821AAA47DD0030673D /* kvdbo.cpp */; }; 11 | BD520F301ABAB24E00681B8B /* kvdbo.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BD1E7C831AAA47DD0030673D /* kvdbo.h */; }; 12 | BD520F391ABB548D00681B8B /* ConvertUTF.c in Sources */ = {isa = PBXBuildFile; fileRef = BD520F311ABB548D00681B8B /* ConvertUTF.c */; }; 13 | BD520F3A1ABB548D00681B8B /* kvserialization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BD520F331ABB548D00681B8B /* kvserialization.cpp */; }; 14 | BD520F3B1ABB548D00681B8B /* kvunicode.c in Sources */ = {isa = PBXBuildFile; fileRef = BD520F351ABB548D00681B8B /* kvunicode.c */; }; 15 | BD520F3C1ABB548D00681B8B /* sfts.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BD520F371ABB548D00681B8B /* sfts.cpp */; }; 16 | BDB104621ABE82B000FD6FF6 /* KVIndexer.m in Sources */ = {isa = PBXBuildFile; fileRef = BDB104611ABE82B000FD6FF6 /* KVIndexer.m */; }; 17 | BDB104691ABE82CB00FD6FF6 /* KVDatabase.m in Sources */ = {isa = PBXBuildFile; fileRef = BDB104681ABE82CB00FD6FF6 /* KVDatabase.m */; }; 18 | BDB1046C1ABE82D900FD6FF6 /* KVOrderedDatabase.m in Sources */ = {isa = PBXBuildFile; fileRef = BDB1046B1ABE82D900FD6FF6 /* KVOrderedDatabase.m */; }; 19 | BDB104841AC4D55E00FD6FF6 /* lz4.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB104791AC4D55E00FD6FF6 /* lz4.c */; }; 20 | BDB104851AC4D55E00FD6FF6 /* lz4frame.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB1047B1AC4D55E00FD6FF6 /* lz4frame.c */; }; 21 | BDB104861AC4D55E00FD6FF6 /* lz4hc.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB1047E1AC4D55E00FD6FF6 /* lz4hc.c */; }; 22 | BDB104891AC4D55E00FD6FF6 /* xxhash.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB104821AC4D55E00FD6FF6 /* xxhash.c */; }; 23 | C618377C1763F6B8009E00E4 /* kvdb.h in CopyFiles */ = {isa = 
PBXBuildFile; fileRef = C66823611763C472000C603C /* kvdb.h */; }; 24 | C618377F1763F6CC009E00E4 /* kvdb.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = C66823611763C472000C603C /* kvdb.h */; }; 25 | C668236A1763C472000C603C /* kvassert.c in Sources */ = {isa = PBXBuildFile; fileRef = C668235B1763C472000C603C /* kvassert.c */; }; 26 | C668236C1763C472000C603C /* kvblock.c in Sources */ = {isa = PBXBuildFile; fileRef = C668235D1763C472000C603C /* kvblock.c */; }; 27 | C668236F1763C472000C603C /* kvdb.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823601763C472000C603C /* kvdb.c */; }; 28 | C66823741763C472000C603C /* kvprime.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823651763C472000C603C /* kvprime.c */; }; 29 | C66823761763C472000C603C /* kvtable.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823671763C472000C603C /* kvtable.c */; }; 30 | C66823821763C48F000C603C /* main.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823811763C48F000C603C /* main.c */; }; 31 | C66823881763C4D6000C603C /* libkvdb.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C66823531763C246000C603C /* libkvdb.a */; }; 32 | C668239B1763EA77000C603C /* kvassert.c in Sources */ = {isa = PBXBuildFile; fileRef = C668235B1763C472000C603C /* kvassert.c */; }; 33 | C668239D1763EA77000C603C /* kvblock.c in Sources */ = {isa = PBXBuildFile; fileRef = C668235D1763C472000C603C /* kvblock.c */; }; 34 | C66823A01763EA77000C603C /* kvdb.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823601763C472000C603C /* kvdb.c */; }; 35 | C66823A51763EA77000C603C /* kvprime.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823651763C472000C603C /* kvprime.c */; }; 36 | C66823A71763EA77000C603C /* kvtable.c in Sources */ = {isa = PBXBuildFile; fileRef = C66823671763C472000C603C /* kvtable.c */; }; 37 | C698FAF01AC66D2F00501892 /* KVIndexer.m in Sources */ = {isa = PBXBuildFile; fileRef = BDB104611ABE82B000FD6FF6 /* KVIndexer.m */; }; 38 | C698FAF11AC66D3300501892 /* KVDatabase.m in 
Sources */ = {isa = PBXBuildFile; fileRef = BDB104681ABE82CB00FD6FF6 /* KVDatabase.m */; }; 39 | C698FAF21AC66D3600501892 /* KVOrderedDatabase.m in Sources */ = {isa = PBXBuildFile; fileRef = BDB1046B1ABE82D900FD6FF6 /* KVOrderedDatabase.m */; }; 40 | C698FAF31AC66D4200501892 /* lz4.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB104791AC4D55E00FD6FF6 /* lz4.c */; }; 41 | C698FAF41AC66D4500501892 /* lz4frame.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB1047B1AC4D55E00FD6FF6 /* lz4frame.c */; }; 42 | C698FAF51AC66D4D00501892 /* lz4hc.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB1047E1AC4D55E00FD6FF6 /* lz4hc.c */; }; 43 | C698FAF61AC66D5100501892 /* xxhash.c in Sources */ = {isa = PBXBuildFile; fileRef = BDB104821AC4D55E00FD6FF6 /* xxhash.c */; }; 44 | C698FAF71AC66D5900501892 /* ConvertUTF.c in Sources */ = {isa = PBXBuildFile; fileRef = BD520F311ABB548D00681B8B /* ConvertUTF.c */; }; 45 | C698FAF81AC66D6200501892 /* kvserialization.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BD520F331ABB548D00681B8B /* kvserialization.cpp */; }; 46 | C698FAF91AC66D6A00501892 /* kvunicode.c in Sources */ = {isa = PBXBuildFile; fileRef = BD520F351ABB548D00681B8B /* kvunicode.c */; }; 47 | C698FAFA1AC66D6D00501892 /* sfts.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BD520F371ABB548D00681B8B /* sfts.cpp */; }; 48 | C698FAFB1AC66D7200501892 /* kvdbo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = BD1E7C821AAA47DD0030673D /* kvdbo.cpp */; }; 49 | C6B7F6671AC66DD700444CFB /* sfts.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BD520F381ABB548D00681B8B /* sfts.h */; }; 50 | C6B7F6681AC66DDF00444CFB /* KVIndexer.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BDB104601ABE82B000FD6FF6 /* KVIndexer.h */; }; 51 | C6B7F66A1AC66DE900444CFB /* KVOrderedDatabase.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BDB1046A1ABE82D900FD6FF6 /* KVOrderedDatabase.h */; }; 52 | C6B7F66B1AC66DEF00444CFB /* KVDatabase.h in CopyFiles */ = {isa = PBXBuildFile; fileRef 
= BDB104671ABE82CB00FD6FF6 /* KVDatabase.h */; }; 53 | C6B7F66C1AC66E0700444CFB /* KVIndexer.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BDB104601ABE82B000FD6FF6 /* KVIndexer.h */; }; 54 | C6B7F66D1AC66E0900444CFB /* KVDatabase.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BDB104671ABE82CB00FD6FF6 /* KVDatabase.h */; }; 55 | C6B7F66E1AC66E0C00444CFB /* KVOrderedDatabase.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BDB1046A1ABE82D900FD6FF6 /* KVOrderedDatabase.h */; }; 56 | C6B7F66F1AC66E1100444CFB /* sfts.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BD520F381ABB548D00681B8B /* sfts.h */; }; 57 | C6B7F6701AC66E1500444CFB /* kvdbo.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = BD1E7C831AAA47DD0030673D /* kvdbo.h */; }; 58 | /* End PBXBuildFile section */ 59 | 60 | /* Begin PBXCopyFilesBuildPhase section */ 61 | C618377B1763F6AE009E00E4 /* CopyFiles */ = { 62 | isa = PBXCopyFilesBuildPhase; 63 | buildActionMask = 2147483647; 64 | dstPath = include/kvdb; 65 | dstSubfolderSpec = 16; 66 | files = ( 67 | C6B7F66C1AC66E0700444CFB /* KVIndexer.h in CopyFiles */, 68 | C6B7F66D1AC66E0900444CFB /* KVDatabase.h in CopyFiles */, 69 | C6B7F66E1AC66E0C00444CFB /* KVOrderedDatabase.h in CopyFiles */, 70 | C618377C1763F6B8009E00E4 /* kvdb.h in CopyFiles */, 71 | C6B7F66F1AC66E1100444CFB /* sfts.h in CopyFiles */, 72 | C6B7F6701AC66E1500444CFB /* kvdbo.h in CopyFiles */, 73 | ); 74 | runOnlyForDeploymentPostprocessing = 0; 75 | }; 76 | C618377E1763F6C3009E00E4 /* CopyFiles */ = { 77 | isa = PBXCopyFilesBuildPhase; 78 | buildActionMask = 2147483647; 79 | dstPath = include/kvdb; 80 | dstSubfolderSpec = 16; 81 | files = ( 82 | C618377F1763F6CC009E00E4 /* kvdb.h in CopyFiles */, 83 | BD520F301ABAB24E00681B8B /* kvdbo.h in CopyFiles */, 84 | C6B7F6671AC66DD700444CFB /* sfts.h in CopyFiles */, 85 | C6B7F6681AC66DDF00444CFB /* KVIndexer.h in CopyFiles */, 86 | C6B7F66A1AC66DE900444CFB /* KVOrderedDatabase.h in CopyFiles */, 87 | C6B7F66B1AC66DEF00444CFB /* 
KVDatabase.h in CopyFiles */, 88 | ); 89 | runOnlyForDeploymentPostprocessing = 0; 90 | }; 91 | C668237D1763C48F000C603C /* CopyFiles */ = { 92 | isa = PBXCopyFilesBuildPhase; 93 | buildActionMask = 2147483647; 94 | dstPath = /usr/share/man/man1/; 95 | dstSubfolderSpec = 0; 96 | files = ( 97 | ); 98 | runOnlyForDeploymentPostprocessing = 1; 99 | }; 100 | /* End PBXCopyFilesBuildPhase section */ 101 | 102 | /* Begin PBXFileReference section */ 103 | BD1E7C821AAA47DD0030673D /* kvdbo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kvdbo.cpp; sourceTree = ""; }; 104 | BD1E7C831AAA47DD0030673D /* kvdbo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvdbo.h; sourceTree = ""; }; 105 | BD520F311ABB548D00681B8B /* ConvertUTF.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ConvertUTF.c; sourceTree = ""; }; 106 | BD520F321ABB548D00681B8B /* ConvertUTF.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvertUTF.h; sourceTree = ""; }; 107 | BD520F331ABB548D00681B8B /* kvserialization.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kvserialization.cpp; sourceTree = ""; }; 108 | BD520F341ABB548D00681B8B /* kvserialization.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvserialization.h; sourceTree = ""; }; 109 | BD520F351ABB548D00681B8B /* kvunicode.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kvunicode.c; sourceTree = ""; }; 110 | BD520F361ABB548D00681B8B /* kvunicode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvunicode.h; sourceTree = ""; }; 111 | BD520F371ABB548D00681B8B /* sfts.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sfts.cpp; sourceTree = ""; 
}; 112 | BD520F381ABB548D00681B8B /* sfts.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sfts.h; sourceTree = ""; }; 113 | BD520F3D1ABBE79C00681B8B /* ConvertUTFNamespace.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ConvertUTFNamespace.h; sourceTree = ""; }; 114 | BDB104601ABE82B000FD6FF6 /* KVIndexer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KVIndexer.h; sourceTree = ""; }; 115 | BDB104611ABE82B000FD6FF6 /* KVIndexer.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KVIndexer.m; sourceTree = ""; }; 116 | BDB104671ABE82CB00FD6FF6 /* KVDatabase.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KVDatabase.h; sourceTree = ""; }; 117 | BDB104681ABE82CB00FD6FF6 /* KVDatabase.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KVDatabase.m; sourceTree = ""; }; 118 | BDB1046A1ABE82D900FD6FF6 /* KVOrderedDatabase.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KVOrderedDatabase.h; sourceTree = ""; }; 119 | BDB1046B1ABE82D900FD6FF6 /* KVOrderedDatabase.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KVOrderedDatabase.m; sourceTree = ""; }; 120 | BDB104791AC4D55E00FD6FF6 /* lz4.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = lz4.c; sourceTree = ""; }; 121 | BDB1047A1AC4D55E00FD6FF6 /* lz4.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lz4.h; sourceTree = ""; }; 122 | BDB1047B1AC4D55E00FD6FF6 /* lz4frame.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = lz4frame.c; sourceTree = ""; }; 123 | BDB1047C1AC4D55E00FD6FF6 /* lz4frame.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.c.h; path = lz4frame.h; sourceTree = ""; }; 124 | BDB1047D1AC4D55E00FD6FF6 /* lz4frame_static.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lz4frame_static.h; sourceTree = ""; }; 125 | BDB1047E1AC4D55E00FD6FF6 /* lz4hc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = lz4hc.c; sourceTree = ""; }; 126 | BDB1047F1AC4D55E00FD6FF6 /* lz4hc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = lz4hc.h; sourceTree = ""; }; 127 | BDB104821AC4D55E00FD6FF6 /* xxhash.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = xxhash.c; sourceTree = ""; }; 128 | BDB104831AC4D55E00FD6FF6 /* xxhash.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = xxhash.h; sourceTree = ""; }; 129 | C66823531763C246000C603C /* libkvdb.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libkvdb.a; sourceTree = BUILT_PRODUCTS_DIR; }; 130 | C668235B1763C472000C603C /* kvassert.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kvassert.c; sourceTree = ""; }; 131 | C668235C1763C472000C603C /* kvassert.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvassert.h; sourceTree = ""; }; 132 | C668235D1763C472000C603C /* kvblock.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kvblock.c; sourceTree = ""; }; 133 | C668235E1763C472000C603C /* kvblock.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvblock.h; sourceTree = ""; }; 134 | C668235F1763C472000C603C /* kvbloom.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvbloom.h; sourceTree = ""; }; 135 | C66823601763C472000C603C /* kvdb.c */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.c.c; path = kvdb.c; sourceTree = ""; }; 136 | C66823611763C472000C603C /* kvdb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvdb.h; sourceTree = ""; }; 137 | C66823621763C472000C603C /* kvendian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvendian.h; sourceTree = ""; }; 138 | C66823631763C472000C603C /* kvmurmurhash.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvmurmurhash.h; sourceTree = ""; }; 139 | C66823641763C472000C603C /* kvpaddingutils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvpaddingutils.h; sourceTree = ""; }; 140 | C66823651763C472000C603C /* kvprime.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kvprime.c; sourceTree = ""; }; 141 | C66823661763C472000C603C /* kvprime.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvprime.h; sourceTree = ""; }; 142 | C66823671763C472000C603C /* kvtable.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kvtable.c; sourceTree = ""; }; 143 | C66823681763C472000C603C /* kvtable.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvtable.h; sourceTree = ""; }; 144 | C66823691763C472000C603C /* kvtypes.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kvtypes.h; sourceTree = ""; }; 145 | C668237F1763C48F000C603C /* kvdbtest */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = kvdbtest; sourceTree = BUILT_PRODUCTS_DIR; }; 146 | C66823811763C48F000C603C /* main.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = main.c; sourceTree = ""; }; 147 | C668238D1763EA47000C603C /* libkvdb-ios.a */ = {isa = PBXFileReference; 
explicitFileType = archive.ar; includeInIndex = 0; path = "libkvdb-ios.a"; sourceTree = BUILT_PRODUCTS_DIR; }; 148 | C668238F1763EA47000C603C /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; 149 | /* End PBXFileReference section */ 150 | 151 | /* Begin PBXFrameworksBuildPhase section */ 152 | C66823501763C246000C603C /* Frameworks */ = { 153 | isa = PBXFrameworksBuildPhase; 154 | buildActionMask = 2147483647; 155 | files = ( 156 | ); 157 | runOnlyForDeploymentPostprocessing = 0; 158 | }; 159 | C668237C1763C48F000C603C /* Frameworks */ = { 160 | isa = PBXFrameworksBuildPhase; 161 | buildActionMask = 2147483647; 162 | files = ( 163 | C66823881763C4D6000C603C /* libkvdb.a in Frameworks */, 164 | ); 165 | runOnlyForDeploymentPostprocessing = 0; 166 | }; 167 | C668238A1763EA47000C603C /* Frameworks */ = { 168 | isa = PBXFrameworksBuildPhase; 169 | buildActionMask = 2147483647; 170 | files = ( 171 | ); 172 | runOnlyForDeploymentPostprocessing = 0; 173 | }; 174 | /* End PBXFrameworksBuildPhase section */ 175 | 176 | /* Begin PBXGroup section */ 177 | BDB1045D1ABE825800FD6FF6 /* objc */ = { 178 | isa = PBXGroup; 179 | children = ( 180 | BDB104601ABE82B000FD6FF6 /* KVIndexer.h */, 181 | BDB104611ABE82B000FD6FF6 /* KVIndexer.m */, 182 | BDB104671ABE82CB00FD6FF6 /* KVDatabase.h */, 183 | BDB104681ABE82CB00FD6FF6 /* KVDatabase.m */, 184 | BDB1046A1ABE82D900FD6FF6 /* KVOrderedDatabase.h */, 185 | BDB1046B1ABE82D900FD6FF6 /* KVOrderedDatabase.m */, 186 | ); 187 | name = objc; 188 | path = ../objc; 189 | sourceTree = ""; 190 | }; 191 | BDB104761AC4D55E00FD6FF6 /* lz4 */ = { 192 | isa = PBXGroup; 193 | children = ( 194 | BDB104791AC4D55E00FD6FF6 /* lz4.c */, 195 | BDB1047A1AC4D55E00FD6FF6 /* lz4.h */, 196 | BDB1047B1AC4D55E00FD6FF6 /* lz4frame.c */, 197 | BDB1047C1AC4D55E00FD6FF6 /* lz4frame.h */, 198 | BDB1047D1AC4D55E00FD6FF6 /* 
lz4frame_static.h */, 199 | BDB1047E1AC4D55E00FD6FF6 /* lz4hc.c */, 200 | BDB1047F1AC4D55E00FD6FF6 /* lz4hc.h */, 201 | BDB104821AC4D55E00FD6FF6 /* xxhash.c */, 202 | BDB104831AC4D55E00FD6FF6 /* xxhash.h */, 203 | ); 204 | name = lz4; 205 | path = "../third-party/lz4/lib"; 206 | sourceTree = ""; 207 | }; 208 | C668234A1763C246000C603C = { 209 | isa = PBXGroup; 210 | children = ( 211 | BDB104761AC4D55E00FD6FF6 /* lz4 */, 212 | BDB1045D1ABE825800FD6FF6 /* objc */, 213 | C668235A1763C472000C603C /* src */, 214 | C66823801763C48F000C603C /* kvdbtest */, 215 | C668238E1763EA47000C603C /* Frameworks */, 216 | C66823541763C246000C603C /* Products */, 217 | ); 218 | sourceTree = ""; 219 | }; 220 | C66823541763C246000C603C /* Products */ = { 221 | isa = PBXGroup; 222 | children = ( 223 | C66823531763C246000C603C /* libkvdb.a */, 224 | C668237F1763C48F000C603C /* kvdbtest */, 225 | C668238D1763EA47000C603C /* libkvdb-ios.a */, 226 | ); 227 | name = Products; 228 | sourceTree = ""; 229 | }; 230 | C668235A1763C472000C603C /* src */ = { 231 | isa = PBXGroup; 232 | children = ( 233 | BD520F311ABB548D00681B8B /* ConvertUTF.c */, 234 | BD520F321ABB548D00681B8B /* ConvertUTF.h */, 235 | BD520F3D1ABBE79C00681B8B /* ConvertUTFNamespace.h */, 236 | BD520F331ABB548D00681B8B /* kvserialization.cpp */, 237 | BD520F341ABB548D00681B8B /* kvserialization.h */, 238 | BD520F351ABB548D00681B8B /* kvunicode.c */, 239 | BD520F361ABB548D00681B8B /* kvunicode.h */, 240 | BD520F371ABB548D00681B8B /* sfts.cpp */, 241 | BD520F381ABB548D00681B8B /* sfts.h */, 242 | BD1E7C821AAA47DD0030673D /* kvdbo.cpp */, 243 | BD1E7C831AAA47DD0030673D /* kvdbo.h */, 244 | C668235B1763C472000C603C /* kvassert.c */, 245 | C668235C1763C472000C603C /* kvassert.h */, 246 | C668235D1763C472000C603C /* kvblock.c */, 247 | C668235E1763C472000C603C /* kvblock.h */, 248 | C668235F1763C472000C603C /* kvbloom.h */, 249 | C66823601763C472000C603C /* kvdb.c */, 250 | C66823611763C472000C603C /* kvdb.h */, 251 | 
C66823621763C472000C603C /* kvendian.h */, 252 | C66823631763C472000C603C /* kvmurmurhash.h */, 253 | C66823641763C472000C603C /* kvpaddingutils.h */, 254 | C66823651763C472000C603C /* kvprime.c */, 255 | C66823661763C472000C603C /* kvprime.h */, 256 | C66823671763C472000C603C /* kvtable.c */, 257 | C66823681763C472000C603C /* kvtable.h */, 258 | C66823691763C472000C603C /* kvtypes.h */, 259 | ); 260 | name = src; 261 | path = ../src; 262 | sourceTree = ""; 263 | }; 264 | C66823801763C48F000C603C /* kvdbtest */ = { 265 | isa = PBXGroup; 266 | children = ( 267 | C66823811763C48F000C603C /* main.c */, 268 | ); 269 | path = kvdbtest; 270 | sourceTree = ""; 271 | }; 272 | C668238E1763EA47000C603C /* Frameworks */ = { 273 | isa = PBXGroup; 274 | children = ( 275 | C668238F1763EA47000C603C /* Foundation.framework */, 276 | ); 277 | name = Frameworks; 278 | sourceTree = ""; 279 | }; 280 | /* End PBXGroup section */ 281 | 282 | /* Begin PBXNativeTarget section */ 283 | C66823521763C246000C603C /* kvdb */ = { 284 | isa = PBXNativeTarget; 285 | buildConfigurationList = C66823571763C246000C603C /* Build configuration list for PBXNativeTarget "kvdb" */; 286 | buildPhases = ( 287 | C618377E1763F6C3009E00E4 /* CopyFiles */, 288 | C668234F1763C246000C603C /* Sources */, 289 | C66823501763C246000C603C /* Frameworks */, 290 | ); 291 | buildRules = ( 292 | ); 293 | dependencies = ( 294 | ); 295 | name = kvdb; 296 | productName = kvdb; 297 | productReference = C66823531763C246000C603C /* libkvdb.a */; 298 | productType = "com.apple.product-type.library.static"; 299 | }; 300 | C668237E1763C48F000C603C /* kvdbtest */ = { 301 | isa = PBXNativeTarget; 302 | buildConfigurationList = C66823851763C48F000C603C /* Build configuration list for PBXNativeTarget "kvdbtest" */; 303 | buildPhases = ( 304 | C668237B1763C48F000C603C /* Sources */, 305 | C668237C1763C48F000C603C /* Frameworks */, 306 | C668237D1763C48F000C603C /* CopyFiles */, 307 | ); 308 | buildRules = ( 309 | ); 310 | dependencies 
= ( 311 | ); 312 | name = kvdbtest; 313 | productName = kvdbtest; 314 | productReference = C668237F1763C48F000C603C /* kvdbtest */; 315 | productType = "com.apple.product-type.tool"; 316 | }; 317 | C668238C1763EA47000C603C /* kvdb-ios */ = { 318 | isa = PBXNativeTarget; 319 | buildConfigurationList = C66823981763EA47000C603C /* Build configuration list for PBXNativeTarget "kvdb-ios" */; 320 | buildPhases = ( 321 | C618377B1763F6AE009E00E4 /* CopyFiles */, 322 | C66823891763EA47000C603C /* Sources */, 323 | C668238A1763EA47000C603C /* Frameworks */, 324 | ); 325 | buildRules = ( 326 | ); 327 | dependencies = ( 328 | ); 329 | name = "kvdb-ios"; 330 | productName = "kvdb-ios"; 331 | productReference = C668238D1763EA47000C603C /* libkvdb-ios.a */; 332 | productType = "com.apple.product-type.library.static"; 333 | }; 334 | /* End PBXNativeTarget section */ 335 | 336 | /* Begin PBXProject section */ 337 | C668234B1763C246000C603C /* Project object */ = { 338 | isa = PBXProject; 339 | attributes = { 340 | LastUpgradeCheck = 0460; 341 | ORGANIZATIONNAME = etpan; 342 | }; 343 | buildConfigurationList = C668234E1763C246000C603C /* Build configuration list for PBXProject "kvdb" */; 344 | compatibilityVersion = "Xcode 3.2"; 345 | developmentRegion = English; 346 | hasScannedForEncodings = 0; 347 | knownRegions = ( 348 | en, 349 | ); 350 | mainGroup = C668234A1763C246000C603C; 351 | productRefGroup = C66823541763C246000C603C /* Products */; 352 | projectDirPath = ""; 353 | projectRoot = ""; 354 | targets = ( 355 | C66823521763C246000C603C /* kvdb */, 356 | C668237E1763C48F000C603C /* kvdbtest */, 357 | C668238C1763EA47000C603C /* kvdb-ios */, 358 | ); 359 | }; 360 | /* End PBXProject section */ 361 | 362 | /* Begin PBXSourcesBuildPhase section */ 363 | C668234F1763C246000C603C /* Sources */ = { 364 | isa = PBXSourcesBuildPhase; 365 | buildActionMask = 2147483647; 366 | files = ( 367 | BDB104841AC4D55E00FD6FF6 /* lz4.c in Sources */, 368 | BDB104691ABE82CB00FD6FF6 /* 
KVDatabase.m in Sources */, 369 | BD1E7C841AAA47DD0030673D /* kvdbo.cpp in Sources */, 370 | BD520F3B1ABB548D00681B8B /* kvunicode.c in Sources */, 371 | C668236A1763C472000C603C /* kvassert.c in Sources */, 372 | BD520F391ABB548D00681B8B /* ConvertUTF.c in Sources */, 373 | BDB104891AC4D55E00FD6FF6 /* xxhash.c in Sources */, 374 | BDB104851AC4D55E00FD6FF6 /* lz4frame.c in Sources */, 375 | BD520F3A1ABB548D00681B8B /* kvserialization.cpp in Sources */, 376 | BDB1046C1ABE82D900FD6FF6 /* KVOrderedDatabase.m in Sources */, 377 | BDB104621ABE82B000FD6FF6 /* KVIndexer.m in Sources */, 378 | C668236C1763C472000C603C /* kvblock.c in Sources */, 379 | C668236F1763C472000C603C /* kvdb.c in Sources */, 380 | C66823741763C472000C603C /* kvprime.c in Sources */, 381 | BDB104861AC4D55E00FD6FF6 /* lz4hc.c in Sources */, 382 | BD520F3C1ABB548D00681B8B /* sfts.cpp in Sources */, 383 | C66823761763C472000C603C /* kvtable.c in Sources */, 384 | ); 385 | runOnlyForDeploymentPostprocessing = 0; 386 | }; 387 | C668237B1763C48F000C603C /* Sources */ = { 388 | isa = PBXSourcesBuildPhase; 389 | buildActionMask = 2147483647; 390 | files = ( 391 | C66823821763C48F000C603C /* main.c in Sources */, 392 | ); 393 | runOnlyForDeploymentPostprocessing = 0; 394 | }; 395 | C66823891763EA47000C603C /* Sources */ = { 396 | isa = PBXSourcesBuildPhase; 397 | buildActionMask = 2147483647; 398 | files = ( 399 | C698FAF71AC66D5900501892 /* ConvertUTF.c in Sources */, 400 | C698FAF81AC66D6200501892 /* kvserialization.cpp in Sources */, 401 | C698FAF91AC66D6A00501892 /* kvunicode.c in Sources */, 402 | C698FAFA1AC66D6D00501892 /* sfts.cpp in Sources */, 403 | C698FAFB1AC66D7200501892 /* kvdbo.cpp in Sources */, 404 | C668239B1763EA77000C603C /* kvassert.c in Sources */, 405 | C668239D1763EA77000C603C /* kvblock.c in Sources */, 406 | C66823A01763EA77000C603C /* kvdb.c in Sources */, 407 | C66823A51763EA77000C603C /* kvprime.c in Sources */, 408 | C66823A71763EA77000C603C /* kvtable.c in Sources */, 409 | 
C698FAF01AC66D2F00501892 /* KVIndexer.m in Sources */, 410 | C698FAF11AC66D3300501892 /* KVDatabase.m in Sources */, 411 | C698FAF21AC66D3600501892 /* KVOrderedDatabase.m in Sources */, 412 | C698FAF31AC66D4200501892 /* lz4.c in Sources */, 413 | C698FAF41AC66D4500501892 /* lz4frame.c in Sources */, 414 | C698FAF51AC66D4D00501892 /* lz4hc.c in Sources */, 415 | C698FAF61AC66D5100501892 /* xxhash.c in Sources */, 416 | ); 417 | runOnlyForDeploymentPostprocessing = 0; 418 | }; 419 | /* End PBXSourcesBuildPhase section */ 420 | 421 | /* Begin XCBuildConfiguration section */ 422 | C66823551763C246000C603C /* Debug */ = { 423 | isa = XCBuildConfiguration; 424 | buildSettings = { 425 | ALWAYS_SEARCH_USER_PATHS = NO; 426 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 427 | CLANG_CXX_LIBRARY = "libc++"; 428 | CLANG_ENABLE_OBJC_ARC = YES; 429 | CLANG_WARN_CONSTANT_CONVERSION = YES; 430 | CLANG_WARN_EMPTY_BODY = YES; 431 | CLANG_WARN_ENUM_CONVERSION = YES; 432 | CLANG_WARN_INT_CONVERSION = YES; 433 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 434 | COPY_PHASE_STRIP = NO; 435 | GCC_C_LANGUAGE_STANDARD = gnu99; 436 | GCC_DYNAMIC_NO_PIC = NO; 437 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 438 | GCC_OPTIMIZATION_LEVEL = 0; 439 | GCC_PREPROCESSOR_DEFINITIONS = ( 440 | "DEBUG=1", 441 | "$(inherited)", 442 | ); 443 | GCC_SYMBOLS_PRIVATE_EXTERN = NO; 444 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 445 | GCC_WARN_ABOUT_RETURN_TYPE = YES; 446 | GCC_WARN_UNINITIALIZED_AUTOS = YES; 447 | GCC_WARN_UNUSED_VARIABLE = YES; 448 | HEADER_SEARCH_PATHS = ( 449 | "$(inherited)", 450 | /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include, 451 | "$(SRCROOT)/../third-party/lz4/lib", 452 | ); 453 | ONLY_ACTIVE_ARCH = YES; 454 | }; 455 | name = Debug; 456 | }; 457 | C66823561763C246000C603C /* Release */ = { 458 | isa = XCBuildConfiguration; 459 | buildSettings = { 460 | ALWAYS_SEARCH_USER_PATHS = NO; 461 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; 462 | CLANG_CXX_LIBRARY = 
"libc++"; 463 | CLANG_ENABLE_OBJC_ARC = YES; 464 | CLANG_WARN_CONSTANT_CONVERSION = YES; 465 | CLANG_WARN_EMPTY_BODY = YES; 466 | CLANG_WARN_ENUM_CONVERSION = YES; 467 | CLANG_WARN_INT_CONVERSION = YES; 468 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 469 | COPY_PHASE_STRIP = YES; 470 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 471 | GCC_C_LANGUAGE_STANDARD = gnu99; 472 | GCC_ENABLE_OBJC_EXCEPTIONS = YES; 473 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 474 | GCC_WARN_ABOUT_RETURN_TYPE = YES; 475 | GCC_WARN_UNINITIALIZED_AUTOS = YES; 476 | GCC_WARN_UNUSED_VARIABLE = YES; 477 | HEADER_SEARCH_PATHS = ( 478 | "$(inherited)", 479 | /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/include, 480 | "$(SRCROOT)/../third-party/lz4/lib", 481 | ); 482 | }; 483 | name = Release; 484 | }; 485 | C66823581763C246000C603C /* Debug */ = { 486 | isa = XCBuildConfiguration; 487 | buildSettings = { 488 | ARCHS = "$(ARCHS_STANDARD_64_BIT)"; 489 | EXECUTABLE_PREFIX = lib; 490 | MACOSX_DEPLOYMENT_TARGET = 10.8; 491 | PRODUCT_NAME = "$(TARGET_NAME)"; 492 | SDKROOT = macosx; 493 | }; 494 | name = Debug; 495 | }; 496 | C66823591763C246000C603C /* Release */ = { 497 | isa = XCBuildConfiguration; 498 | buildSettings = { 499 | ARCHS = "$(ARCHS_STANDARD_64_BIT)"; 500 | EXECUTABLE_PREFIX = lib; 501 | MACOSX_DEPLOYMENT_TARGET = 10.8; 502 | PRODUCT_NAME = "$(TARGET_NAME)"; 503 | SDKROOT = macosx; 504 | }; 505 | name = Release; 506 | }; 507 | C66823861763C48F000C603C /* Debug */ = { 508 | isa = XCBuildConfiguration; 509 | buildSettings = { 510 | PRODUCT_NAME = "$(TARGET_NAME)"; 511 | }; 512 | name = Debug; 513 | }; 514 | C66823871763C48F000C603C /* Release */ = { 515 | isa = XCBuildConfiguration; 516 | buildSettings = { 517 | PRODUCT_NAME = "$(TARGET_NAME)"; 518 | }; 519 | name = Release; 520 | }; 521 | C66823991763EA47000C603C /* Debug */ = { 522 | isa = XCBuildConfiguration; 523 | buildSettings = { 524 | DSTROOT = /tmp/kvdb_ios.dst; 525 | 
IPHONEOS_DEPLOYMENT_TARGET = 6.1; 526 | PRODUCT_NAME = "$(TARGET_NAME)"; 527 | SDKROOT = iphoneos; 528 | SKIP_INSTALL = YES; 529 | }; 530 | name = Debug; 531 | }; 532 | C668239A1763EA47000C603C /* Release */ = { 533 | isa = XCBuildConfiguration; 534 | buildSettings = { 535 | DSTROOT = /tmp/kvdb_ios.dst; 536 | IPHONEOS_DEPLOYMENT_TARGET = 6.1; 537 | PRODUCT_NAME = "$(TARGET_NAME)"; 538 | SDKROOT = iphoneos; 539 | SKIP_INSTALL = YES; 540 | VALIDATE_PRODUCT = YES; 541 | }; 542 | name = Release; 543 | }; 544 | /* End XCBuildConfiguration section */ 545 | 546 | /* Begin XCConfigurationList section */ 547 | C668234E1763C246000C603C /* Build configuration list for PBXProject "kvdb" */ = { 548 | isa = XCConfigurationList; 549 | buildConfigurations = ( 550 | C66823551763C246000C603C /* Debug */, 551 | C66823561763C246000C603C /* Release */, 552 | ); 553 | defaultConfigurationIsVisible = 0; 554 | defaultConfigurationName = Release; 555 | }; 556 | C66823571763C246000C603C /* Build configuration list for PBXNativeTarget "kvdb" */ = { 557 | isa = XCConfigurationList; 558 | buildConfigurations = ( 559 | C66823581763C246000C603C /* Debug */, 560 | C66823591763C246000C603C /* Release */, 561 | ); 562 | defaultConfigurationIsVisible = 0; 563 | defaultConfigurationName = Release; 564 | }; 565 | C66823851763C48F000C603C /* Build configuration list for PBXNativeTarget "kvdbtest" */ = { 566 | isa = XCConfigurationList; 567 | buildConfigurations = ( 568 | C66823861763C48F000C603C /* Debug */, 569 | C66823871763C48F000C603C /* Release */, 570 | ); 571 | defaultConfigurationIsVisible = 0; 572 | defaultConfigurationName = Release; 573 | }; 574 | C66823981763EA47000C603C /* Build configuration list for PBXNativeTarget "kvdb-ios" */ = { 575 | isa = XCConfigurationList; 576 | buildConfigurations = ( 577 | C66823991763EA47000C603C /* Debug */, 578 | C668239A1763EA47000C603C /* Release */, 579 | ); 580 | defaultConfigurationIsVisible = 0; 581 | defaultConfigurationName = Release; 582 | }; 583 | 
/* End XCConfigurationList section */ 584 | }; 585 | rootObject = C668234B1763C246000C603C /* Project object */; 586 | } 587 | -------------------------------------------------------------------------------- /build-mac/kvdbtest/main.c: -------------------------------------------------------------------------------- 1 | // 2 | // main.c 3 | // kvdbtest 4 | // 5 | // Created by DINH Viêt Hoà on 6/8/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 7 | // 8 | 9 | #include 10 | #include "kvdb.h" 11 | #include 12 | #include 13 | #include 14 | 15 | static void enumerate_keys_callback(kvdb * db, struct kvdb_enumerate_cb_params * params, void * data, int * stop) { 16 | printf("key = %.*s\n", (int) params->key_size, params->key); 17 | } 18 | 19 | int main(void) 20 | { 21 | uuid_t key; 22 | uuid_string_t keyString; 23 | uuid_t value; 24 | uuid_string_t valueString; 25 | 26 | struct kvdb * db; 27 | db = kvdb_new("kvdb-test.kvdb"); 28 | kvdb_open(db); 29 | 30 | int r; 31 | 32 | char * data; 33 | size_t data_size; 34 | r = kvdb_get(db, "hoa", 3, &data, &data_size); 35 | fprintf(stderr, "1: "); 36 | if (r == 0) { 37 | fprintf(stderr, "found\n"); 38 | free(data); 39 | } 40 | else { 41 | fprintf(stderr, "not found\n"); 42 | } 43 | 44 | kvdb_set(db, "hoa", 3, "test", 4); 45 | r = kvdb_get(db, "hoa", 3, &data, &data_size); 46 | fprintf(stderr, "2: "); 47 | if (r == 0) { 48 | fprintf(stderr, "found\n"); 49 | free(data); 50 | } 51 | else { 52 | fprintf(stderr, "not found\n"); 53 | } 54 | 55 | r = kvdb_enumerate_keys(db, enumerate_keys_callback, NULL); 56 | 57 | kvdb_delete(db, "hoa", 3); 58 | r = kvdb_get(db, "hoa", 3, &data, &data_size); 59 | fprintf(stderr, "3: "); 60 | if (r == 0) { 61 | fprintf(stderr, "found\n"); 62 | free(data); 63 | } 64 | else { 65 | fprintf(stderr, "not found\n"); 66 | } 67 | 68 | kvdb_set(db, "hoa", 3, "test", 4); 69 | r = kvdb_get(db, "hoa", 3, &data, &data_size); 70 | fprintf(stderr, "4: "); 71 | if (r == 0) { 72 | fprintf(stderr, "found\n"); 73 | 
free(data); 74 | } 75 | else { 76 | fprintf(stderr, "not found\n"); 77 | } 78 | kvdb_delete(db, "hoa", 3); 79 | 80 | #define COUNT 1000 81 | char * keys[COUNT]; 82 | for(unsigned int i = 0 ; i < COUNT ; i ++) { 83 | //fprintf(stderr, "add %i\n", i); 84 | uuid_generate(key); 85 | uuid_unparse_lower(key, keyString); 86 | uuid_generate(value); 87 | uuid_unparse_lower(value, valueString); 88 | char * dupKey = malloc(37); 89 | memcpy(dupKey, keyString, 36); 90 | dupKey[36] = 0; 91 | keys[i] = dupKey; 92 | kvdb_set(db, keyString, 36, valueString, 36); 93 | } 94 | 95 | kvdb_close(db); 96 | kvdb_free(db); 97 | 98 | db = kvdb_new("kvdb-test.kvdb"); 99 | kvdb_open(db); 100 | 101 | for(unsigned int i = 0 ; i < COUNT / 2 ; i ++) { 102 | char * key = keys[i]; 103 | kvdb_delete(db, key, 36); 104 | } 105 | 106 | for(unsigned int i = 0 ; i < COUNT / 2 ; i ++) { 107 | char * value; 108 | size_t value_size; 109 | char * key = keys[i]; 110 | int r = kvdb_get(db, key, 36, &value, &value_size); 111 | if (r == 0) { 112 | fprintf(stderr, "still exists %s\n", key); 113 | free(value); 114 | } 115 | } 116 | 117 | for(unsigned int i = COUNT / 2 ; i < COUNT ; i ++) { 118 | char * value; 119 | size_t value_size; 120 | char * key = keys[i]; 121 | int r = kvdb_get(db, key, 36, &value, &value_size); 122 | if (r < 0) { 123 | fprintf(stderr, "could not get key %s %i\n", key, i); 124 | } 125 | else { 126 | free(value); 127 | } 128 | } 129 | 130 | kvdb_close(db); 131 | kvdb_free(db); 132 | } 133 | -------------------------------------------------------------------------------- /objc/KVDatabase.h: -------------------------------------------------------------------------------- 1 | #import 2 | 3 | @interface KVDatabase : NSObject 4 | 5 | @property (nonatomic, copy, readonly) NSString *path; 6 | 7 | // Create a key value store. 8 | - (id) initWithPath:(NSString *)path; 9 | 10 | // Opens the database. 11 | - (BOOL) open; 12 | 13 | // Closes the database. 
14 | - (void) close; 15 | 16 | // Returns the data associated with the key. 17 | - (NSData *) dataForKey:(NSString *)key; 18 | 19 | // Sets the data to associate with a key. 20 | - (BOOL) setData:(NSData *)data forKey:(NSString *)key; 21 | 22 | // Remove the given key. 23 | - (void) removeDataForKey:(NSString *)key; 24 | 25 | // Enumerate all keys of the database. 26 | // Be careful: this method iterates over the entire on-disk database, so 27 | // it can be slow. 28 | - (void)enumerateKeysAndValuesUsingBlock:(void(^)(NSString *key, BOOL *stop))block; 29 | 30 | @end 31 | -------------------------------------------------------------------------------- /objc/KVDatabase.m: -------------------------------------------------------------------------------- 1 | #import "KVDatabase.h" 2 | 3 | #include "kvdb.h" 4 | 5 | enum { 6 | KVDBIOErrorCode = -2, 7 | KVDBNotFoundErrorCode = -1, 8 | }; 9 | 10 | @interface KVDatabase () 11 | 12 | @property (nonatomic, copy) void(^enumerationBlock)(NSString *, BOOL *); 13 | 14 | @end 15 | 16 | @implementation KVDatabase { 17 | kvdb * _db; 18 | } 19 | 20 | - (id) initWithPath:(NSString *)path 21 | { 22 | self = [super init]; 23 | _path = [path copy]; 24 | _db = kvdb_new([path fileSystemRepresentation]); 25 | return self; 26 | } 27 | - (void) dealloc { kvdb_free(_db); } // release the handle allocated by kvdb_new() in initWithPath: 28 | - (BOOL) open 29 | { 30 | int r = kvdb_open(_db); 31 | if (r < 0) { 32 | return NO; 33 | } 34 | return YES; 35 | } 36 | 37 | - (void) close 38 | { 39 | kvdb_close(_db); 40 | } 41 | 42 | - (NSData *) dataForKey:(NSString *)key 43 | { 44 | const char * cKey = [key UTF8String]; 45 | char * value = NULL; 46 | size_t value_size; 47 | int code = kvdb_get(_db, cKey, strlen(cKey), &value, &value_size); 48 | if (code == KVDBIOErrorCode) { 49 | NSLog(@"[%@]: I/O error reading key \"%@\"", self, key); 50 | return nil; 51 | } 52 | else if (code < 0) { 53 | return nil; 54 | } 55 | else { 56 | return [NSData dataWithBytesNoCopy:value length:value_size freeWhenDone:YES]; 57 | } 58 | } 59 | 60 | - (BOOL)
setData:(NSData *)data forKey:(NSString *)key 61 | { 62 | const char * cKey = [key UTF8String]; 63 | int code = kvdb_set(_db, cKey, strlen(cKey), [data bytes], [data length]); 64 | if (code == KVDBIOErrorCode) { 65 | NSLog(@"[%@]: I/O error writing key \"%@\"", self, key); 66 | return NO; 67 | } 68 | else if (code < 0) { 69 | return NO; 70 | } 71 | else { 72 | return YES; 73 | } 74 | } 75 | 76 | - (void) removeDataForKey:(NSString *)key 77 | { 78 | const char * cKey = [key UTF8String]; 79 | int code = kvdb_delete(_db, cKey, strlen(cKey)); 80 | if (code == KVDBIOErrorCode) { 81 | NSLog(@"[%@]: I/O error removing key \"%@\"", self, key); 82 | } 83 | } 84 | 85 | - (void)enumerateKeysAndValuesUsingBlock:(void(^)(NSString *key, BOOL * stop))block 86 | { 87 | if (block == nil) { 88 | return; 89 | } 90 | self.enumerationBlock = block; 91 | kvdb_enumerate_keys(_db, enumeration_callback, (__bridge void *)self); 92 | } 93 | 94 | static void enumeration_callback(kvdb * db, struct kvdb_enumerate_cb_params * params, 95 | void * data, int * stop) 96 | { 97 | KVDatabase * database = (__bridge id) data; 98 | NSString * key = [[NSString alloc] initWithBytes:params->key length:params->key_size encoding:NSUTF8StringEncoding]; 99 | database.enumerationBlock(key, (BOOL *) stop); 100 | } 101 | 102 | @end 103 | -------------------------------------------------------------------------------- /objc/KVIndexer.h: -------------------------------------------------------------------------------- 1 | #import 2 | 3 | typedef enum { 4 | KVIndexerSearchKindPrefix, // Search documents that have strings that start with the given token. 5 | KVIndexerSearchKindSubstr, // Search documents that have strings that contain the given token. 6 | KVIndexerSearchKindSuffix, // Search documents that have strings that end with the given token. 7 | } KVIndexerSearchKind; 8 | // KVIndexerSearchKindPrefix provides the best performance.
9 | 10 | @interface KVIndexer : NSObject 11 | 12 | @property (nonatomic, copy, readonly) NSString *path; 13 | 14 | // Create a full text indexer. 15 | - (id) initWithPath:(NSString *)path; 16 | 17 | // Opens the indexer. 18 | - (BOOL) open; 19 | 20 | // Closes the indexer. 21 | - (void) close; 22 | 23 | // Write pending changes to disk. 24 | - (BOOL) flush; 25 | 26 | // Add a document to the indexer. string is the content to index. 27 | // The string will be tokenized. 28 | // The document is designated by an identifier docID. 29 | - (BOOL) setString:(NSString *)string forDocID:(uint64_t)docID; 30 | 31 | // Add a document to the indexer. strings is the result of a custom tokenizer. 32 | // It's the list of tokens to index. 33 | // The document is designated by an identifier docID. 34 | - (BOOL) setStrings:(NSArray * /* NSString */)strings forDocID:(uint64_t)docID; 35 | 36 | // Remove a document from the indexer. 37 | - (void) removeDocID:(uint64_t)docID; 38 | 39 | // Search for a token. Returns a list of document IDs.
40 | - (NSArray *) search:(NSString *)token kind:(KVIndexerSearchKind)kind; 41 | 42 | @end 43 | -------------------------------------------------------------------------------- /objc/KVIndexer.m: -------------------------------------------------------------------------------- 1 | #import "KVIndexer.h" 2 | 3 | #include "sfts.h" 4 | 5 | enum { 6 | KVDBIOErrorCode = -2, 7 | KVDBNotFoundErrorCode = -1, 8 | }; 9 | 10 | @implementation KVIndexer { 11 | sfts * _db; 12 | } 13 | 14 | - (id) initWithPath:(NSString *)path 15 | { 16 | self = [super init]; 17 | _path = [path copy]; 18 | _db = sfts_new([_path fileSystemRepresentation]); 19 | return self; 20 | } 21 | 22 | - (BOOL) open 23 | { 24 | int r = sfts_open(_db); 25 | if (r < 0) { 26 | return NO; 27 | } 28 | return YES; 29 | } 30 | 31 | - (void) close 32 | { 33 | sfts_close(_db); 34 | } 35 | 36 | - (BOOL) flush 37 | { 38 | int r = sfts_flush(_db); 39 | if (r < 0) { 40 | return NO; 41 | } 42 | return YES; 43 | } 44 | 45 | - (BOOL) setString:(NSString *)string forDocID:(uint64_t)docID 46 | { 47 | unichar * buffer = malloc(sizeof(* buffer) * ([string length] + 1)); 48 | [string getCharacters:buffer range:NSMakeRange(0, [string length])]; 49 | buffer[[string length]] = 0; 50 | int r = sfts_u_set(_db, docID, buffer); 51 | free(buffer); 52 | if (r == KVDBIOErrorCode) { 53 | NSLog(@"[%@]: I/O error indexing document \"%llu\"", self, (unsigned long long) docID); 54 | return NO; 55 | } 56 | else if (r < 0) { 57 | return NO; 58 | } 59 | else { 60 | return YES; 61 | } 62 | } 63 | 64 | - (BOOL) setStrings:(NSArray * /* NSString */)strings forDocID:(uint64_t)docID 65 | { 66 | UChar ** table = malloc(sizeof(* table) * [strings count]); 67 | for(unsigned int i = 0 ; i < [strings count] ; i ++) { 68 | unichar * buffer = malloc(sizeof(* buffer) * ([strings[i] length] + 1)); 69 | table[i] = buffer; 70 | [strings[i] getCharacters:buffer range:NSMakeRange(0, [strings[i] length])]; 71 | buffer[[strings[i] length]] = 0; 72 | } 73 | int r = 
sfts_u_set2(_db, docID, (const UChar **) table, (int) [strings count]); 74 | for(unsigned int i = 0 ; i < [strings count] ; i ++) { 75 | free(table[i]); 76 | } 77 | free(table); 78 | if (r == KVDBIOErrorCode) { 79 | NSLog(@"[%@]: I/O error indexing document \"%llu\"", self, (unsigned long long) docID); 80 | return NO; 81 | } 82 | else if (r < 0) { 83 | return NO; 84 | } 85 | else { 86 | return YES; 87 | } 88 | } 89 | 90 | - (void) removeDocID:(uint64_t)docID 91 | { 92 | int r = sfts_remove(_db, docID); 93 | if (r == KVDBIOErrorCode) { 94 | NSLog(@"[%@]: I/O error removing indexed document \"%llu\"", self, (unsigned long long) docID); 95 | } 96 | } 97 | 98 | - (NSArray *) search:(NSString *)token kind:(KVIndexerSearchKind)kind 99 | { 100 | uint64_t * docids = NULL; 101 | size_t count = 0; 102 | UChar * buffer = malloc(sizeof(* buffer) * ([token length] + 1)); 103 | [token getCharacters:buffer range:NSMakeRange(0, [token length])]; buffer[[token length]] = 0; // NUL-terminate: sfts_u_search() is given no length, and getCharacters:range: writes no terminator 104 | int r = sfts_u_search(_db, buffer, (sfts_search_kind) kind, &docids, &count); 105 | free(buffer); 106 | if (r == KVDBIOErrorCode) { 107 | NSLog(@"[%@]: I/O error searching for token \"%@\"", self, token); 108 | return nil; 109 | } 110 | else if (r < 0) { 111 | return nil; 112 | } 113 | 114 | NSMutableArray * result = [NSMutableArray array]; 115 | for(size_t i = 0 ; i < count ; i ++) { 116 | [result addObject:[NSNumber numberWithUnsignedLongLong:docids[i]]]; 117 | } 118 | free(docids); 119 | 120 | return result; 121 | } 122 | 123 | 124 | @end 125 | -------------------------------------------------------------------------------- /objc/KVOrderedDatabase.h: -------------------------------------------------------------------------------- 1 | #import 2 | 3 | @class KVOrderedDatabaseIterator; 4 | 5 | @interface KVOrderedDatabase : NSObject 6 | 7 | @property (nonatomic, copy, readonly) NSString *path; 8 | 9 | // Create an ordered key value store. 10 | - (id) initWithPath:(NSString *)path; 11 | 12 | // Opens the database.
13 | - (BOOL) open; 14 | 15 | // Closes the database. 16 | - (void) close; 17 | 18 | // Write pending changes to disk. 19 | - (BOOL) flush; 20 | 21 | // Returns the data associated with the key. 22 | - (NSData *) dataForKey:(NSString *)key; 23 | 24 | // Sets the data to associate with a key. 25 | - (BOOL) setData:(NSData *)data forKey:(NSString *)key; 26 | 27 | // Remove the given key. 28 | - (void) removeDataForKey:(NSString *)key; 29 | 30 | // Returns an efficient ordered iterator. 31 | // The order is lexicographical. 32 | - (KVOrderedDatabaseIterator *) keyIterator; 33 | 34 | @end 35 | 36 | @interface KVOrderedDatabaseIterator : NSObject 37 | 38 | // Seeks to the first key. 39 | - (void) seekToFirstKey; 40 | 41 | // Seeks to the last key. 42 | - (void) seekToLastKey; 43 | 44 | // Seeks to the key larger or equal to the given key. 45 | - (void) seekAfterKey:(NSString *)key; 46 | 47 | // Iterate to the next key. 48 | - (void) next; 49 | 50 | // Iterate to the previous key. 51 | - (void) previous; 52 | 53 | // Returns the current key. 54 | - (NSString *) currentKey; 55 | 56 | // Returns whether the iterator is at a valid location. 
57 | - (BOOL) isValid; 58 | 59 | @end 60 | 61 | -------------------------------------------------------------------------------- /objc/KVOrderedDatabase.m: -------------------------------------------------------------------------------- 1 | #import "KVOrderedDatabase.h" 2 | 3 | #include "kvdbo.h" 4 | 5 | enum { 6 | KVDBIOErrorCode = -2, 7 | KVDBNotFoundErrorCode = -1, 8 | }; 9 | 10 | @interface KVOrderedDatabaseIterator () 11 | 12 | - (id) initWithDatabase:(KVOrderedDatabase *)database; 13 | 14 | @end 15 | 16 | @implementation KVOrderedDatabase { 17 | kvdbo * _db; 18 | } 19 | 20 | - (id) initWithPath:(NSString *)path 21 | { 22 | self = [super init]; 23 | _path = [path copy]; 24 | _db = kvdbo_new([path fileSystemRepresentation]); 25 | return self; 26 | } 27 | 28 | - (void) dealloc 29 | { 30 | kvdbo_free(_db); 31 | } 32 | 33 | - (BOOL) open 34 | { 35 | int r = kvdbo_open(_db); 36 | if (r < 0) { 37 | return NO; 38 | } 39 | return YES; 40 | } 41 | 42 | - (void) close 43 | { 44 | kvdbo_close(_db); 45 | } 46 | 47 | - (BOOL) flush 48 | { 49 | int r = kvdbo_flush(_db); 50 | if (r < 0) { 51 | return NO; 52 | } 53 | return YES; 54 | } 55 | 56 | - (NSData *) dataForKey:(NSString *)key 57 | { 58 | const char * cKey = [key UTF8String]; 59 | char * value = NULL; 60 | size_t value_size; 61 | int code = kvdbo_get(_db, cKey, strlen(cKey), &value, &value_size); 62 | if (code == KVDBIOErrorCode) { 63 | NSLog(@"[%@]: I/O error reading key \"%@\"", self, key); 64 | return nil; 65 | } 66 | else if (code < 0) { 67 | return nil; 68 | } 69 | else { 70 | return [NSData dataWithBytesNoCopy:value length:value_size freeWhenDone:YES]; 71 | } 72 | } 73 | 74 | - (BOOL) setData:(NSData *)data forKey:(NSString *)key 75 | { 76 | const char * cKey = [key UTF8String]; 77 | int code = kvdbo_set(_db, cKey, strlen(cKey), [data bytes], [data length]); 78 | if (code == KVDBIOErrorCode) { 79 | NSLog(@"[%@]: I/O error writing key \"%@\"", self, key); 80 | return NO; 81 | } 82 | else if (code < 0) { 83 | 
return NO; 84 | } 85 | else { 86 | return YES; 87 | } 88 | } 89 | 90 | - (void) removeDataForKey:(NSString *)key 91 | { 92 | const char * cKey = [key UTF8String]; 93 | int code = kvdbo_delete(_db, cKey, strlen(cKey)); 94 | if (code == KVDBIOErrorCode) { 95 | NSLog(@"[%@]: I/O error removing key \"%@\"", self, key); 96 | } 97 | } 98 | 99 | - (kvdbo *) _db 100 | { 101 | return _db; 102 | } 103 | 104 | - (KVOrderedDatabaseIterator *) keyIterator 105 | { 106 | return [[KVOrderedDatabaseIterator alloc] initWithDatabase:self]; 107 | } 108 | 109 | @end 110 | 111 | 112 | @implementation KVOrderedDatabaseIterator { 113 | kvdbo_iterator * _iterator; 114 | } 115 | 116 | - (id) initWithDatabase:(KVOrderedDatabase *)database 117 | { 118 | self = [super init]; 119 | _iterator = kvdbo_iterator_new([database _db]); 120 | return self; 121 | } 122 | 123 | - (void) dealloc 124 | { 125 | kvdbo_iterator_free(_iterator); 126 | } 127 | 128 | - (void) seekToFirstKey 129 | { 130 | kvdbo_iterator_seek_first(_iterator); 131 | } 132 | 133 | - (void) seekToLastKey 134 | { 135 | kvdbo_iterator_seek_last(_iterator); 136 | } 137 | 138 | - (void) seekAfterKey:(NSString *)key 139 | { 140 | const char * cKey = [key UTF8String]; 141 | kvdbo_iterator_seek_after(_iterator, cKey, strlen(cKey)); 142 | } 143 | 144 | - (void) next 145 | { 146 | kvdbo_iterator_next(_iterator); 147 | } 148 | 149 | - (void) previous 150 | { 151 | kvdbo_iterator_previous(_iterator); 152 | } 153 | 154 | - (NSString *) currentKey 155 | { 156 | const char * key; 157 | size_t size; 158 | kvdbo_iterator_get_key(_iterator, &key, &size); 159 | return [[NSString alloc] initWithBytes:key length:size encoding:NSUTF8StringEncoding]; 160 | } 161 | 162 | - (BOOL) isValid 163 | { 164 | return kvdbo_iterator_is_valid(_iterator); 165 | } 166 | 167 | @end 168 | 169 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | #set(CMAKE_INCLUDE_CURRENT_DIR ON) 2 | set(CMAKE_C_FLAGS "-g -O2") 3 | IF(APPLE) 4 | set(CMAKE_CXX_FLAGS "-std=c++11 -stdlib=libc++ -g -O2") 5 | ELSE() 6 | set(CMAKE_CXX_FLAGS "-std=gnu++0x") 7 | ENDIF() 8 | 9 | file(GLOB_RECURSE 10 | source_files 11 | *.h 12 | *.m 13 | *.c 14 | ) 15 | 16 | SET_SOURCE_FILES_PROPERTIES( 17 | ${source_files} 18 | PROPERTIES LANGUAGE C 19 | ) 20 | 21 | file(COPY 22 | kvdb.h 23 | kvdbo.h 24 | sfts.h 25 | DESTINATION 26 | ${CMAKE_CURRENT_BINARY_DIR}/include/kvdb 27 | ) 28 | 29 | set(LZ4_DIR "../third-party/lz4/lib") 30 | 31 | include_directories(${LZ4_DIR}) 32 | 33 | add_library (kvdb 34 | kvassert.c 35 | kvblock.c 36 | kvdb.c 37 | kvprime.c 38 | kvtable.c 39 | kvdbo.cpp 40 | sfts.cpp 41 | kvunicode.c 42 | kvserialization.cpp 43 | ConvertUTF.c 44 | ${LZ4_DIR}/lz4.c 45 | ${LZ4_DIR}/lz4hc.c 46 | ${LZ4_DIR}/lz4frame.c 47 | ${LZ4_DIR}/xxhash.c 48 | ) 49 | -------------------------------------------------------------------------------- /src/ConvertUTF.c: -------------------------------------------------------------------------------- 1 | /*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== 2 | * 3 | * The LLVM Compiler Infrastructure 4 | * 5 | * This file is distributed under the University of Illinois Open Source 6 | * License. See LICENSE.TXT for details. 7 | * 8 | *===------------------------------------------------------------------------=*/ 9 | /* 10 | * Copyright 2001-2004 Unicode, Inc. 11 | * 12 | * Disclaimer 13 | * 14 | * This source code is provided as is by Unicode, Inc. No claims are 15 | * made as to fitness for any particular purpose. No warranties of any 16 | * kind are expressed or implied. The recipient agrees to determine 17 | * applicability of information provided. 
If this file has been 18 | * purchased on magnetic or optical media from Unicode, Inc., the 19 | * sole remedy for any claim will be exchange of defective media 20 | * within 90 days of receipt. 21 | * 22 | * Limitations on Rights to Redistribute This Code 23 | * 24 | * Unicode, Inc. hereby grants the right to freely use the information 25 | * supplied in this file in the creation of products supporting the 26 | * Unicode Standard, and to make copies of this file in any form 27 | * for internal or external distribution as long as this notice 28 | * remains attached. 29 | */ 30 | 31 | /* --------------------------------------------------------------------- 32 | 33 | Conversions between UTF32, UTF-16, and UTF-8. Source code file. 34 | Author: Mark E. Davis, 1994. 35 | Rev History: Rick McGowan, fixes & updates May 2001. 36 | Sept 2001: fixed const & error conditions per 37 | mods suggested by S. Parent & A. Lillich. 38 | June 2002: Tim Dodd added detection and handling of incomplete 39 | source sequences, enhanced error detection, added casts 40 | to eliminate compiler warnings. 41 | July 2003: slight mods to back out aggressive FFFE detection. 42 | Jan 2004: updated switches in from-UTF8 conversions. 43 | Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 44 | 45 | See the header file "ConvertUTF.h" for complete documentation. 
46 | 47 | ------------------------------------------------------------------------ */ 48 | 49 | 50 | #include "ConvertUTF.h" 51 | #ifdef CVTUTF_DEBUG 52 | #include 53 | #endif 54 | #include 55 | 56 | static const int halfShift = 10; /* used for shifting by 10 bits */ 57 | 58 | static const UTF32 halfBase = 0x0010000UL; 59 | static const UTF32 halfMask = 0x3FFUL; 60 | 61 | #define UNI_SUR_HIGH_START (UTF32)0xD800 62 | #define UNI_SUR_HIGH_END (UTF32)0xDBFF 63 | #define UNI_SUR_LOW_START (UTF32)0xDC00 64 | #define UNI_SUR_LOW_END (UTF32)0xDFFF 65 | #define false 0 66 | #define true 1 67 | 68 | /* --------------------------------------------------------------------- */ 69 | 70 | /* 71 | * Index into the table below with the first byte of a UTF-8 sequence to 72 | * get the number of trailing bytes that are supposed to follow it. 73 | * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 74 | * left as-is for anyone who may want to do such conversion, which was 75 | * allowed in earlier algorithms. 76 | */ 77 | static const char trailingBytesForUTF8[256] = { 78 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 79 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 80 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 81 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 82 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 83 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 84 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 85 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 86 | }; 87 | 88 | /* 89 | * Magic values subtracted from a buffer value during UTF8 conversion. 90 | * This table contains as many values as there might be trailing bytes 91 | * in a UTF-8 sequence. 
92 | */ 93 | static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 94 | 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 95 | 96 | /* 97 | * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 98 | * into the first byte, depending on how many bytes follow. There are 99 | * as many entries in this table as there are UTF-8 sequence types. 100 | * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 101 | * for *legal* UTF-8 will be 4 or fewer bytes total. 102 | */ 103 | static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 104 | 105 | /* --------------------------------------------------------------------- */ 106 | 107 | /* The interface converts a whole buffer to avoid function-call overhead. 108 | * Constants have been gathered. Loops & conditionals have been removed as 109 | * much as possible for efficiency, in favor of drop-through switches. 110 | * (See "Note A" at the bottom of the file for equivalent code.) 111 | * If your compiler supports it, the "isLegalUTF8" call can be turned 112 | * into an inline function. 
113 | */ 114 | 115 | 116 | /* --------------------------------------------------------------------- */ 117 | 118 | ConversionResult ConvertUTF32toUTF16 ( 119 | const UTF32** sourceStart, const UTF32* sourceEnd, 120 | UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 121 | ConversionResult result = conversionOK; 122 | const UTF32* source = *sourceStart; 123 | UTF16* target = *targetStart; 124 | while (source < sourceEnd) { 125 | UTF32 ch; 126 | if (target >= targetEnd) { 127 | result = targetExhausted; break; 128 | } 129 | ch = *source++; 130 | if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 131 | /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 132 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 133 | if (flags == strictConversion) { 134 | --source; /* return to the illegal value itself */ 135 | result = sourceIllegal; 136 | break; 137 | } else { 138 | *target++ = UNI_REPLACEMENT_CHAR; 139 | } 140 | } else { 141 | *target++ = (UTF16)ch; /* normal case */ 142 | } 143 | } else if (ch > UNI_MAX_LEGAL_UTF32) { 144 | if (flags == strictConversion) { 145 | result = sourceIllegal; 146 | } else { 147 | *target++ = UNI_REPLACEMENT_CHAR; 148 | } 149 | } else { 150 | /* target is a character in range 0xFFFF - 0x10FFFF. */ 151 | if (target + 1 >= targetEnd) { 152 | --source; /* Back up source pointer! 
*/ 153 | result = targetExhausted; break; 154 | } 155 | ch -= halfBase; 156 | *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 157 | *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 158 | } 159 | } 160 | *sourceStart = source; 161 | *targetStart = target; 162 | return result; 163 | } 164 | 165 | /* --------------------------------------------------------------------- */ 166 | 167 | ConversionResult ConvertUTF16toUTF32 ( 168 | const UTF16** sourceStart, const UTF16* sourceEnd, 169 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 170 | ConversionResult result = conversionOK; 171 | const UTF16* source = *sourceStart; 172 | UTF32* target = *targetStart; 173 | UTF32 ch, ch2; 174 | while (source < sourceEnd) { 175 | const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 176 | ch = *source++; 177 | /* If we have a surrogate pair, convert to UTF32 first. */ 178 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 179 | /* If the 16 bits following the high surrogate are in the source buffer... */ 180 | if (source < sourceEnd) { 181 | ch2 = *source; 182 | /* If it's a low surrogate, convert to UTF32. */ 183 | if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 184 | ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 185 | + (ch2 - UNI_SUR_LOW_START) + halfBase; 186 | ++source; 187 | } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 188 | --source; /* return to the illegal value itself */ 189 | result = sourceIllegal; 190 | break; 191 | } 192 | } else { /* We don't have the 16 bits following the high surrogate. 
*/ 193 | --source; /* return to the high surrogate */ 194 | result = sourceExhausted; 195 | break; 196 | } 197 | } else if (flags == strictConversion) { 198 | /* UTF-16 surrogate values are illegal in UTF-32 */ 199 | if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 200 | --source; /* return to the illegal value itself */ 201 | result = sourceIllegal; 202 | break; 203 | } 204 | } 205 | if (target >= targetEnd) { 206 | source = oldSource; /* Back up source pointer! */ 207 | result = targetExhausted; break; 208 | } 209 | *target++ = ch; 210 | } 211 | *sourceStart = source; 212 | *targetStart = target; 213 | #ifdef CVTUTF_DEBUG 214 | if (result == sourceIllegal) { 215 | fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 216 | fflush(stderr); 217 | } 218 | #endif 219 | return result; 220 | } 221 | ConversionResult ConvertUTF16toUTF8 ( 222 | const UTF16** sourceStart, const UTF16* sourceEnd, 223 | UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 224 | ConversionResult result = conversionOK; 225 | const UTF16* source = *sourceStart; 226 | UTF8* target = *targetStart; 227 | while (source < sourceEnd) { 228 | UTF32 ch; 229 | unsigned short bytesToWrite = 0; 230 | const UTF32 byteMask = 0xBF; 231 | const UTF32 byteMark = 0x80; 232 | const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 233 | ch = *source++; 234 | /* If we have a surrogate pair, convert to UTF32 first. */ 235 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 236 | /* If the 16 bits following the high surrogate are in the source buffer... */ 237 | if (source < sourceEnd) { 238 | UTF32 ch2 = *source; 239 | /* If it's a low surrogate, convert to UTF32. 
*/ 240 | if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 241 | ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 242 | + (ch2 - UNI_SUR_LOW_START) + halfBase; 243 | ++source; 244 | } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 245 | --source; /* return to the illegal value itself */ 246 | result = sourceIllegal; 247 | break; 248 | } 249 | } else { /* We don't have the 16 bits following the high surrogate. */ 250 | --source; /* return to the high surrogate */ 251 | result = sourceExhausted; 252 | break; 253 | } 254 | } else if (flags == strictConversion) { 255 | /* UTF-16 surrogate values are illegal in UTF-32 */ 256 | if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 257 | --source; /* return to the illegal value itself */ 258 | result = sourceIllegal; 259 | break; 260 | } 261 | } 262 | /* Figure out how many bytes the result will require */ 263 | if (ch < (UTF32)0x80) { bytesToWrite = 1; 264 | } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 265 | } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 266 | } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 267 | } else { bytesToWrite = 3; 268 | ch = UNI_REPLACEMENT_CHAR; 269 | } 270 | 271 | target += bytesToWrite; 272 | if (target > targetEnd) { 273 | source = oldSource; /* Back up source pointer! */ 274 | target -= bytesToWrite; result = targetExhausted; break; 275 | } 276 | switch (bytesToWrite) { /* note: everything falls through. 
*/ 277 | case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 278 | case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 279 | case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 280 | case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 281 | } 282 | target += bytesToWrite; 283 | } 284 | *sourceStart = source; 285 | *targetStart = target; 286 | return result; 287 | } 288 | 289 | /* --------------------------------------------------------------------- */ 290 | 291 | ConversionResult ConvertUTF32toUTF8 ( 292 | const UTF32** sourceStart, const UTF32* sourceEnd, 293 | UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 294 | ConversionResult result = conversionOK; 295 | const UTF32* source = *sourceStart; 296 | UTF8* target = *targetStart; 297 | while (source < sourceEnd) { 298 | UTF32 ch; 299 | unsigned short bytesToWrite = 0; 300 | const UTF32 byteMask = 0xBF; 301 | const UTF32 byteMark = 0x80; 302 | ch = *source++; 303 | if (flags == strictConversion ) { 304 | /* UTF-16 surrogate values are illegal in UTF-32 */ 305 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 306 | --source; /* return to the illegal value itself */ 307 | result = sourceIllegal; 308 | break; 309 | } 310 | } 311 | /* 312 | * Figure out how many bytes the result will require. Turn any 313 | * illegally large UTF32 things (> Plane 17) into replacement chars. 314 | */ 315 | if (ch < (UTF32)0x80) { bytesToWrite = 1; 316 | } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 317 | } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 318 | } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 319 | } else { bytesToWrite = 3; 320 | ch = UNI_REPLACEMENT_CHAR; 321 | result = sourceIllegal; 322 | } 323 | 324 | target += bytesToWrite; 325 | if (target > targetEnd) { 326 | --source; /* Back up source pointer! 
*/ 327 | target -= bytesToWrite; result = targetExhausted; break; 328 | } 329 | switch (bytesToWrite) { /* note: everything falls through. */ 330 | case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 331 | case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 332 | case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 333 | case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 334 | } 335 | target += bytesToWrite; 336 | } 337 | *sourceStart = source; 338 | *targetStart = target; 339 | return result; 340 | } 341 | 342 | /* --------------------------------------------------------------------- */ 343 | 344 | /* 345 | * Utility routine to tell whether a sequence of bytes is legal UTF-8. 346 | * This must be called with the length pre-determined by the first byte. 347 | * If not calling this from ConvertUTF8to*, then the length can be set by: 348 | * length = trailingBytesForUTF8[*source]+1; 349 | * and the sequence is illegal right away if there aren't that many bytes 350 | * available. 351 | * If presented with a length > 4, this returns false. The Unicode 352 | * definition of UTF-8 goes up to 4-byte sequences. 353 | */ 354 | 355 | static Boolean isLegalUTF8(const UTF8 *source, int length) { 356 | UTF8 a; 357 | const UTF8 *srcptr = source+length; 358 | switch (length) { 359 | default: return false; 360 | /* Everything else falls through when "true"... 
*/ 361 | case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 362 | case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 363 | case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 364 | 365 | switch (*source) { 366 | /* no fall-through in this inner switch */ 367 | case 0xE0: if (a < 0xA0) return false; break; 368 | case 0xED: if (a > 0x9F) return false; break; 369 | case 0xF0: if (a < 0x90) return false; break; 370 | case 0xF4: if (a > 0x8F) return false; break; 371 | default: if (a < 0x80) return false; 372 | } 373 | 374 | case 1: if (*source >= 0x80 && *source < 0xC2) return false; 375 | } 376 | if (*source > 0xF4) return false; 377 | return true; 378 | } 379 | 380 | /* --------------------------------------------------------------------- */ 381 | 382 | /* 383 | * Exported function to return whether a UTF-8 sequence is legal or not. 384 | * This is not used here; it's just exported. 385 | */ 386 | Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 387 | int length = trailingBytesForUTF8[*source]+1; 388 | if (length > sourceEnd - source) { 389 | return false; 390 | } 391 | return isLegalUTF8(source, length); 392 | } 393 | 394 | /* --------------------------------------------------------------------- */ 395 | 396 | static unsigned 397 | findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, 398 | const UTF8 *sourceEnd) { 399 | UTF8 b1, b2, b3; 400 | 401 | assert(!isLegalUTF8Sequence(source, sourceEnd)); 402 | 403 | /* 404 | * Unicode 6.3.0, D93b: 405 | * 406 | * Maximal subpart of an ill-formed subsequence: The longest code unit 407 | * subsequence starting at an unconvertible offset that is either: 408 | * a. the initial subsequence of a well-formed code unit sequence, or 409 | * b. a subsequence of length one. 410 | */ 411 | 412 | if (source == sourceEnd) 413 | return 0; 414 | 415 | /* 416 | * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 417 | * Byte Sequences. 
418 | */ 419 | 420 | b1 = *source; 421 | ++source; 422 | if (b1 >= 0xC2 && b1 <= 0xDF) { 423 | /* 424 | * First byte is valid, but we know that this code unit sequence is 425 | * invalid, so the maximal subpart has to end after the first byte. 426 | */ 427 | return 1; 428 | } 429 | 430 | if (source == sourceEnd) 431 | return 1; 432 | 433 | b2 = *source; 434 | ++source; 435 | 436 | if (b1 == 0xE0) { 437 | return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; 438 | } 439 | if (b1 >= 0xE1 && b1 <= 0xEC) { 440 | return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 441 | } 442 | if (b1 == 0xED) { 443 | return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; 444 | } 445 | if (b1 >= 0xEE && b1 <= 0xEF) { 446 | return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 447 | } 448 | if (b1 == 0xF0) { 449 | if (b2 >= 0x90 && b2 <= 0xBF) { 450 | if (source == sourceEnd) 451 | return 2; 452 | 453 | b3 = *source; 454 | return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 455 | } 456 | return 1; 457 | } 458 | if (b1 >= 0xF1 && b1 <= 0xF3) { 459 | if (b2 >= 0x80 && b2 <= 0xBF) { 460 | if (source == sourceEnd) 461 | return 2; 462 | 463 | b3 = *source; 464 | return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 465 | } 466 | return 1; 467 | } 468 | if (b1 == 0xF4) { 469 | if (b2 >= 0x80 && b2 <= 0x8F) { 470 | if (source == sourceEnd) 471 | return 2; 472 | 473 | b3 = *source; 474 | return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 475 | } 476 | return 1; 477 | } 478 | 479 | assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); 480 | /* 481 | * There are no valid sequences that start with these bytes. Maximal subpart 482 | * is defined to have length 1 in these cases. 483 | */ 484 | return 1; 485 | } 486 | 487 | /* --------------------------------------------------------------------- */ 488 | 489 | /* 490 | * Exported function to return the total number of bytes in a codepoint 491 | * represented in UTF-8, given the value of the first byte. 
492 | */ 493 | unsigned getNumBytesForUTF8(UTF8 first) { 494 | return trailingBytesForUTF8[first] + 1; 495 | } 496 | 497 | /* --------------------------------------------------------------------- */ 498 | 499 | /* 500 | * Exported function to return whether a UTF-8 string is legal or not. 501 | * This is not used here; it's just exported. 502 | */ 503 | Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { 504 | while (*source != sourceEnd) { 505 | int length = trailingBytesForUTF8[**source] + 1; 506 | if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) 507 | return false; 508 | *source += length; 509 | } 510 | return true; 511 | } 512 | 513 | /* --------------------------------------------------------------------- */ 514 | 515 | ConversionResult ConvertUTF8toUTF16 ( 516 | const UTF8** sourceStart, const UTF8* sourceEnd, 517 | UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 518 | ConversionResult result = conversionOK; 519 | const UTF8* source = *sourceStart; 520 | UTF16* target = *targetStart; 521 | while (source < sourceEnd) { 522 | UTF32 ch = 0; 523 | unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 524 | if (extraBytesToRead >= sourceEnd - source) { 525 | result = sourceExhausted; break; 526 | } 527 | /* Do this check whether lenient or strict */ 528 | if (!isLegalUTF8(source, extraBytesToRead+1)) { 529 | result = sourceIllegal; 530 | if (flags == strictConversion) { 531 | /* Abort conversion. */ 532 | break; 533 | } else { 534 | /* 535 | * Replace the maximal subpart of ill-formed sequence with 536 | * replacement character. 537 | */ 538 | source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 539 | sourceEnd); 540 | *target++ = UNI_REPLACEMENT_CHAR; 541 | continue; 542 | } 543 | } 544 | /* 545 | * The cases all fall through. See "Note A" below. 
546 | */ 547 | switch (extraBytesToRead) { 548 | case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 549 | case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 550 | case 3: ch += *source++; ch <<= 6; 551 | case 2: ch += *source++; ch <<= 6; 552 | case 1: ch += *source++; ch <<= 6; 553 | case 0: ch += *source++; 554 | } 555 | ch -= offsetsFromUTF8[extraBytesToRead]; 556 | 557 | if (target >= targetEnd) { 558 | source -= (extraBytesToRead+1); /* Back up source pointer! */ 559 | result = targetExhausted; break; 560 | } 561 | if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 562 | /* UTF-16 surrogate values are illegal in UTF-32 */ 563 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 564 | if (flags == strictConversion) { 565 | source -= (extraBytesToRead+1); /* return to the illegal value itself */ 566 | result = sourceIllegal; 567 | break; 568 | } else { 569 | *target++ = UNI_REPLACEMENT_CHAR; 570 | } 571 | } else { 572 | *target++ = (UTF16)ch; /* normal case */ 573 | } 574 | } else if (ch > UNI_MAX_UTF16) { 575 | if (flags == strictConversion) { 576 | result = sourceIllegal; 577 | source -= (extraBytesToRead+1); /* return to the start */ 578 | break; /* Bail out; shouldn't continue */ 579 | } else { 580 | *target++ = UNI_REPLACEMENT_CHAR; 581 | } 582 | } else { 583 | /* target is a character in range 0xFFFF - 0x10FFFF. */ 584 | if (target + 1 >= targetEnd) { 585 | source -= (extraBytesToRead+1); /* Back up source pointer! 
*/ 586 | result = targetExhausted; break; 587 | } 588 | ch -= halfBase; 589 | *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 590 | *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 591 | } 592 | } 593 | *sourceStart = source; 594 | *targetStart = target; 595 | return result; 596 | } 597 | 598 | /* --------------------------------------------------------------------- */ 599 | 600 | static ConversionResult ConvertUTF8toUTF32Impl( 601 | const UTF8** sourceStart, const UTF8* sourceEnd, 602 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, 603 | Boolean InputIsPartial) { 604 | ConversionResult result = conversionOK; 605 | const UTF8* source = *sourceStart; 606 | UTF32* target = *targetStart; 607 | while (source < sourceEnd) { 608 | UTF32 ch = 0; 609 | unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 610 | if (extraBytesToRead >= sourceEnd - source) { 611 | if (flags == strictConversion || InputIsPartial) { 612 | result = sourceExhausted; 613 | break; 614 | } else { 615 | result = sourceIllegal; 616 | 617 | /* 618 | * Replace the maximal subpart of ill-formed sequence with 619 | * replacement character. 620 | */ 621 | source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 622 | sourceEnd); 623 | *target++ = UNI_REPLACEMENT_CHAR; 624 | continue; 625 | } 626 | } 627 | if (target >= targetEnd) { 628 | result = targetExhausted; break; 629 | } 630 | 631 | /* Do this check whether lenient or strict */ 632 | if (!isLegalUTF8(source, extraBytesToRead+1)) { 633 | result = sourceIllegal; 634 | if (flags == strictConversion) { 635 | /* Abort conversion. */ 636 | break; 637 | } else { 638 | /* 639 | * Replace the maximal subpart of ill-formed sequence with 640 | * replacement character. 641 | */ 642 | source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 643 | sourceEnd); 644 | *target++ = UNI_REPLACEMENT_CHAR; 645 | continue; 646 | } 647 | } 648 | /* 649 | * The cases all fall through. See "Note A" below. 
650 | */ 651 | switch (extraBytesToRead) { 652 | case 5: ch += *source++; ch <<= 6; 653 | case 4: ch += *source++; ch <<= 6; 654 | case 3: ch += *source++; ch <<= 6; 655 | case 2: ch += *source++; ch <<= 6; 656 | case 1: ch += *source++; ch <<= 6; 657 | case 0: ch += *source++; 658 | } 659 | ch -= offsetsFromUTF8[extraBytesToRead]; 660 | 661 | if (ch <= UNI_MAX_LEGAL_UTF32) { 662 | /* 663 | * UTF-16 surrogate values are illegal in UTF-32, and anything 664 | * over Plane 17 (> 0x10FFFF) is illegal. 665 | */ 666 | if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 667 | if (flags == strictConversion) { 668 | source -= (extraBytesToRead+1); /* return to the illegal value itself */ 669 | result = sourceIllegal; 670 | break; 671 | } else { 672 | *target++ = UNI_REPLACEMENT_CHAR; 673 | } 674 | } else { 675 | *target++ = ch; 676 | } 677 | } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 678 | result = sourceIllegal; 679 | *target++ = UNI_REPLACEMENT_CHAR; 680 | } 681 | } 682 | *sourceStart = source; 683 | *targetStart = target; 684 | return result; 685 | } 686 | 687 | ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, 688 | const UTF8 *sourceEnd, 689 | UTF32 **targetStart, 690 | UTF32 *targetEnd, 691 | ConversionFlags flags) { 692 | return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 693 | flags, /*InputIsPartial=*/true); 694 | } 695 | 696 | ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, 697 | const UTF8 *sourceEnd, UTF32 **targetStart, 698 | UTF32 *targetEnd, ConversionFlags flags) { 699 | return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 700 | flags, /*InputIsPartial=*/false); 701 | } 702 | 703 | /* --------------------------------------------------------------------- 704 | 705 | Note A. 706 | The fall-through switches in UTF-8 reading code save a 707 | temp variable, some decrements & conditionals. 
The switches 708 | are equivalent to the following loop: 709 | { 710 | int tmpBytesToRead = extraBytesToRead+1; 711 | do { 712 | ch += *source++; 713 | --tmpBytesToRead; 714 | if (tmpBytesToRead) ch <<= 6; 715 | } while (tmpBytesToRead > 0); 716 | } 717 | In UTF-8 writing code, the switches on "bytesToWrite" are 718 | similarly unrolled loops. 719 | 720 | --------------------------------------------------------------------- */ 721 | -------------------------------------------------------------------------------- /src/ConvertUTF.h: -------------------------------------------------------------------------------- 1 | /*===--- ConvertUTF.h - Universal Character Names conversions ---------------=== 2 | * 3 | * The LLVM Compiler Infrastructure 4 | * 5 | * This file is distributed under the University of Illinois Open Source 6 | * License. See LICENSE.TXT for details. 7 | * 8 | *==------------------------------------------------------------------------==*/ 9 | /* 10 | * Copyright 2001-2004 Unicode, Inc. 11 | * 12 | * Disclaimer 13 | * 14 | * This source code is provided as is by Unicode, Inc. No claims are 15 | * made as to fitness for any particular purpose. No warranties of any 16 | * kind are expressed or implied. The recipient agrees to determine 17 | * applicability of information provided. If this file has been 18 | * purchased on magnetic or optical media from Unicode, Inc., the 19 | * sole remedy for any claim will be exchange of defective media 20 | * within 90 days of receipt. 21 | * 22 | * Limitations on Rights to Redistribute This Code 23 | * 24 | * Unicode, Inc. hereby grants the right to freely use the information 25 | * supplied in this file in the creation of products supporting the 26 | * Unicode Standard, and to make copies of this file in any form 27 | * for internal or external distribution as long as this notice 28 | * remains attached. 
29 | */ 30 | 31 | /* --------------------------------------------------------------------- 32 | 33 | Conversions between UTF32, UTF-16, and UTF-8. Header file. 34 | 35 | Several functions are included here, forming a complete set of 36 | conversions between the three formats. UTF-7 is not included 37 | here, but is handled in a separate source file. 38 | 39 | Each of these routines takes pointers to input buffers and output 40 | buffers. The input buffers are const. 41 | 42 | Each routine converts the text between *sourceStart and sourceEnd, 43 | putting the result into the buffer between *targetStart and 44 | targetEnd. Note: the end pointers are *after* the last item: e.g. 45 | *(sourceEnd - 1) is the last item. 46 | 47 | The return result indicates whether the conversion was successful, 48 | and if not, whether the problem was in the source or target buffers. 49 | (Only the first encountered problem is indicated.) 50 | 51 | After the conversion, *sourceStart and *targetStart are both 52 | updated to point to the end of last text successfully converted in 53 | the respective buffers. 54 | 55 | Input parameters: 56 | sourceStart - pointer to a pointer to the source buffer. 57 | The contents of this are modified on return so that 58 | it points at the next thing to be converted. 59 | targetStart - similarly, pointer to pointer to the target buffer. 60 | sourceEnd, targetEnd - respectively pointers to the ends of the 61 | two buffers, for overflow checking only. 62 | 63 | These conversion functions take a ConversionFlags argument. When this 64 | flag is set to strict, both irregular sequences and isolated surrogates 65 | will cause an error. When the flag is set to lenient, both irregular 66 | sequences and isolated surrogates are converted. 67 | 68 | Whether the flag is strict or lenient, all illegal sequences will cause 69 | an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, 70 | or <A0> in UTF-8, and values above 0x10FFFF in UTF-32.
Conformant code 71 | must check for illegal sequences. 72 | 73 | When the flag is set to lenient, characters over 0x10FFFF are converted 74 | to the replacement character; otherwise (when the flag is set to strict) 75 | they constitute an error. 76 | 77 | Output parameters: 78 | The value "sourceIllegal" is returned from some routines if the input 79 | sequence is malformed. When "sourceIllegal" is returned, the source 80 | value will point to the illegal value that caused the problem. E.g., 81 | in UTF-8 when a sequence is malformed, it points to the start of the 82 | malformed sequence. 83 | 84 | Author: Mark E. Davis, 1994. 85 | Rev History: Rick McGowan, fixes & updates May 2001. 86 | Fixes & updates, Sept 2001. 87 | 88 | ------------------------------------------------------------------------ */ 89 | 90 | #ifndef LLVM_SUPPORT_CONVERTUTF_H 91 | #define LLVM_SUPPORT_CONVERTUTF_H 92 | 93 | /* --------------------------------------------------------------------- 94 | The following 4 definitions are compiler-specific. 95 | The C standard does not guarantee that wchar_t has at least 96 | 16 bits, so wchar_t is no less portable than unsigned short! 97 | All should be unsigned values to avoid sign extension during 98 | bit mask & shift operations. 
99 | ------------------------------------------------------------------------ */ 100 | 101 | typedef unsigned int UTF32; /* at least 32 bits */ 102 | typedef unsigned short UTF16; /* at least 16 bits */ 103 | typedef unsigned char UTF8; /* typically 8 bits */ 104 | typedef unsigned char Boolean; /* 0 or 1 */ 105 | 106 | /* Some fundamental constants */ 107 | #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD 108 | #define UNI_MAX_BMP (UTF32)0x0000FFFF 109 | #define UNI_MAX_UTF16 (UTF32)0x0010FFFF 110 | #define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF 111 | #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF 112 | 113 | #define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4 114 | 115 | #define UNI_UTF16_BYTE_ORDER_MARK_NATIVE 0xFEFF 116 | #define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE 117 | 118 | typedef enum { 119 | conversionOK, /* conversion successful */ 120 | sourceExhausted, /* partial character in source, but hit end */ 121 | targetExhausted, /* insuff. room in target for conversion */ 122 | sourceIllegal /* source sequence is illegal/malformed */ 123 | } ConversionResult; 124 | 125 | typedef enum { 126 | strictConversion = 0, 127 | lenientConversion 128 | } ConversionFlags; 129 | 130 | /* This is for C++ and does no harm in C */ 131 | #ifdef __cplusplus 132 | extern "C" { 133 | #endif 134 | 135 | #include "ConvertUTFNamespace.h" 136 | 137 | ConversionResult ConvertUTF8toUTF16 ( 138 | const UTF8** sourceStart, const UTF8* sourceEnd, 139 | UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); 140 | 141 | /** 142 | * Convert a partial UTF8 sequence to UTF32. If the sequence ends in an 143 | * incomplete code unit sequence, returns \c sourceExhausted. 144 | */ 145 | ConversionResult ConvertUTF8toUTF32Partial( 146 | const UTF8** sourceStart, const UTF8* sourceEnd, 147 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); 148 | 149 | /** 150 | * Convert a complete UTF8 sequence to UTF32.
If the sequence ends in an 151 | * incomplete code unit sequence, returns \c sourceIllegal when the flag is lenient and \c sourceExhausted when it is strict. 152 | */ 153 | ConversionResult ConvertUTF8toUTF32( 154 | const UTF8** sourceStart, const UTF8* sourceEnd, 155 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); 156 | 157 | ConversionResult ConvertUTF16toUTF8 ( 158 | const UTF16** sourceStart, const UTF16* sourceEnd, 159 | UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); 160 | 161 | ConversionResult ConvertUTF32toUTF8 ( 162 | const UTF32** sourceStart, const UTF32* sourceEnd, 163 | UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); 164 | 165 | ConversionResult ConvertUTF16toUTF32 ( 166 | const UTF16** sourceStart, const UTF16* sourceEnd, 167 | UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); 168 | 169 | ConversionResult ConvertUTF32toUTF16 ( 170 | const UTF32** sourceStart, const UTF32* sourceEnd, 171 | UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); 172 | 173 | Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); 174 | 175 | Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd); 176 | 177 | unsigned getNumBytesForUTF8(UTF8 firstByte); 178 | 179 | #ifdef __cplusplus 180 | } 181 | 182 | #if 0 // ignored in mailcore2 183 | /*************************************************************************/ 184 | /* Below are LLVM-specific wrappers of the functions above. */ 185 | 186 | #include "llvm/ADT/ArrayRef.h" 187 | #include "llvm/ADT/StringRef.h" 188 | 189 | namespace llvm { 190 | 191 | /** 192 | * Convert an UTF8 StringRef to UTF8, UTF16, or UTF32 depending on 193 | * WideCharWidth. The converted data is written to ResultPtr, which needs to 194 | * point to at least WideCharWidth * (Source.Size() + 1) bytes. On success, 195 | * ResultPtr will point one after the end of the copied string.
On failure, 196 | * ResultPtr will not be changed, and ErrorPtr will be set to the location of 197 | * the first character which could not be converted. 198 | * \return true on success. 199 | */ 200 | bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source, 201 | char *&ResultPtr, const UTF8 *&ErrorPtr); 202 | 203 | /** 204 | * Convert an Unicode code point to UTF8 sequence. 205 | * 206 | * \param Source a Unicode code point. 207 | * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least 208 | * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is 209 | * updated one past end of the converted sequence. 210 | * 211 | * \returns true on success. 212 | */ 213 | bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr); 214 | 215 | /** 216 | * Convert the first UTF8 sequence in the given source buffer to a UTF32 217 | * code point. 218 | * 219 | * \param [in,out] source A pointer to the source buffer. If the conversion 220 | * succeeds, this pointer will be updated to point to the byte just past the 221 | * end of the converted sequence. 222 | * \param sourceEnd A pointer just past the end of the source buffer. 223 | * \param [out] target The converted code 224 | * \param flags Whether the conversion is strict or lenient. 225 | * 226 | * \returns conversionOK on success 227 | * 228 | * \sa ConvertUTF8toUTF32 229 | */ 230 | static inline ConversionResult convertUTF8Sequence(const UTF8 **source, 231 | const UTF8 *sourceEnd, 232 | UTF32 *target, 233 | ConversionFlags flags) { 234 | if (*source == sourceEnd) 235 | return sourceExhausted; 236 | unsigned size = getNumBytesForUTF8(**source); 237 | if ((ptrdiff_t)size > sourceEnd - *source) 238 | return sourceExhausted; 239 | return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags); 240 | } 241 | 242 | /** 243 | * Returns true if a blob of text starts with a UTF-16 big or little endian byte 244 | * order mark. 
245 | */ 246 | bool hasUTF16ByteOrderMark(ArrayRef SrcBytes); 247 | 248 | /** 249 | * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string. 250 | * 251 | * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text. 252 | * \param [out] Out Converted UTF-8 is stored here on success. 253 | * \returns true on success 254 | */ 255 | bool convertUTF16ToUTF8String(ArrayRef SrcBytes, std::string &Out); 256 | 257 | } /* end namespace llvm */ 258 | #endif // ignored in mailcore2 259 | 260 | #endif 261 | 262 | /* --------------------------------------------------------------------- */ 263 | 264 | #endif 265 | -------------------------------------------------------------------------------- /src/ConvertUTFNamespace.h: -------------------------------------------------------------------------------- 1 | // 2 | // ConvertUTFNamespace.h 3 | // kvdb 4 | // 5 | // Created by Hoa Dinh on 3/19/15. 6 | // Copyright (c) 2015 etpan. All rights reserved. 7 | // 8 | 9 | #ifndef kvdb_ConvertUTFNamespace_h 10 | #define kvdb_ConvertUTFNamespace_h 11 | 12 | #define ConvertUTF16toUTF32 kv_ConvertUTF16toUTF32 13 | #define ConvertUTF16toUTF8 kv_ConvertUTF16toUTF8 14 | #define ConvertUTF32toUTF16 kv_ConvertUTF32toUTF16 15 | #define ConvertUTF32toUTF8 kv_ConvertUTF32toUTF8 16 | #define ConvertUTF8toUTF16 kv_ConvertUTF8toUTF16 17 | #define ConvertUTF8toUTF32 kv_ConvertUTF8toUTF32 18 | #define ConvertUTF8toUTF32Partial kv_ConvertUTF8toUTF32Partial 19 | #define getNumBytesForUTF8 kv_getNumBytesForUTF8 20 | #define isLegalUTF8Sequence kv_isLegalUTF8Sequence 21 | #define isLegalUTF8String kv_isLegalUTF8String 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/kvassert.c: -------------------------------------------------------------------------------- 1 | // 2 | // kvassert.c 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/1/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 
7 | // 8 | 9 | #include 10 | #include 11 | 12 | void assertInternal(const char * filename, unsigned int line, int cond, const char * condString) 13 | { 14 | if (cond) { 15 | return; 16 | } 17 | 18 | fprintf(stderr, "%s:%i: assert %s\n", filename, line, condString); 19 | abort(); 20 | } 21 | -------------------------------------------------------------------------------- /src/kvassert.h: -------------------------------------------------------------------------------- 1 | // 2 | // kvassert.h 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/1/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 7 | // 8 | 9 | #ifndef KV_ASSERT_H 10 | #define KV_ASSERT_H 11 | 12 | #ifdef __cplusplus 13 | extern "C" { 14 | #endif 15 | 16 | #define KVDBAssert(cond) assertInternal(__FILE__, __LINE__, cond, #cond) 17 | 18 | void assertInternal(const char * filename, unsigned int line, int cond, const char * condString); 19 | 20 | #ifdef __cplusplus 21 | } 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/kvblock.c: -------------------------------------------------------------------------------- 1 | // 2 | // kvblock.c 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/2/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 7 | // 8 | 9 | #include "kvblock.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "kvdb.h" 18 | #include "kvtypes.h" 19 | #include "kvendian.h" 20 | #include "kvpaddingutils.h" 21 | 22 | int kv_block_recycle(kvdb * db, uint64_t offset) 23 | { 24 | uint8_t log2_size; 25 | ssize_t count; 26 | 27 | count = pread(db->kv_fd, &log2_size, 1, offset + 8 + 4); 28 | if (count < 0) 29 | return -1; 30 | uint64_t next_free_offset = db->kv_free_blocks[log2_size]; 31 | // keep it in network order. 
32 | count = pwrite(db->kv_fd, &next_free_offset, sizeof(next_free_offset), offset); 33 | if (count < 0) 34 | return -1; 35 | db->kv_free_blocks[log2_size] = hton64(offset); 36 | 37 | return 0; 38 | } 39 | 40 | uint64_t kv_block_create(kvdb * db, uint64_t next_block_offset, uint32_t hash_value, 41 | const char * key, size_t key_size, 42 | const char * value, size_t value_size) 43 | { 44 | uint64_t block_size = block_size_round_up(key_size + value_size); 45 | uint8_t log2_size = log2_round_up(block_size); 46 | uint64_t offset = ntoh64(db->kv_free_blocks[log2_size]); 47 | int use_new_block = 0; 48 | //fprintf(stderr, "key, value: %i %i\n", (int) key_size, (int) value_size); 49 | if (offset != 0) { 50 | // Use free block. 51 | uint64_t next_free_offset; 52 | //fprintf(stderr, "Use free block %i %i %i\n", (int) offset, (int) log2_size, (int)block_size); 53 | // keep it in network order. 54 | pread(db->kv_fd, &next_free_offset, sizeof(next_free_offset), offset); 55 | db->kv_free_blocks[log2_size] = next_free_offset; 56 | } 57 | else { 58 | // Use new block. 
59 | offset = ntoh64(* db->kv_filesize); 60 | use_new_block = 1; 61 | } 62 | 63 | uint64_t current_key_size = key_size; 64 | uint64_t current_value_size = value_size; 65 | char * data; 66 | char * allocated = NULL; 67 | if (8 + 4 + 1 + 8 + 8 + block_size > 4096) { 68 | allocated = calloc(1, 8 + 4 + 1 + 8 + 8 + (size_t) block_size); 69 | data = allocated; 70 | } 71 | else { 72 | data = alloca(8 + 4 + 1 + 8 + 8 + (size_t) block_size); 73 | bzero(data, 8 + 4 + 1 + 8 + 8 + (size_t) block_size); 74 | } 75 | char * p = data; 76 | next_block_offset = hton64(next_block_offset); 77 | memcpy(p, &next_block_offset, sizeof(next_block_offset)); 78 | p += sizeof(next_block_offset); 79 | hash_value = htonl(hash_value); 80 | memcpy(p, &hash_value, sizeof(hash_value)); 81 | p += sizeof(hash_value); 82 | memcpy(p, &log2_size, sizeof(log2_size)); 83 | p += sizeof(log2_size); 84 | current_key_size = hton64(current_key_size); 85 | memcpy(p, ¤t_key_size, sizeof(current_key_size)); 86 | p += sizeof(current_key_size); 87 | memcpy(p, key, key_size); 88 | p += key_size; 89 | current_value_size = hton64(current_value_size); 90 | memcpy(p, ¤t_value_size, sizeof(current_value_size)); 91 | p += sizeof(current_value_size); 92 | memcpy(p, value, value_size); 93 | p += value_size; 94 | size_t remaining = (8 + 4 + 1 + 8 + 8 + block_size); 95 | size_t write_offset = offset; 96 | char * remaining_data = data; 97 | while (remaining > 0) { 98 | ssize_t count = pwrite(db->kv_fd, remaining_data, remaining, write_offset); 99 | if (count < 0) { 100 | if (allocated != NULL) { 101 | free(allocated); 102 | } 103 | return 0; 104 | } 105 | write_offset += count; 106 | remaining_data += count; 107 | remaining -= count; 108 | } 109 | if (allocated != NULL) { 110 | free(allocated); 111 | } 112 | if (use_new_block) { 113 | uint64_t filesize = ntoh64(* db->kv_filesize); 114 | filesize += 8 + 4 + 1 + 8 + 8 + block_size; 115 | (* db->kv_filesize) = hton64(filesize); 116 | } 117 | 118 | return offset; 119 | } 120 | 
-------------------------------------------------------------------------------- /src/kvblock.h: -------------------------------------------------------------------------------- 1 | // 2 | // kvblock.h 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/2/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 7 | // 8 | 9 | #ifndef kvdb_kvblock_h 10 | #define kvdb_kvblock_h 11 | 12 | #include "kvtypes.h" 13 | 14 | uint64_t kv_block_create(kvdb * db, uint64_t next_block_offset, uint32_t hash_value, 15 | const char * key, size_t key_size, 16 | const char * value, size_t value_size); 17 | 18 | int kv_block_recycle(kvdb * db, uint64_t offset); 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /src/kvbloom.h: -------------------------------------------------------------------------------- 1 | // 2 | // kvbloom.h 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/2/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 7 | // 8 | 9 | #ifndef KVBLOOM_H 10 | #define KVBLOOM_H 11 | 12 | #include "kvmurmurhash.h" 13 | 14 | static inline void table_bloom_filter_set(struct kvdb_table * table, uint32_t * hash_values, 15 | int hash_count) 16 | { 17 | //fprintf(stderr, "----set\n"); 18 | for(unsigned int i = 0 ; i < hash_count ; i ++) { 19 | uint64_t idx = hash_values[i] % ntoh64(* table->kv_bloom_filter_size); 20 | //fprintf(stderr, "%u\n", (unsigned int) idx); 21 | table->kv_bloom_filter[idx / 8] |= 1 << (idx % 8); 22 | } 23 | } 24 | 25 | static inline int table_bloom_filter_might_contain(struct kvdb_table * table, uint32_t * hash_values, 26 | int hash_count) 27 | { 28 | //fprintf(stderr, "----get\n"); 29 | for(unsigned int i = 0 ; i < hash_count ; i ++) { 30 | uint64_t idx = hash_values[i] % ntoh64(* table->kv_bloom_filter_size); 31 | //fprintf(stderr, "%u\n", (unsigned int) idx); 32 | if ((table->kv_bloom_filter[idx / 8] & (1 << (idx % 8))) == 0) { 33 | //fprintf(stderr, "----not found\n"); 34 | return 0; 35 
| } 36 | } 37 | //fprintf(stderr, "----found\n"); 38 | return 1; 39 | } 40 | 41 | static inline void table_bloom_filter_compute_hash(uint32_t * hash_values, unsigned int hash_count, 42 | const char * key, size_t key_size) 43 | { 44 | uint32_t previous_hash_value = 0; 45 | for(unsigned int i = 0 ; i < hash_count ; i ++) { 46 | hash_values[i] = kv_murmur_hash(key, key_size, previous_hash_value); 47 | previous_hash_value = hash_values[i]; 48 | } 49 | } 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/kvdb.c: -------------------------------------------------------------------------------- 1 | #include "kvdb.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | 18 | #include "kvassert.h" 19 | #include "kvendian.h" 20 | #include "kvtypes.h" 21 | #include "kvprime.h" 22 | #include "kvpaddingutils.h" 23 | #include "kvbloom.h" 24 | #include "kvmurmurhash.h" 25 | #include "kvtable.h" 26 | #include "kvblock.h" 27 | 28 | #define MARKER "KVDB" 29 | #define VERSION 5 30 | 31 | static int kvdb_debug = 0; 32 | 33 | static int internal_kvdb_set(kvdb * db, const char * key, size_t key_size, const char * value, size_t value_size); 34 | static int internal_kvdb_get2(kvdb * db, const char * key, size_t key_size, 35 | char ** p_value, size_t * p_value_size, size_t * p_free_size); 36 | static int kvdb_get2(kvdb * db, const char * key, size_t key_size, 37 | char ** p_value, size_t * p_value_size, size_t * p_free_size); 38 | 39 | kvdb * kvdb_new(const char * filename) 40 | { 41 | kvdb * db = malloc(sizeof(* db)); 42 | KVDBAssert(filename != NULL); 43 | db->kv_filename = strdup(filename); 44 | KVDBAssert(db->kv_filename != NULL); 45 | db->kv_fd = -1; 46 | db->kv_opened = 0; 47 | db->kv_firstmaxcount = kv_getnextprime(KV_FIRST_TABLE_MAX_COUNT); 48 | db->kv_compression_type = 
KVDB_COMPRESSION_TYPE_LZ4; 49 | db->kv_filesize = NULL; 50 | db->kv_free_blocks = NULL; 51 | db->kv_first_table = NULL; 52 | db->kv_current_table = NULL; 53 | 54 | return db; 55 | } 56 | 57 | void kvdb_free(kvdb * db) 58 | { 59 | if (db->kv_opened) { 60 | fprintf(stderr, "should be closed before freeing - %s\n", db->kv_filename); 61 | } 62 | free(db->kv_filename); 63 | free(db); 64 | } 65 | 66 | void kvdb_set_compression_type(kvdb * db, int compression_type) 67 | { 68 | if (db->kv_opened) { 69 | return; 70 | } 71 | db->kv_compression_type = compression_type; 72 | } 73 | 74 | int kvdb_get_compression_type(kvdb * db) 75 | { 76 | return db->kv_compression_type; 77 | } 78 | 79 | int kvdb_open(kvdb * db) 80 | { 81 | int r; 82 | struct stat stat_buf; 83 | int create_file = 0; 84 | 85 | if (db->kv_opened) 86 | return -1; 87 | 88 | db->kv_pagesize = getpagesize(); 89 | 90 | db->kv_fd = open(db->kv_filename, O_RDWR | O_CREAT, 0600); 91 | if (db->kv_fd == -1) { 92 | fprintf(stderr, "open failed\n"); 93 | return -1; 94 | } 95 | 96 | r = fstat(db->kv_fd, &stat_buf); 97 | if (r < 0) { 98 | close(db->kv_fd); 99 | // close file. 100 | fprintf(stderr, "fstat failed\n"); 101 | return -1; 102 | } 103 | 104 | uint64_t firstmaxcount = kv_getnextprime(KV_FIRST_TABLE_MAX_COUNT); 105 | uint64_t first_mapping_size = KV_HEADER_SIZE + KV_TABLE_SIZE(firstmaxcount); 106 | 107 | char data[4 + 4 + 8 + 1]; 108 | 109 | if (stat_buf.st_size == 0) { 110 | create_file = 1; 111 | r = ftruncate(db->kv_fd, KV_PAGE_ROUND_UP(db, first_mapping_size)); 112 | if (r < 0) { 113 | close(db->kv_fd); 114 | // close file. 
115 | fprintf(stderr, "truncate failed\n"); 116 | return -1; 117 | } 118 | memcpy(data, MARKER, 4); 119 | h32_to_bytes(&data[4], VERSION); 120 | h64_to_bytes(&data[4 + 4], firstmaxcount); 121 | data[4 + 4 + 8] = db->kv_compression_type; 122 | write(db->kv_fd, data, sizeof(data)); 123 | 124 | kv_table_header_write(db, KV_HEADER_SIZE, firstmaxcount); 125 | } 126 | 127 | char marker[4]; 128 | uint32_t version; 129 | int compression_type; 130 | pread(db->kv_fd, data, sizeof(data), 0); 131 | memcpy(marker, data, 4); 132 | version = bytes_to_h32(&data[4]); 133 | firstmaxcount = bytes_to_h64(&data[4 + 4]); 134 | compression_type = data[4 + 4 + 8]; 135 | 136 | r = memcmp(marker, MARKER, 4); 137 | if (r != 0) { 138 | fprintf(stderr, "file corrupted\n"); 139 | return -1; 140 | } 141 | if (version != VERSION) { 142 | fprintf(stderr, "bad file version\n"); 143 | return -1; 144 | } 145 | 146 | db->kv_firstmaxcount = firstmaxcount; 147 | db->kv_compression_type = compression_type; 148 | db->kv_opened = 1; 149 | 150 | r = kv_tables_setup(db); 151 | if (r < 0) { 152 | fprintf(stderr, "can't map files\n"); 153 | return -1; 154 | } 155 | 156 | char * first_mapping = db->kv_first_table->kv_mapping.kv_bytes; 157 | db->kv_filesize = (uint64_t *) (first_mapping + KV_HEADER_FILESIZE_OFFSET); 158 | db->kv_free_blocks = (uint64_t *) (first_mapping + KV_HEADER_FREELIST_OFFSET); 159 | if (create_file) { 160 | * db->kv_filesize = hton64(first_mapping_size); 161 | } 162 | 163 | return 0; 164 | } 165 | 166 | void kvdb_close(kvdb * db) 167 | { 168 | if (!db->kv_opened) { 169 | return; 170 | } 171 | 172 | kv_tables_unsetup(db); 173 | close(db->kv_fd); 174 | db->kv_opened = 0; 175 | } 176 | 177 | int kvdb_set(kvdb * db, const char * key, size_t key_size, const char * value, size_t value_size) 178 | { 179 | if (db->kv_compression_type == KVDB_COMPRESSION_TYPE_RAW) { 180 | return internal_kvdb_set(db, key, key_size, value, value_size); 181 | } 182 | else if (db->kv_compression_type == 
KVDB_COMPRESSION_TYPE_LZ4) { 183 | if (value_size == 0) { 184 | return internal_kvdb_set(db, key, key_size, value, value_size); 185 | } 186 | else { 187 | int max_compressed_size = LZ4_compressBound((int) value_size); 188 | char * compressed_value = NULL; 189 | int allocated = 0; 190 | if (max_compressed_size < 4096) { 191 | compressed_value = alloca(sizeof(uint32_t) + max_compressed_size); 192 | } 193 | else { 194 | allocated = 1; 195 | compressed_value = malloc(sizeof(uint32_t) + max_compressed_size); 196 | } 197 | * (uint32_t *) compressed_value = htonl(value_size); 198 | int compressed_value_size = LZ4_compress(value, compressed_value + sizeof(uint32_t), (int) value_size); 199 | int r = internal_kvdb_set(db, key, key_size, compressed_value, sizeof(uint32_t) + compressed_value_size); 200 | if (allocated) { 201 | free(compressed_value); 202 | } 203 | return r; 204 | } 205 | } 206 | else { 207 | KVDBAssert(0); 208 | return 0; 209 | } 210 | } 211 | 212 | static int internal_kvdb_set(kvdb * db, const char * key, size_t key_size, const char * value, size_t value_size) 213 | { 214 | uint32_t hash_value[KV_BLOOM_FILTER_HASH_COUNT]; 215 | table_bloom_filter_compute_hash(hash_value, KV_BLOOM_FILTER_HASH_COUNT, key, key_size); 216 | 217 | int r; 218 | r = kvdb_delete(db, key, key_size); 219 | if (r == -1) { 220 | // Not found: ignore. 
221 | } 222 | else if (r == -2) { 223 | return -2; 224 | } 225 | 226 | r = kv_select_table(db); 227 | if (r < 0) { 228 | return -2; 229 | } 230 | struct kvdb_table * table = db->kv_current_table; 231 | 232 | uint32_t idx = hash_value[0] % ntoh64(* table->kv_maxcount); 233 | struct kvdb_item * item = &table->kv_items[idx]; 234 | uint64_t offset = kv_block_create(db, ntoh64(item->kv_offset), hash_value[0], key, key_size, value, value_size); 235 | if (offset == 0) { 236 | return -2; 237 | } 238 | item->kv_offset = hton64(offset); 239 | table_bloom_filter_set(table, hash_value + 1, KV_BLOOM_FILTER_HASH_COUNT - 1); 240 | 241 | uint64_t count; 242 | count = ntoh64(* table->kv_count); 243 | count ++; 244 | * table->kv_count = hton64(count); 245 | 246 | return 0; 247 | } 248 | 249 | #define PRE_READ_KEY_SIZE 128 250 | #define MAX_ALLOCA_SIZE 4096 251 | 252 | static void show_bucket(kvdb * db, uint32_t idx) 253 | { 254 | struct kvdb_table * table = db->kv_first_table; 255 | struct kvdb_item * item = &table->kv_items[idx]; 256 | uint64_t next_offset = ntoh64(item->kv_offset); 257 | 258 | fprintf(stderr, "bucket: %llu\n", (unsigned long long) idx); 259 | 260 | uint64_t previous_offset = 0; 261 | 262 | // Run through all chained blocks in the bucket. 
263 | while (next_offset != 0) { 264 | uint32_t current_hash_value; 265 | uint64_t current_offset; 266 | uint8_t log2_size; 267 | uint64_t current_key_size; 268 | char * current_key; 269 | ssize_t r; 270 | 271 | current_offset = next_offset; 272 | char block_header_data[KV_BLOCK_KEY_BYTES_OFFSET + PRE_READ_KEY_SIZE]; 273 | 274 | r = pread(db->kv_fd, block_header_data, sizeof(block_header_data), (off_t) next_offset); 275 | if (r < 0) 276 | return; 277 | char * p = block_header_data; 278 | next_offset = bytes_to_h64(p); 279 | p += 8; 280 | current_hash_value = bytes_to_h32(p); 281 | p += 4; 282 | log2_size = bytes_to_h8(p); 283 | p += 1; 284 | current_key_size = bytes_to_h64(p); 285 | p += 8; 286 | current_key = block_header_data + KV_BLOCK_KEY_BYTES_OFFSET; 287 | 288 | fprintf(stderr, "previous, current, next: %llu, %llu , %llu\n", (unsigned long long) previous_offset, (unsigned long long) current_offset, (unsigned long long) next_offset); 289 | fprintf(stderr, "hash: %llu\n", (unsigned long long) current_hash_value); 290 | 291 | char * allocated = NULL; 292 | if (current_key_size > PRE_READ_KEY_SIZE) { 293 | if (current_key_size <= MAX_ALLOCA_SIZE) { 294 | current_key = alloca(current_key_size); 295 | } 296 | else { 297 | allocated = malloc((size_t) current_key_size); 298 | current_key = allocated; 299 | } 300 | r = pread(db->kv_fd, current_key, (size_t) current_key_size, (off_t) (current_offset + KV_BLOCK_KEY_BYTES_OFFSET)); 301 | if (r < 0) { 302 | if (allocated != NULL) { 303 | free(allocated); 304 | } 305 | return; 306 | } 307 | } 308 | fprintf(stderr, "key: %.*s\n", (int) current_key_size, current_key); 309 | if (allocated != NULL) { 310 | free(allocated); 311 | } 312 | previous_offset = current_offset; 313 | } 314 | fprintf(stderr, "-----\n"); 315 | } 316 | 317 | static int find_key(kvdb * db, const char * key, size_t key_size, 318 | findkey_callback callback, void * cb_data) 319 | { 320 | uint32_t hash_values[KV_BLOOM_FILTER_HASH_COUNT]; 321 | 
table_bloom_filter_compute_hash(hash_values, KV_BLOOM_FILTER_HASH_COUNT, key, key_size); 322 | 323 | struct find_key_cb_params params; 324 | params.key = key; 325 | params.key_size = key_size; 326 | 327 | // Run through all tables. 328 | struct kvdb_table * table = db->kv_first_table; 329 | while (table != NULL) { 330 | // Is the key likely to be in this table? 331 | // Use a bloom filter to guess. 332 | if (!table_bloom_filter_might_contain(table, hash_values + 1, KV_BLOOM_FILTER_HASH_COUNT - 1)) { 333 | table = table->kv_next_table; 334 | continue; 335 | } 336 | 337 | // Find a bucket. 338 | uint64_t previous_offset = 0; 339 | uint32_t idx = hash_values[0] % ntoh64(* table->kv_maxcount); 340 | struct kvdb_item * item = &table->kv_items[idx]; 341 | uint64_t next_offset = ntoh64(item->kv_offset); 342 | if (kvdb_debug) { 343 | fprintf(stderr, "before\n"); 344 | show_bucket(db, idx); 345 | } 346 | 347 | // Run through all chained blocks in the bucket. 348 | while (next_offset != 0) { 349 | uint32_t current_hash_value; 350 | uint64_t current_offset; 351 | uint8_t log2_size; 352 | uint64_t current_key_size; 353 | char * current_key; 354 | ssize_t r; 355 | 356 | current_offset = next_offset; 357 | char block_header_data[KV_BLOCK_KEY_BYTES_OFFSET + PRE_READ_KEY_SIZE]; 358 | 359 | r = pread(db->kv_fd, block_header_data, sizeof(block_header_data), (off_t) next_offset); 360 | if (r < 0) 361 | return -1; 362 | char * p = block_header_data; 363 | next_offset = bytes_to_h64(p); 364 | p += 8; 365 | current_hash_value = bytes_to_h32(p); 366 | p += 4; 367 | log2_size = bytes_to_h8(p); 368 | p += 1; 369 | current_key_size = bytes_to_h64(p); 370 | p += 8; 371 | current_key = block_header_data + KV_BLOCK_KEY_BYTES_OFFSET; 372 | 373 | if (current_hash_value != hash_values[0]) { 374 | previous_offset = current_offset; 375 | continue; 376 | } 377 | 378 | int cmp_result; 379 | 380 | if (current_key_size != key_size) { 381 | previous_offset = current_offset; 382 | continue; 383 | } 384 | 
char * allocated = NULL; 385 | if (current_key_size > PRE_READ_KEY_SIZE) { 386 | if (current_key_size <= MAX_ALLOCA_SIZE) { 387 | current_key = alloca(current_key_size); 388 | } 389 | else { 390 | allocated = malloc((size_t) current_key_size); 391 | current_key = allocated; 392 | } 393 | r = pread(db->kv_fd, current_key, (size_t) current_key_size, (off_t) (current_offset + KV_BLOCK_KEY_BYTES_OFFSET)); 394 | if (r < 0) { 395 | if (allocated != NULL) { 396 | free(allocated); 397 | } 398 | return -1; 399 | } 400 | } 401 | cmp_result = memcmp(key, current_key, key_size) != 0; 402 | if (allocated != NULL) { 403 | free(allocated); 404 | } 405 | if (cmp_result != 0) { 406 | previous_offset = current_offset; 407 | continue; 408 | } 409 | 410 | params.previous_offset = previous_offset; 411 | params.current_offset = current_offset; 412 | params.next_offset = next_offset; 413 | params.item = item; 414 | params.table_count = table->kv_count; 415 | params.log2_size = log2_size; 416 | 417 | callback(db, ¶ms, cb_data); 418 | 419 | if (kvdb_debug) { 420 | fprintf(stderr, "after\n"); 421 | show_bucket(db, idx); 422 | } 423 | 424 | return 0; 425 | } 426 | table = table->kv_next_table; 427 | } 428 | 429 | return 0; 430 | } 431 | 432 | struct delete_key_params { 433 | int result; 434 | int found; 435 | }; 436 | 437 | static void delete_key_callback(kvdb * db, struct find_key_cb_params * params, 438 | void * data) { 439 | struct delete_key_params * deletekeyparams = data; 440 | ssize_t write_count; 441 | int r; 442 | 443 | if (params->previous_offset == 0) { 444 | params->item->kv_offset = hton64(params->next_offset); 445 | } 446 | else { 447 | uint64_t offset_to_write = hton64(params->next_offset); 448 | write_count = pwrite(db->kv_fd, &offset_to_write, sizeof(offset_to_write), params->previous_offset); 449 | if (write_count < 0) { 450 | deletekeyparams->result = -2; 451 | return; 452 | } 453 | } 454 | r = kv_block_recycle(db, params->current_offset); 455 | if (r < 0) { 456 | 
deletekeyparams->result = -2; 457 | return; 458 | } 459 | 460 | * params->table_count = hton64(ntoh64(* params->table_count) - 1); 461 | deletekeyparams->result = 0; 462 | deletekeyparams->found = 1; 463 | } 464 | 465 | int kvdb_delete(kvdb * db, const char * key, size_t key_size) 466 | { 467 | int r; 468 | struct delete_key_params data; 469 | 470 | data.found = 0; 471 | data.result = -1; 472 | 473 | r = find_key(db, key, key_size, delete_key_callback, &data); 474 | if (r < 0) { 475 | return -2; 476 | } 477 | if (data.result < 0) { 478 | return data.result; 479 | } 480 | if (!data.found) { 481 | return -1; 482 | } 483 | 484 | return 0; 485 | } 486 | 487 | struct read_value_params { 488 | uint64_t value_size; 489 | char * value; 490 | int result; 491 | int found; 492 | size_t free_size; 493 | }; 494 | 495 | static void read_value_callback(kvdb * db, struct find_key_cb_params * params, 496 | void * data) 497 | { 498 | struct read_value_params * readparams = data; 499 | ssize_t r; 500 | 501 | uint64_t value_size; 502 | r = pread(db->kv_fd, &value_size, sizeof(value_size), 503 | params->current_offset + 8 + 4 + 1 + 8 + params->key_size); 504 | if (r < 0) { 505 | readparams->result = -2; 506 | return; 507 | } 508 | 509 | value_size = ntoh64(value_size); 510 | readparams->value_size = value_size; 511 | readparams->value = malloc((size_t) value_size); 512 | 513 | uint64_t remaining = value_size; 514 | char * value_p = readparams->value; 515 | while (remaining > 0) { 516 | ssize_t count = pread(db->kv_fd, value_p, (size_t) remaining, 517 | params->current_offset + 8 + 4 + 1 + 8 + params->key_size + 8); 518 | if (count < 0) { 519 | readparams->result = -2; 520 | free(readparams->value); 521 | readparams->value = NULL; 522 | return; 523 | } 524 | remaining -= count; 525 | value_p += count; 526 | } 527 | 528 | readparams->result = 0; 529 | readparams->found = 1; 530 | readparams->free_size = (1 << params->log2_size) - (value_size + params->key_size); 531 | } 532 | 533 | int 
kvdb_get(kvdb * db, const char * key, size_t key_size, 534 | char ** p_value, size_t * p_value_size) 535 | { 536 | return kvdb_get2(db, key, key_size, p_value, p_value_size, NULL); 537 | } 538 | 539 | static int kvdb_get2(kvdb * db, const char * key, size_t key_size, 540 | char ** p_value, size_t * p_value_size, size_t * p_free_size) 541 | { 542 | if (db->kv_compression_type == KVDB_COMPRESSION_TYPE_RAW) { 543 | return internal_kvdb_get2(db, key, key_size, p_value, p_value_size, p_free_size); 544 | } 545 | else if (db->kv_compression_type == KVDB_COMPRESSION_TYPE_LZ4) { 546 | char * compressed_value; 547 | size_t compressed_value_size; 548 | int r = internal_kvdb_get2(db, key, key_size, &compressed_value, &compressed_value_size, p_free_size); 549 | if (r < 0) { 550 | return r; 551 | } 552 | if (compressed_value_size == 0) { 553 | * p_value = NULL; 554 | * p_value_size = 0; 555 | return 0; 556 | } 557 | 558 | size_t value_size = ntohl(* (uint32_t *) compressed_value); 559 | char * value = malloc(value_size); 560 | LZ4_decompress_fast(compressed_value + sizeof(uint32_t), value, (int) value_size); 561 | free(compressed_value); 562 | if (p_free_size != NULL) { 563 | * p_free_size = 0; 564 | } 565 | * p_value_size = value_size; 566 | * p_value = value; 567 | return 0; 568 | } 569 | else { 570 | KVDBAssert(0); 571 | return 0; 572 | } 573 | } 574 | 575 | static int internal_kvdb_get2(kvdb * db, const char * key, size_t key_size, 576 | char ** p_value, size_t * p_value_size, size_t * p_free_size) 577 | { 578 | int r; 579 | struct read_value_params data; 580 | 581 | data.value_size = 0; 582 | data.value = NULL; 583 | data.result = -1; 584 | data.found = 0; 585 | data.free_size = 0; 586 | 587 | r = find_key(db, key, key_size, read_value_callback, &data); 588 | if (r < 0) { 589 | return -2; 590 | } 591 | if (data.result < 0) { 592 | return data.result; 593 | } 594 | if (!data.found) { 595 | return -1; 596 | } 597 | 598 | if (p_free_size != NULL) { 599 | * p_free_size = 
data.free_size; 600 | } 601 | 602 | * p_value = data.value; 603 | * p_value_size = (size_t) data.value_size; 604 | 605 | return 0; 606 | } 607 | 608 | int kvdb_enumerate_keys(kvdb * db, kvdb_enumerate_callback callback, void * cb_data) 609 | { 610 | struct kvdb_table * table = db->kv_first_table; 611 | struct kvdb_enumerate_cb_params cb_params; 612 | int stop = 0; 613 | 614 | // Run through all tables. 615 | while (table != NULL) { 616 | struct kvdb_item * item = table->kv_items; 617 | // Run through all buckets. 618 | uint64_t count = ntoh64(*table->kv_maxcount); 619 | while (count) { 620 | uint64_t current_offset = ntoh64(item->kv_offset); 621 | // Run through all chained blocks in the bucket. 622 | while (current_offset != 0) { 623 | char block_header_data[KV_BLOCK_KEY_BYTES_OFFSET + PRE_READ_KEY_SIZE]; 624 | ssize_t r = pread(db->kv_fd, block_header_data, sizeof(block_header_data), (off_t) current_offset); 625 | if (r < 0) { 626 | return -2; 627 | } 628 | char * p = block_header_data; 629 | uint64_t next_offset = bytes_to_h64(p); 630 | p += 8+4+1; // ignore hash_value and log2_size 631 | size_t current_key_size = (size_t) bytes_to_h64(p); 632 | p += 8; 633 | char * current_key = block_header_data + KV_BLOCK_KEY_BYTES_OFFSET; 634 | char * allocated = NULL; 635 | if (current_key_size > PRE_READ_KEY_SIZE) { 636 | if (current_key_size <= MAX_ALLOCA_SIZE) { 637 | current_key = alloca(current_key_size); 638 | } 639 | else { 640 | allocated = malloc(current_key_size); 641 | current_key = allocated; 642 | } 643 | r = pread(db->kv_fd, current_key, current_key_size, (off_t) (current_offset + KV_BLOCK_KEY_BYTES_OFFSET)); 644 | if (r < 0) { 645 | if (allocated != NULL) { 646 | free(allocated); 647 | } 648 | return -2; 649 | } 650 | } 651 | cb_params.key = current_key; 652 | cb_params.key_size = current_key_size; 653 | callback(db, &cb_params, cb_data, &stop); 654 | if (allocated != NULL) { 655 | free(allocated); 656 | } 657 | if (stop) { 658 | return 0; 659 | } 660 | 
current_offset = next_offset; 661 | } 662 | item ++; 663 | count --; 664 | } 665 | table = table->kv_next_table; 666 | } 667 | return 0; 668 | } 669 | -------------------------------------------------------------------------------- /src/kvdb.h: -------------------------------------------------------------------------------- 1 | #ifndef KVDB_H 2 | 3 | #define KVDB_H 4 | 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // kvdb is a key-value database. 12 | 13 | typedef struct kvdb kvdb; 14 | 15 | enum { 16 | KVDB_COMPRESSION_TYPE_RAW, 17 | KVDB_COMPRESSION_TYPE_LZ4, 18 | }; 19 | 20 | // creates a kvdb. 21 | kvdb * kvdb_new(const char * filename); 22 | 23 | void kvdb_set_compression_type(kvdb * db, int compression_type); 24 | int kvdb_get_compression_type(kvdb * db); 25 | 26 | // destroy a kvdb. 27 | void kvdb_free(kvdb * db); 28 | 29 | // opens a kvdb. 30 | int kvdb_open(kvdb * db); 31 | 32 | // closes a kvdb. 33 | void kvdb_close(kvdb * db); 34 | 35 | // insert a key / value in the database. 36 | // Returns -2 if there's a I/O error. 37 | int kvdb_set(kvdb * db, const char * key, size_t key_size, 38 | const char * value, size_t value_size); 39 | 40 | // result stored in p_value should be released using free(). 41 | // Returns -1 if item is not found. 42 | // Returns -2 if there's a I/O error. 43 | int kvdb_get(kvdb * db, const char * key, size_t key_size, 44 | char ** p_value, size_t * p_value_size); 45 | 46 | // Returns -1 if item is not found. 47 | // Returns -2 if there's a I/O error. 48 | int kvdb_delete(kvdb * db, const char * key, size_t key_size); 49 | 50 | struct kvdb_enumerate_cb_params { 51 | const char * key; 52 | size_t key_size; 53 | }; 54 | 55 | typedef void kvdb_enumerate_callback(kvdb * db, 56 | struct kvdb_enumerate_cb_params * params, 57 | void * data, int * stop); 58 | 59 | // Returns -2 if there's a I/O error. 
60 | int kvdb_enumerate_keys(kvdb * db, kvdb_enumerate_callback callback, void * cb_data); 61 | 62 | #ifdef __cplusplus 63 | } 64 | #endif 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /src/kvdbo.cpp: -------------------------------------------------------------------------------- 1 | #include "kvdbo.h" 2 | 3 | #include "kvdb.h" 4 | #include "kvendian.h" 5 | #include "kvassert.h" 6 | #include "kvserialization.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | struct kvdbo { 14 | // underlaying kvdb. 15 | kvdb * db; 16 | 17 | // in memory buffers for operations. 18 | std::set pending_keys; 19 | std::set pending_keys_delete; 20 | // node identifier allocation. 21 | uint64_t next_node_id; 22 | 23 | // master node. 24 | // identifiers of the nodes. 25 | std::vector nodes_ids; 26 | // first keys of the nodes. 27 | std::vector nodes_first_keys; 28 | // number of keys in each node. 29 | std::vector nodes_keys_count; 30 | }; 31 | 32 | // iterator over kvdbo. 33 | // it will also cache the keys of the current node. 34 | struct kvdbo_iterator { 35 | kvdbo * db; 36 | // identifier of the node. 37 | uint64_t node_id; 38 | // index of the node. 39 | unsigned int node_index; 40 | // keys in the node. 41 | std::vector keys; 42 | // current key index in the node. 
43 | int key_index; 44 | }; 45 | 46 | #define NODE_PREFIX "n" 47 | 48 | static int flush_pending_keys(kvdbo * db); 49 | static int write_master_node(kvdbo * db); 50 | static int read_master_node(kvdbo * db); 51 | static unsigned int find_node(kvdbo * db, const std::string key); 52 | static unsigned int find_key(kvdbo_iterator * iterator, const std::string key); 53 | static void unserialize_words_list(std::vector & word_list, char * value, size_t size); 54 | static void unserialize_words_set(std::set & word_set, char * value, size_t size, bool clear_words_set); 55 | static void serialize_words_set(std::string & value, std::set & word_set); 56 | static int iterator_load_node(kvdbo_iterator * iterator, uint64_t node_id); 57 | static int add_first_node(kvdbo * db); 58 | static int load_node(struct modified_node * node, unsigned int node_index); 59 | static int load_from_node_id(struct modified_node * node, uint64_t node_id); 60 | static int write_loaded_node(struct modified_node * node); 61 | static int write_single_loaded_node(struct modified_node * node); 62 | static int try_merge(kvdbo * db, unsigned int node_index, bool * pDidMerge); 63 | static int remove_node_id(kvdbo * db, uint64_t node_id); 64 | static int remove_node(kvdbo * db, unsigned int node_index); 65 | static int split_node(kvdbo * db, unsigned int node_index, unsigned int count, 66 | std::set & keys); 67 | 68 | static void show_nodes(kvdbo * db); 69 | 70 | #pragma mark kvdbo data structure management. 71 | 72 | kvdbo * kvdbo_new(const char* filename) 73 | { 74 | kvdbo * db; 75 | db = new kvdbo; 76 | db->db = kvdb_new(filename); 77 | db->next_node_id = 1; 78 | return db; 79 | } 80 | 81 | void kvdbo_free(kvdbo * db) 82 | { 83 | kvdb_free(db->db); 84 | delete db; 85 | } 86 | 87 | #pragma mark opening / closing the database. 
88 | 89 | int kvdbo_open(kvdbo * db) 90 | { 91 | int r = kvdb_open(db->db); 92 | if (r < 0) { 93 | return r; 94 | } 95 | r = read_master_node(db); 96 | if (r < 0) { 97 | kvdbo_close(db); 98 | return r; 99 | } 100 | return 0; 101 | } 102 | 103 | void kvdbo_close(kvdbo * db) 104 | { 105 | db->nodes_keys_count.clear(); 106 | db->nodes_first_keys.clear(); 107 | db->nodes_ids.clear(); 108 | flush_pending_keys(db); 109 | kvdb_close(db->db); 110 | } 111 | 112 | int kvdbo_flush(kvdbo * db) 113 | { 114 | return flush_pending_keys(db); 115 | } 116 | 117 | #pragma mark key insertion / deletion / retrieval. 118 | 119 | const char METAKEY_PREFIX[7] = "\0kvdbo"; 120 | #define METAKEY_PREFIX_SIZE (sizeof(METAKEY_PREFIX) - 1) 121 | 122 | int kvdbo_set(kvdbo * db, 123 | const char * key, 124 | size_t key_size, 125 | const char * value, 126 | size_t value_size) 127 | { 128 | int r; 129 | 130 | std::string key_str(key, key_size); 131 | if (key_str.find(std::string(METAKEY_PREFIX, METAKEY_PREFIX_SIZE)) == 0) { 132 | // invalid key. 133 | return -3; 134 | } 135 | db->pending_keys_delete.erase(key_str); 136 | db->pending_keys.insert(key_str); 137 | r = kvdb_set(db->db, key, key_size, value, value_size); 138 | if (r != 0) { 139 | return r; 140 | } 141 | return 0; 142 | } 143 | 144 | int kvdbo_get(kvdbo * db, 145 | const char * key, 146 | size_t key_size, 147 | char ** p_value, 148 | size_t * p_value_size) 149 | { 150 | if (db->pending_keys_delete.find(std::string(key, key_size)) != db->pending_keys_delete.end()) { 151 | return -1; 152 | } 153 | return kvdb_get(db->db, key, key_size, p_value, p_value_size); 154 | } 155 | 156 | int kvdbo_delete(kvdbo * db, const char* key, size_t key_size) 157 | { 158 | std::string key_str(key, key_size); 159 | db->pending_keys.erase(key_str); 160 | db->pending_keys_delete.insert(key_str); 161 | return kvdb_delete(db->db, key, key_size); 162 | } 163 | 164 | #pragma mark iterator management. 
165 | 166 | kvdbo_iterator * kvdbo_iterator_new(kvdbo * db) 167 | { 168 | kvdbo_iterator * iterator = new kvdbo_iterator; 169 | iterator->key_index = -1; 170 | iterator->db = db; 171 | return iterator; 172 | } 173 | 174 | void kvdbo_iterator_free(kvdbo_iterator * iterator) 175 | { 176 | delete iterator; 177 | } 178 | 179 | void kvdbo_iterator_seek_first(kvdbo_iterator * iterator) 180 | { 181 | if (iterator->db->nodes_ids.size() == 0) { 182 | return; 183 | } 184 | uint64_t node_id = iterator->db->nodes_ids[0]; 185 | int r = iterator_load_node(iterator, node_id); 186 | KVDBAssert(r == 0); 187 | iterator->node_index = 0; 188 | iterator->key_index = 0; 189 | } 190 | 191 | void kvdbo_iterator_seek_last(kvdbo_iterator * iterator) 192 | { 193 | if (iterator->db->nodes_ids.size() == 0) { 194 | return; 195 | } 196 | uint64_t node_id = iterator->db->nodes_ids[iterator->db->nodes_ids.size() - 1]; 197 | int r = iterator_load_node(iterator, node_id); 198 | KVDBAssert(r == 0); 199 | iterator->node_index = (unsigned int) (iterator->db->nodes_ids.size() - 1); 200 | iterator->key_index = (unsigned int) (iterator->keys.size() - 1); 201 | } 202 | 203 | void kvdbo_iterator_seek_after(kvdbo_iterator * iterator, 204 | const char * key, 205 | size_t key_size) 206 | { 207 | if (iterator->db->nodes_ids.size() == 0) { 208 | return; 209 | } 210 | std::string key_string(key, key_size); 211 | unsigned int idx = find_node(iterator->db, key_string); 212 | uint64_t node_id = iterator->db->nodes_ids[idx]; 213 | int r = iterator_load_node(iterator, node_id); 214 | KVDBAssert(r == 0); 215 | iterator->node_index = idx; 216 | iterator->key_index = find_key(iterator, key_string); 217 | while (kvdbo_iterator_is_valid(iterator)) { 218 | const char * current_key; 219 | size_t current_key_len; 220 | kvdbo_iterator_get_key(iterator, ¤t_key, ¤t_key_len); 221 | if (std::string(current_key, current_key_len) >= key_string) { 222 | break; 223 | } 224 | kvdbo_iterator_next(iterator); 225 | } 226 | } 227 | 228 | 
void kvdbo_iterator_next(kvdbo_iterator * iterator) 229 | { 230 | iterator->key_index ++; 231 | if (iterator->key_index < iterator->keys.size()) { 232 | return; 233 | } 234 | 235 | // reached end of the node. 236 | if (iterator->node_index == iterator->db->nodes_ids.size() - 1) { 237 | // was in the last node. 238 | return; 239 | } 240 | iterator->node_index ++; 241 | 242 | uint64_t node_id = iterator->db->nodes_ids[iterator->node_index]; 243 | int r = iterator_load_node(iterator, node_id); 244 | KVDBAssert(r == 0); 245 | iterator->key_index = 0; 246 | } 247 | 248 | void kvdbo_iterator_previous(kvdbo_iterator * iterator) 249 | { 250 | iterator->key_index --; 251 | if (iterator->key_index >= 0) { 252 | return; 253 | } 254 | 255 | // reached beginning of the node. 256 | if (iterator->node_index == 0) { 257 | // was in the first node. 258 | return; 259 | } 260 | iterator->node_index --; 261 | 262 | uint64_t node_id = iterator->db->nodes_ids[iterator->node_index]; 263 | int r= iterator_load_node(iterator, node_id); 264 | KVDBAssert(r == 0); 265 | iterator->key_index = (unsigned int) (iterator->keys.size() - 1); 266 | } 267 | 268 | void kvdbo_iterator_get_key(kvdbo_iterator * iterator, const char ** p_key, size_t * p_key_size) 269 | { 270 | if (!kvdbo_iterator_is_valid(iterator)) { 271 | * p_key = NULL; 272 | * p_key_size = 0; 273 | return; 274 | } 275 | 276 | std::string & key = iterator->keys[iterator->key_index]; 277 | * p_key = key.c_str(); 278 | * p_key_size = key.length(); 279 | } 280 | 281 | int kvdbo_iterator_is_valid(kvdbo_iterator * iterator) 282 | { 283 | return (iterator->key_index != -1) && (iterator->key_index < iterator->keys.size()); 284 | } 285 | 286 | static int iterator_load_node(kvdbo_iterator * iterator, uint64_t node_id) 287 | { 288 | iterator->node_id = node_id; 289 | 290 | // load all keys of the node in memory. 
291 | std::string node_key; 292 | node_key.append(METAKEY_PREFIX, METAKEY_PREFIX_SIZE); 293 | node_key.append(NODE_PREFIX, strlen(NODE_PREFIX)); 294 | uint64_t identifier = hton64(node_id); 295 | node_key.append((const char *) &identifier, sizeof(identifier)); 296 | char * value = NULL; 297 | size_t size = 0; 298 | int r = kvdb_get(iterator->db->db, node_key.c_str(), node_key.length(), &value, &size); 299 | if (r == -1) { 300 | return 0; 301 | } 302 | if (r == -2) { 303 | return -2; 304 | } 305 | // load all nodes in a vector. 306 | unserialize_words_list(iterator->keys, value, size); 307 | free(value); 308 | return 0; 309 | } 310 | 311 | #pragma mark master node reading / writing. 312 | 313 | #define MASTER_NODE_KEY "m" 314 | 315 | static int write_master_node(kvdbo * db) 316 | { 317 | std::string buffer; 318 | kv_encode_uint64(buffer, db->nodes_ids.size()); 319 | for(uint64_t i = 0 ; i < db->nodes_ids.size() ; i ++) { 320 | kv_encode_uint64(buffer, db->nodes_ids[i]); 321 | } 322 | for(uint64_t i = 0 ; i < db->nodes_keys_count.size() ; i ++) { 323 | kv_encode_uint64(buffer, db->nodes_keys_count[i]); 324 | } 325 | for(uint64_t i = 0 ; i < db->nodes_first_keys.size() ; i ++) { 326 | // write first key of the node. 
327 | std::string key = db->nodes_first_keys[i]; 328 | buffer.append(key.c_str(), key.length()); 329 | buffer.push_back(0); 330 | } 331 | std::string master_node_key; 332 | master_node_key.append(METAKEY_PREFIX, METAKEY_PREFIX_SIZE); 333 | master_node_key.append(MASTER_NODE_KEY, strlen(MASTER_NODE_KEY)); 334 | int r = kvdb_set(db->db, master_node_key.c_str(), master_node_key.length(), 335 | buffer.c_str(), buffer.length()); 336 | return r; 337 | } 338 | 339 | static int read_master_node(kvdbo * db) 340 | { 341 | char * value = NULL; 342 | size_t size = 0; 343 | uint64_t max_node_id = 0; 344 | 345 | std::string master_node_key; 346 | master_node_key.append(METAKEY_PREFIX, METAKEY_PREFIX_SIZE); 347 | master_node_key.append(MASTER_NODE_KEY, strlen(MASTER_NODE_KEY)); 348 | int r = kvdb_get(db->db, master_node_key.c_str(), master_node_key.length(), 349 | &value, &size); 350 | if (r == -1) { 351 | return 0; 352 | } 353 | if (r == -2) { 354 | return -2; 355 | } 356 | std::string buffer(value, size); 357 | db->nodes_ids.clear(); 358 | uint64_t count = 0; 359 | size_t position = 0; 360 | position = kv_decode_uint64(buffer, position, &count); 361 | for(uint64_t i = 0 ; i < count ; i ++) { 362 | uint64_t node_id = 0; 363 | position = kv_decode_uint64(buffer, position, &node_id); 364 | db->nodes_ids.push_back(node_id); 365 | if (node_id > max_node_id) { 366 | max_node_id = node_id; 367 | } 368 | } 369 | for(uint64_t i = 0 ; i < count ; i ++) { 370 | uint64_t keys_count = 0; 371 | position = kv_decode_uint64(buffer, position, &keys_count); 372 | db->nodes_keys_count.push_back((uint32_t) keys_count); 373 | } 374 | //size_t remaining = size - (p - value); 375 | size_t remaining = size - position; 376 | unserialize_words_list(db->nodes_first_keys, value + position, remaining); 377 | return 0; 378 | } 379 | 380 | // binary search of a node that should contain the given key. 381 | // returns the index of the node within the given boundaries. 382 | // used by find_node() below. 
383 | static unsigned int find_node_with_boundaries(kvdbo * db, const std::string key, 384 | unsigned int left, unsigned int right) 385 | { 386 | unsigned int middle = (left + right) / 2; 387 | if (key >= db->nodes_first_keys[right]) { 388 | return right; 389 | } 390 | if (left == middle) { 391 | return left; 392 | } 393 | 394 | if (key >= db->nodes_first_keys[middle]) { 395 | return find_node_with_boundaries(db, key, middle, right); 396 | } 397 | else { 398 | return find_node_with_boundaries(db, key, left, middle - 1); 399 | } 400 | } 401 | 402 | // binary search of a node that should contain the given key. 403 | // returns the index of the node. 404 | static unsigned int find_node(kvdbo * db, const std::string key) 405 | { 406 | return find_node_with_boundaries(db, key, 0, (unsigned int) db->nodes_first_keys.size() - 1); 407 | } 408 | 409 | // binary search of a key in the node loaded by the iterator. 410 | // returns the index of the key within the node, in the given range. 411 | // used by find_key() below. 412 | static unsigned int find_key_with_boundaries(kvdbo_iterator * iterator, const std::string key, 413 | unsigned int left, unsigned int right) 414 | { 415 | unsigned int middle = (left + right) / 2; 416 | if (key >= iterator->keys[right]) { 417 | return right; 418 | } 419 | if (left == middle) { 420 | return left; 421 | } 422 | 423 | if (key >= iterator->keys[middle]) { 424 | return find_key_with_boundaries(iterator, key, middle, right); 425 | } 426 | else { 427 | return find_key_with_boundaries(iterator, key, left, middle - 1); 428 | } 429 | } 430 | 431 | // binary search of a key in the node loaded by the iterator. 432 | // returns the index of the key within the node. 433 | static unsigned int find_key(kvdbo_iterator * iterator, const std::string key) 434 | { 435 | return find_key_with_boundaries(iterator, key, 0, (unsigned int) (iterator->keys.size() - 1)); 436 | } 437 | 438 | // unserialize a list of words to a vector. 
439 | static void unserialize_words_list(std::vector & word_list, char * value, size_t size) 440 | { 441 | word_list.clear(); 442 | const char * p = value; 443 | const char * key_start = value; 444 | while (size > 0) { 445 | if (* p == 0) { 446 | // add key. 447 | size_t len = p - key_start; 448 | word_list.push_back(std::string(key_start, len)); 449 | key_start = p + 1; 450 | } 451 | p ++; 452 | size --; 453 | } 454 | } 455 | 456 | // unserialize a list of words to a set. 457 | static void unserialize_words_set(std::set & word_set, char * value, size_t size, bool clear_words_set) 458 | { 459 | if (clear_words_set) { 460 | word_set.clear(); 461 | } 462 | const char * p = value; 463 | const char * key_start = value; 464 | while (size > 0) { 465 | if (* p == 0) { 466 | // add key. 467 | size_t len = p - key_start; 468 | word_set.insert(std::string(key_start, len)); 469 | key_start = p + 1; 470 | } 471 | p ++; 472 | size --; 473 | } 474 | } 475 | 476 | // serialize a list of words stored in a set. 477 | // the result will be stored in the variable value. 478 | static void serialize_words_set(std::string & value, std::set & word_set) 479 | { 480 | std::set::iterator it = word_set.begin(); 481 | while (it != word_set.end()) { 482 | value.append(* it); 483 | value.push_back(0); 484 | it ++; 485 | } 486 | } 487 | 488 | // pending modification to a node. 489 | struct modified_node { 490 | kvdbo * db; 491 | uint64_t node_id; 492 | unsigned int node_index; 493 | std::set keys; 494 | }; 495 | 496 | // flush the pending changes of the keys list in memory. 
497 | static int flush_pending_keys(kvdbo * db) 498 | { 499 | if ((db->pending_keys.size() > 0) && (db->nodes_ids.size() == 0)) { 500 | add_first_node(db); 501 | } 502 | 503 | struct modified_node current_node; 504 | current_node.db = db; 505 | current_node.node_id = 0; 506 | current_node.node_index = -1; 507 | 508 | std::set::iterator addition_it = db->pending_keys.begin(); 509 | std::set::iterator deletion_it = db->pending_keys_delete.begin(); 510 | for(unsigned int node_index = 0 ; node_index < db->nodes_ids.size() ; node_index ++) { 511 | // if it's the last node. 512 | if (node_index == db->nodes_ids.size() - 1) { 513 | // also applies when nodes_ids->size() == 1, node_index == 0 514 | while (deletion_it != db->pending_keys_delete.end()) { 515 | if (current_node.node_index != node_index) { 516 | load_node(¤t_node, node_index); 517 | } 518 | current_node.keys.erase(* deletion_it); 519 | deletion_it ++; 520 | } 521 | while (addition_it != db->pending_keys.end()) { 522 | if (current_node.node_index != node_index) { 523 | load_node(¤t_node, node_index); 524 | } 525 | current_node.keys.insert(* addition_it); 526 | addition_it ++; 527 | } 528 | } 529 | else { 530 | // applies when nodes_ids->size() >= 2 531 | while (deletion_it != db->pending_keys_delete.end()) { 532 | // make sure that we don't reach the boundary of the next node. 533 | if (* deletion_it >= db->nodes_first_keys[node_index + 1]) { 534 | // stop here. 535 | break; 536 | } 537 | if (current_node.node_index != node_index) { 538 | load_node(¤t_node, node_index); 539 | } 540 | current_node.keys.erase(* deletion_it); 541 | deletion_it ++; 542 | } 543 | while (addition_it != db->pending_keys.end()) { 544 | // make sure that we don't reach the boundary of the next node. 545 | if (* addition_it >= db->nodes_first_keys[node_index + 1]) { 546 | // stop here. 
547 | break; 548 | } 549 | if (current_node.node_index != node_index) { 550 | load_node(¤t_node, node_index); 551 | } 552 | current_node.keys.insert(* addition_it); 553 | addition_it ++; 554 | } 555 | } 556 | } 557 | // write the last node. 558 | write_loaded_node(¤t_node); 559 | db->pending_keys.clear(); 560 | db->pending_keys_delete.clear(); 561 | 562 | return 0; 563 | } 564 | 565 | // load the given node in memory. 566 | static int load_node(struct modified_node * node, unsigned int node_index) 567 | { 568 | write_loaded_node(node); 569 | 570 | uint64_t node_id = node->db->nodes_ids[node_index]; 571 | node->node_index = node_index; 572 | node->node_id = node_id; 573 | node->keys.clear(); 574 | 575 | int r = load_from_node_id(node, node_id); 576 | if (r != 0) { 577 | return r; 578 | } 579 | 580 | return r; 581 | } 582 | 583 | // add the keys from the given node to the data structure. 584 | static int load_from_node_id(struct modified_node * node, uint64_t node_id) 585 | { 586 | std::string node_key; 587 | node_key.append(METAKEY_PREFIX, METAKEY_PREFIX_SIZE); 588 | node_key.append(NODE_PREFIX, strlen(NODE_PREFIX)); 589 | uint64_t identifier = hton64(node_id); 590 | node_key.append((const char *) &identifier, sizeof(identifier)); 591 | char * value; 592 | size_t value_size; 593 | int r = kvdb_get(node->db->db, node_key.c_str(), node_key.length(), &value, &value_size); 594 | if (r == -2) { 595 | node->node_index = -1; 596 | return -2; 597 | } 598 | if (r == 0) { 599 | unserialize_words_set(node->keys, value, value_size, false); 600 | free(value); 601 | } 602 | 603 | return 0; 604 | } 605 | 606 | static int remove_node_id(kvdbo * db, uint64_t node_id) 607 | { 608 | std::string node_key; 609 | node_key.append(METAKEY_PREFIX, METAKEY_PREFIX_SIZE); 610 | node_key.append(NODE_PREFIX, strlen(NODE_PREFIX)); 611 | uint64_t identifier = hton64(node_id); 612 | node_key.append((const char *) &identifier, sizeof(identifier)); 613 | int r = kvdb_delete(db->db, node_key.c_str(), 
node_key.length()); 614 | if (r == -1) { 615 | return 0; 616 | } 617 | if (r != 0) { 618 | return r; 619 | } 620 | return 0; 621 | } 622 | 623 | // returns the next usable node identifier. 624 | static uint64_t allocate_node_id(kvdbo * db) 625 | { 626 | uint64_t node_id = db->next_node_id; 627 | db->next_node_id ++; 628 | return node_id; 629 | } 630 | 631 | // create the first node. 632 | static int add_first_node(kvdbo * db) 633 | { 634 | uint64_t node_id = allocate_node_id(db); 635 | db->nodes_ids.push_back(node_id); 636 | db->nodes_first_keys.push_back(""); 637 | db->nodes_keys_count.push_back(0); 638 | int r = write_master_node(db); 639 | if (r != 0) { 640 | return r; 641 | } 642 | return 0; 643 | } 644 | 645 | #define MAX_KEYS_PER_NODE 16384 646 | #define KEYS_PER_NODE_MERGE_THRESHOLD_FACTOR 4 647 | #define KEYS_PER_NODE_MERGE_THRESHOLD (MAX_KEYS_PER_NODE / KEYS_PER_NODE_MERGE_THRESHOLD_FACTOR) 648 | #define MEAN_KEYS_PER_NODE_FACTOR 2 649 | #define MEAN_KEYS_PER_NODE (MAX_KEYS_PER_NODE / MEAN_KEYS_PER_NODE_FACTOR) 650 | 651 | // write the node to disk. 652 | static int write_loaded_node(struct modified_node * node) 653 | { 654 | // not valid. 655 | if (node->node_index == -1) { 656 | return 0; 657 | } 658 | 659 | if (node->keys.size() == 0) { 660 | // if there's no keys. 661 | int r = remove_node(node->db, node->node_index); 662 | // invalidate. 663 | node->node_index = -1; 664 | return r; 665 | } 666 | else if (node->keys.size() > MAX_KEYS_PER_NODE) { 667 | // if there's more keys than the limit, split node. 668 | unsigned int node_index = node->node_index; 669 | // compute the number of nodes to create to replace this one. 670 | unsigned int count = (unsigned int) ((node->keys.size() + MEAN_KEYS_PER_NODE - 1) / MEAN_KEYS_PER_NODE); 671 | int r = split_node(node->db, node_index, count, node->keys); 672 | if (r != 0) { 673 | return r; 674 | } 675 | bool didMerge = false; 676 | // try to merge the last one with the next one. 
677 | r = try_merge(node->db, node_index + count - 1, &didMerge); 678 | if (r != 0) { 679 | return r; 680 | } 681 | // invalidate. 682 | node->node_index = -1; 683 | return 0; 684 | } 685 | else if (node->keys.size() < KEYS_PER_NODE_MERGE_THRESHOLD) { 686 | // if there's a low number of keys. 687 | int r = write_single_loaded_node(node); 688 | if (r != 0) { 689 | return r; 690 | } 691 | 692 | // try to merge node with previous... 693 | unsigned int node_index = node->node_index; 694 | bool didMerge = false; 695 | if (node_index > 0) { 696 | r = try_merge(node->db, node_index - 1, &didMerge); 697 | if (r != 0) { 698 | return r; 699 | } 700 | if (didMerge) { 701 | node_index --; 702 | } 703 | } 704 | // then, with next. 705 | r = try_merge(node->db, node_index, &didMerge); 706 | if (r != 0) { 707 | return r; 708 | } 709 | // invalidate. 710 | node->node_index = -1; 711 | return 0; 712 | } 713 | else { 714 | // in other cases. 715 | int r = write_single_loaded_node(node); 716 | // invalidate. 717 | node->node_index = -1; 718 | return r; 719 | } 720 | return 0; 721 | } 722 | 723 | static int write_single_loaded_node(struct modified_node * node) 724 | { 725 | // write the node. 726 | std::string value; 727 | serialize_words_set(value, node->keys); 728 | std::string node_key; 729 | node_key.append(METAKEY_PREFIX, METAKEY_PREFIX_SIZE); 730 | node_key.append(NODE_PREFIX, strlen(NODE_PREFIX)); 731 | uint64_t identifier = hton64(node->node_id); 732 | node_key.append((const char *) &identifier, sizeof(identifier)); 733 | int r = kvdb_set(node->db->db, node_key.c_str(), node_key.length(), value.c_str(), value.length()); 734 | if (r != 0) { 735 | return r; 736 | } 737 | // update the master node. 
738 | bool changed = false; 739 | if (node->node_id != node->db->nodes_ids[node->node_index]) { 740 | node->db->nodes_ids[node->node_index] = node->node_id; 741 | changed = true; 742 | } 743 | if (node->db->nodes_keys_count[node->node_index] != node->keys.size()) { 744 | node->db->nodes_keys_count[node->node_index] = (uint32_t) node->keys.size(); 745 | changed = true; 746 | } 747 | std::string first_key; 748 | if (node->keys.begin() != node->keys.end()) { 749 | first_key = * node->keys.begin(); 750 | } 751 | if (node->db->nodes_first_keys[node->node_index] != first_key) { 752 | node->db->nodes_first_keys[node->node_index] = first_key; 753 | changed = true; 754 | } 755 | if (changed) { 756 | r = write_master_node(node->db); 757 | if (r != 0) { 758 | return r; 759 | } 760 | } 761 | 762 | return 0; 763 | } 764 | 765 | // try to merge with the next node. 766 | static int try_merge(kvdbo * db, unsigned int node_index, bool * pDidMerge) 767 | { 768 | // there's no next node. 769 | if (node_index + 1 >= db->nodes_ids.size()) { 770 | * pDidMerge = false; 771 | return 0; 772 | } 773 | 774 | // would it make the number of keys larger than the threshold? 775 | if (db->nodes_keys_count[node_index] + db->nodes_keys_count[node_index + 1] > MEAN_KEYS_PER_NODE) { 776 | * pDidMerge = false; 777 | return 0; 778 | } 779 | 780 | struct modified_node current_node; 781 | current_node.db = db; 782 | current_node.node_id = db->nodes_ids[node_index]; 783 | current_node.node_index = node_index; 784 | 785 | // add keys of node at node_index into memory. 786 | int r = load_from_node_id(¤t_node, db->nodes_ids[node_index]); 787 | if (r != 0) { 788 | return r; 789 | } 790 | // add keys of node at (node_index + 1) into memory. 791 | r = load_from_node_id(¤t_node, db->nodes_ids[node_index + 1]); 792 | if (r != 0) { 793 | return r; 794 | } 795 | 796 | // write the result. 
797 | r = write_single_loaded_node(¤t_node); 798 | if (r != 0) { 799 | return r; 800 | } 801 | 802 | //delete current_node.keys; 803 | 804 | // remove the node at (node_index + 1). 805 | r = remove_node(db, node_index + 1); 806 | if (r != 0) { 807 | return r; 808 | } 809 | 810 | * pDidMerge = true; 811 | 812 | return 0; 813 | } 814 | 815 | // remove node at the given index. 816 | static int remove_node(kvdbo * db, unsigned int node_index) 817 | { 818 | int r = remove_node_id(db, db->nodes_ids[node_index]); 819 | if (r != 0) { 820 | return r; 821 | } 822 | db->nodes_ids.erase(db->nodes_ids.begin() + node_index); 823 | db->nodes_first_keys.erase(db->nodes_first_keys.begin() + node_index); 824 | db->nodes_keys_count.erase(db->nodes_keys_count.begin() + node_index); 825 | if (r != 0) { 826 | return r; 827 | } 828 | r = write_master_node(db); 829 | if (r != 0) { 830 | return r; 831 | } 832 | 833 | return 0; 834 | } 835 | 836 | // create 'count' new nodes to replace the given node at node_index. 837 | // the given keys will be used to fill the new nodes. 838 | static int split_node(kvdbo * db, unsigned int node_index, unsigned int count, 839 | std::set & keys) 840 | { 841 | // creates as many nodes as needed for the split. 842 | struct modified_node * nodes = new modified_node[count]; 843 | for(unsigned int i = 0 ; i < count ; i ++) { 844 | nodes[i].db = db; 845 | nodes[i].node_id = allocate_node_id(db); 846 | nodes[i].node_index = node_index + i; 847 | //nodes[i].keys = new std::set(); 848 | } 849 | 850 | // fill the new nodes with keys. 851 | struct modified_node * current_node = &nodes[0]; 852 | unsigned int added_count = 0; 853 | std::set::iterator it = keys.begin(); 854 | while (it != keys.end()) { 855 | if (added_count >= MAX_KEYS_PER_NODE / MEAN_KEYS_PER_NODE_FACTOR) { 856 | current_node ++; 857 | added_count = 0; 858 | } 859 | current_node->keys.insert(* it); 860 | added_count ++; 861 | it ++; 862 | } 863 | 864 | // adjust the master node information. 
865 | int r; 866 | remove_node_id(db, db->nodes_ids[node_index]); 867 | db->nodes_ids.erase(db->nodes_ids.begin() + node_index); 868 | db->nodes_first_keys.erase(db->nodes_first_keys.begin() + node_index); 869 | db->nodes_keys_count.erase(db->nodes_keys_count.begin() + node_index); 870 | db->nodes_ids.insert(db->nodes_ids.begin() + node_index, count, 0); 871 | db->nodes_first_keys.insert(db->nodes_first_keys.begin() + node_index, count, ""); 872 | db->nodes_keys_count.insert(db->nodes_keys_count.begin() + node_index, count, 0); 873 | // write the nodes. 874 | for(unsigned int i = 0 ; i < count ; i ++) { 875 | r = write_single_loaded_node(&nodes[i]); 876 | if (r != 0) { 877 | return r; 878 | } 879 | } 880 | delete [] nodes; 881 | 882 | return 0; 883 | } 884 | 885 | // for debug purpose. 886 | static void show_nodes(kvdbo * db) 887 | { 888 | printf("*******\n"); 889 | printf("node_ids: "); 890 | for(unsigned int i = 0 ; i < db->nodes_ids.size() ; i ++) { 891 | printf("%i ", (int) db->nodes_ids[i]); 892 | } 893 | printf("\n"); 894 | printf("keys: "); 895 | for(unsigned int i = 0 ; i < db->nodes_first_keys.size() ; i ++) { 896 | printf("%s ", db->nodes_first_keys[i].c_str()); 897 | } 898 | printf("\n"); 899 | printf("count: "); 900 | for(unsigned int i = 0 ; i < db->nodes_keys_count.size() ; i ++) { 901 | printf("%i ", (int) db->nodes_keys_count[i]); 902 | } 903 | printf("\n"); 904 | printf("*******\n"); 905 | } -------------------------------------------------------------------------------- /src/kvdbo.h: -------------------------------------------------------------------------------- 1 | #ifndef KVDBO_H 2 | 3 | #define KVDBO_H 4 | 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // kvdbo is like kvdb except it maintains an efficient ordered list of keys. 12 | // It will let you iterate on the list of keys. 13 | 14 | typedef struct kvdbo kvdbo; 15 | 16 | // create a kvdbo. 
kvdbo * kvdbo_new(const char * filename);
// destroy a kvdbo.
void kvdbo_free(kvdbo * db);

// opens a kvdbo.
// Returns 0 on success, a negative value on error.
int kvdbo_open(kvdbo * db);
// closes a kvdbo.
void kvdbo_close(kvdbo * db);

// write pending changes (the keys added / removed since the last flush are
// applied to the ordered list of keys).
int kvdbo_flush(kvdbo * db);

// insert a key / value. if the key already exists, it's replaced.
// Returns -2 if there's an I/O error.
// Returns -3 if the key is invalid (starting with \0kvdbo).
// kvdbo_flush() must be called to write on disk all pending changes.
int kvdbo_set(kvdbo * db, const char * key, size_t key_size,
              const char * value, size_t value_size);

// retrieve the value for the given key.
// result stored in p_value should be released using free().
// Returns -1 if item is not found.
// Returns -2 if there's an I/O error.
// kvdbo_flush() must be called to write on disk all pending changes.
int kvdbo_get(kvdbo * db, const char * key, size_t key_size,
              char ** p_value, size_t * p_value_size);

// remove the given key.
// Returns -1 if item is not found.
// Returns -2 if there's an I/O error.
// kvdbo_flush() must be called to write on disk all pending changes.
int kvdbo_delete(kvdbo * db, const char * key, size_t key_size);

typedef struct kvdbo_iterator kvdbo_iterator;

// create an iterator on the given kvdbo (order is lexicographical).
// The iterator is not valid until one of the seek functions is called.
kvdbo_iterator * kvdbo_iterator_new(kvdbo * db);

// destroy an iterator.
void kvdbo_iterator_free(kvdbo_iterator * iterator);

// seek to the first key.
void kvdbo_iterator_seek_first(kvdbo_iterator * iterator);

// seek to the position of the given key or after.
void kvdbo_iterator_seek_after(kvdbo_iterator * iterator, const char * key, size_t key_size);

// seek to the last key.
void kvdbo_iterator_seek_last(kvdbo_iterator * iterator);

// seek to the next key.
// Convert a 64 bit value to network byte order.
static inline uint64_t hton64(uint64_t val)
{
    // platform already in network byte order?
    if (htonl(1) == 1L)
        return val;
    // swap the two halves and convert each of them.
    uint32_t high = (uint32_t) (val >> 32);
    uint32_t low = (uint32_t) (val & 0xffffffffu);
    return ((uint64_t) htonl(low) << 32) | (uint64_t) htonl(high);
}

// Convert a 64 bit value from network to host byte order.
static inline uint64_t ntoh64(uint64_t val)
{
    // platform already in network byte order?
    if (htonl(1) == 1L)
        return val;
    // swap the two halves and convert each of them.
    uint32_t high = (uint32_t) (val >> 32);
    uint32_t low = (uint32_t) (val & 0xffffffffu);
    return ((uint64_t) ntohl(low) << 32) | (uint64_t) ntohl(high);
}

// The helpers below read / write integers stored in network byte order
// (most significant byte first) at any address. They work byte by byte:
// casting the buffer to an integer pointer requires an aligned address,
// which callers cannot guarantee.

// Read the 64 bit value stored in the 8 bytes at `bytes`.
static inline uint64_t bytes_to_h64(const char * bytes)
{
    const unsigned char * in = (const unsigned char *) bytes;
    uint64_t result = 0;
    for (int i = 0 ; i < 8 ; i ++) {
        result = (result << 8) | in[i];
    }
    return result;
}

// Store the 64 bit value in the 8 bytes at `bytes`.
static inline void h64_to_bytes(char * bytes, uint64_t value)
{
    unsigned char * out = (unsigned char *) bytes;
    for (int i = 0 ; i < 8 ; i ++) {
        out[i] = (unsigned char) (value >> (56 - 8 * i));
    }
}

// Read the 32 bit value stored in the 4 bytes at `bytes`.
static inline uint32_t bytes_to_h32(const char * bytes)
{
    const unsigned char * in = (const unsigned char *) bytes;
    uint32_t result = 0;
    for (int i = 0 ; i < 4 ; i ++) {
        result = (result << 8) | in[i];
    }
    return result;
}

// Store the 32 bit value in the 4 bytes at `bytes`.
static inline void h32_to_bytes(char * bytes, uint32_t value)
{
    unsigned char * out = (unsigned char *) bytes;
    for (int i = 0 ; i < 4 ; i ++) {
        out[i] = (unsigned char) (value >> (24 - 8 * i));
    }
}

// Read the 8 bit value stored at `bytes`.
static inline uint8_t bytes_to_h8(const char * bytes)
{
    return * (const uint8_t *) bytes;
}

// Store the 8 bit value at `bytes`.
static inline void h8_to_bytes(char * bytes, uint8_t value)
{
    * (uint8_t *) bytes = value;
}
//

#ifndef KVMURMURHASH_H
#define KVMURMURHASH_H

// Needed for size_t / uint32_t so that the header can be included on its own.
#include <stddef.h>
#include <stdint.h>

// Computes a 32 bit MurmurHash2-style hash of `length` bytes at `data`,
// mixed with `seed`.
//
// The body is processed 4 bytes at a time (assembled little-endian, so the
// result does not depend on host byte order), then the 1-3 trailing bytes are
// folded in.
//
// NOTE(review): the trailing bytes are read through `data` (plain char), so
// on platforms where char is signed, bytes >= 0x80 are sign-extended before
// being mixed in. This is kept as is on purpose: changing it would change the
// hash of existing keys.
static inline uint32_t kv_murmur_hash(const char * data, size_t length, uint32_t seed)
{
  const uint32_t m = 0x5bd1e995;
  const uint32_t r = 24;
  const unsigned char * bytes = (const unsigned char *) data;

  uint32_t h = seed ^ (uint32_t) length;

  size_t len_4 = length >> 2;

  // size_t index: an int index would overflow (i << 2) on inputs >= 2 GiB.
  for (size_t i = 0; i < len_4; i++) {
    size_t i_4 = i << 2;
    uint32_t k = bytes[i_4 + 3];
    k = k << 8;
    k = k | bytes[i_4 + 2];
    k = k << 8;
    k = k | bytes[i_4 + 1];
    k = k << 8;
    k = k | bytes[i_4 + 0];
    k *= m;
    k ^= k >> r;
    k *= m;
    h *= m;
    h ^= k;
  }

  // avoid calculating modulo
  size_t len_m = len_4 << 2;
  size_t left = length - len_m;

  if (left != 0) {
    if (left >= 3) {
      h ^= (uint32_t) data[length - 3] << 16;
    }
    if (left >= 2) {
      h ^= (uint32_t) data[length - 2] << 8;
    }
    if (left >= 1) {
      h ^= (uint32_t) data[length - 1];
    }

    h *= m;
  }

  // final avalanche
  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}

#endif

/* -------------------------------------------------------------------------------- */
/* /src/kvpaddingutils.h: */
/* -------------------------------------------------------------------------------- */
//
//  kvpaddingutils.h
//  kvdb
//
//  Created by DINH Viêt Hoà on 6/2/13.
//  Copyright (c) 2013 etpan. All rights reserved.
//

#ifndef KVPADDINGUTILS_H
#define KVPADDINGUTILS_H

// NOTE(review): the include target was lost in this copy of the file;
// <stdint.h> provides the uint64_t / UINT64_MAX used below.
#include <stdint.h>

// Returns the smallest power of two that is >= value (1 for value 0 or 1).
// Values above 2^63 cannot be rounded up in 64 bits: the result saturates to
// 2^63 instead of looping forever on the overflowed shift.
static inline uint64_t power2_round_up(uint64_t value)
{
  uint64_t power = 1;
  while (power < value) {
    if (power > UINT64_MAX / 2) {
      // the next shift would wrap to 0
      break;
    }
    power <<= 1;
  }
  return power;
}

// Rounds a block size up to a power of two, with a minimum of 16.
static inline uint64_t block_size_round_up(uint64_t value)
{
  if (value < 16) {
    value = 16;
  }
  return power2_round_up(value);
}

// Returns ceil(log2(value)), i.e. the exponent of the smallest power of two
// that is >= value (0 for value 0 or 1, 64 for values above 2^63).
static inline unsigned int log2_round_up(uint64_t value)
{
  uint64_t power = 1;
  unsigned int log2_value = 0;
  while (power < value) {
    log2_value ++;
    if (power > UINT64_MAX / 2) {
      // 2^64 is not representable; the exponent is already correct.
      break;
    }
    power <<= 1;
  }
  return log2_value;
}

// `db` must point to a structure with an integer kv_pagesize member that is a
// power of two.
#define KV_ULONG_PTR unsigned long
#define KV_PAGE_ROUND_UP(db, x) ( (((KV_ULONG_PTR)(x)) + (db)->kv_pagesize-1) & (~((db)->kv_pagesize-1)) )
#define KV_PAGE_ROUND_DOWN(db, x) ( ((KV_ULONG_PTR)(x)) & (~((db)->kv_pagesize-1)) )
#define KV_BYTE_ROUND_UP(x) ( (((KV_ULONG_PTR)(x)) + 8-1) & (~(8-1)) )

#endif

/* -------------------------------------------------------------------------------- */
/* /src/kvprime.c: */
/* -------------------------------------------------------------------------------- */
//
//  kvprime.c
//  kvdb
//
//  Created by DINH Viêt Hoà on 6/2/13.
//  Copyright (c) 2013 etpan. All rights reserved.
7 | // 8 | 9 | #include "kvprime.h" 10 | 11 | uint64_t kv_getnextprime(uint64_t num) 12 | { 13 | uint64_t prime_numbers[] = { 14 | 1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 43, 47, 53, 59, 61, 71, 79, 83, 15 | 89, 103, 109, 113, 127, 139, 157, 173, 191, 199, 223, 239, 251, 283, 317, 349, 16 | 383, 409, 443, 479, 509, 571, 631, 701, 761, 829, 887, 953, 1021, 1151, 1279, 17 | 1399, 1531, 1663, 1789, 1913, 2039, 2297, 2557, 2803, 3067, 3323, 3583, 3833, 18 | 4093, 4603, 5119, 5623, 6143, 6653, 7159, 7673, 8191, 9209, 10223, 11261, 19 | 12281, 13309, 14327, 15359, 16381, 18427, 20479, 22511, 24571, 26597, 28669, 20 | 30713, 32749, 36857, 40949, 45053, 49139, 53239, 57331, 61417, 65521, 73727, 21 | 81919, 90107, 98299, 106487, 114679, 122869, 131071, 147451, 163819, 180221, 22 | 196597, 212987, 229373, 245759, 262139, 294911, 327673, 360439, 393209, 425977, 23 | 458747, 491503, 524287, 589811, 655357, 720887, 786431, 851957, 917503, 982981, 24 | 1048573, 1179641, 1310719, 1441771, 1572853, 1703903, 1835003, 1966079, 25 | 2097143, 2359267, 2621431, 2883577, 3145721, 3407857, 3670013, 3932153, 26 | 4194301, 4718579, 5242877, 5767129, 6291449, 6815741, 7340009, 7864301, 27 | 8388593, 9437179, 10485751, 11534329, 12582893, 13631477, 14680063, 15728611, 28 | 16777213, 18874367, 20971507, 23068667, 25165813, 27262931, 29360087, 31457269, 29 | 33554393, 37748717, 41943023, 46137319, 50331599, 54525917, 58720253, 62914549, 30 | 67108859, 75497467, 83886053, 92274671, 100663291, 109051903, 117440509, 31 | 125829103, 134217689, 150994939, 167772107, 184549373, 201326557, 218103799, 32 | 234881011, 251658227, 268435399, 301989881, 335544301, 369098707, 402653171, 33 | 436207613, 469762043, 503316469, 536870909, 603979769, 671088637, 738197503, 34 | 805306357, 872415211, 939524087, 1006632947, 1073741789, 1207959503, 35 | 1342177237, 1476394991, 1610612711, 1744830457, 1879048183, 2013265907, 36 | 2576980349, 3092376431, 3710851741, 4718021527, 6133428047, 7973456459, 37 | 
10365493393, 13475141413, 17517683831, 22772988923, 29604885677, 38486351381, 38 | 50032256819, 65041933867, 84554514043, 109920868241, 153889215497, 0 39 | }; 40 | static int count = sizeof(prime_numbers) / sizeof(prime_numbers[0]); 41 | 42 | for(int i = 0 ; i < count ; i ++){ 43 | if (num <= prime_numbers[i]) 44 | return prime_numbers[i]; 45 | } 46 | return prime_numbers[count - 1]; 47 | } 48 | -------------------------------------------------------------------------------- /src/kvprime.h: -------------------------------------------------------------------------------- 1 | // 2 | // kvprime.h 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/2/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 7 | // 8 | 9 | #ifndef KVPRIME_H 10 | #define KVPRIME_H 11 | 12 | #include 13 | 14 | uint64_t kv_getnextprime(uint64_t num); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/kvserialization.cpp: -------------------------------------------------------------------------------- 1 | #include "kvserialization.h" 2 | 3 | #include 4 | 5 | void kv_encode_uint64(std::string & buffer, uint64_t value) 6 | { 7 | char valuestr[10]; 8 | int len = 0; 9 | while (1) { 10 | unsigned char remainder = value & 0x7f; 11 | value = value >> 7; 12 | if (value == 0) { 13 | // last item to write. 
14 | valuestr[len] = remainder; 15 | len ++; 16 | break; 17 | } 18 | else { 19 | valuestr[len] = remainder | 0x80; 20 | len ++; 21 | } 22 | } 23 | buffer.append(valuestr, len); 24 | } 25 | 26 | size_t kv_decode_uint64(std::string & buffer, size_t position, uint64_t * p_value) 27 | { 28 | uint64_t value = 0; 29 | int s = 0; 30 | 31 | while (1) { 32 | unsigned char remainder = buffer[position]; 33 | position ++; 34 | value += ((uint64_t) remainder & 0x7f) << s; 35 | if ((remainder & 0x80) == 0) { 36 | break; 37 | } 38 | s += 7; 39 | } 40 | 41 | * p_value = value; 42 | 43 | return position; 44 | } 45 | -------------------------------------------------------------------------------- /src/kvserialization.h: -------------------------------------------------------------------------------- 1 | #ifndef KVSERIALIZATION_H 2 | 3 | #define KVSERIALIZATION_H 4 | 5 | #include 6 | 7 | void kv_encode_uint64(std::string & buffer, uint64_t value); 8 | size_t kv_decode_uint64(std::string & buffer, size_t position, uint64_t * p_value); 9 | 10 | #endif 11 | 12 | -------------------------------------------------------------------------------- /src/kvtable.c: -------------------------------------------------------------------------------- 1 | // 2 | // table.c 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/2/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 
7 | // 8 | 9 | #include "kvtable.h" 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "kvtypes.h" 17 | #include "kvprime.h" 18 | #include "kvpaddingutils.h" 19 | 20 | static int map_table(kvdb * db, struct kvdb_table ** result, uint64_t offset, int is_first); 21 | static int mapping_setup(struct kvdb_mapping * mapping, int fd, off_t offset, size_t size); 22 | static void mapping_unsetup(struct kvdb_mapping * mapping); 23 | static void unmap_table(struct kvdb_table * table); 24 | 25 | int kv_table_header_write(kvdb * db, uint64_t table_start, uint64_t maxcount) 26 | { 27 | uint64_t bloomsize = kv_getnextprime(maxcount * KV_TABLE_BITS_FOR_BLOOM_FILTER); 28 | char data[KV_TABLE_HEADER_SIZE]; 29 | bzero(data, KV_TABLE_HEADER_SIZE); 30 | h64_to_bytes(&data[KV_TABLE_BLOOM_SIZE_OFFSET], bloomsize); 31 | h64_to_bytes(&data[KV_TABLE_MAX_COUNT_OFFSET], maxcount); 32 | ssize_t r; 33 | r = pwrite(db->kv_fd, data, KV_TABLE_HEADER_SIZE, table_start); 34 | if (r < 0) 35 | return -1; 36 | return 0; 37 | } 38 | 39 | int kv_tables_setup(kvdb * db) 40 | { 41 | map_table(db, &db->kv_first_table, KV_HEADER_SIZE, 1); 42 | return 0; 43 | } 44 | 45 | void kv_tables_unsetup(kvdb * db) 46 | { 47 | unmap_table(db->kv_first_table); 48 | } 49 | 50 | uint64_t kv_table_create(kvdb * db, uint64_t size, struct kvdb_table ** result) 51 | { 52 | //fprintf(stderr, "create table %llu", (unsigned long long) size); 53 | uint64_t mapping_size = KV_TABLE_SIZE(size); 54 | uint64_t offset = ntoh64(* db->kv_filesize); 55 | int r; 56 | r = ftruncate(db->kv_fd, offset + mapping_size); 57 | if (r < 0) 58 | return 0; 59 | uint64_t filesize = ntoh64(* db->kv_filesize); 60 | filesize += mapping_size; 61 | r = kv_table_header_write(db, offset, size); 62 | if (r < 0) 63 | return 0; 64 | r = map_table(db, result, offset, 0); 65 | if (r < 0) 66 | return 0; 67 | 68 | // When everything succeeded, update file size 69 | * db->kv_filesize = hton64(filesize); 70 | 71 | return offset; 72 | } 73 
| 74 | static int map_table(kvdb * db, struct kvdb_table ** result, uint64_t offset, int is_first) 75 | { 76 | struct kvdb_table * table; 77 | uint64_t maxcount; 78 | ssize_t read_result; 79 | char data[8]; 80 | int r; 81 | off_t pre_page_align_size; 82 | 83 | table = calloc(1, sizeof(* table)); 84 | if (is_first) { 85 | pre_page_align_size = KV_HEADER_SIZE; 86 | } 87 | else { 88 | off_t mapping_offset = KV_PAGE_ROUND_DOWN(db, offset); 89 | pre_page_align_size = offset - mapping_offset; 90 | } 91 | 92 | read_result = pread(db->kv_fd, data, 8, offset + KV_TABLE_MAX_COUNT_OFFSET); 93 | if (read_result < 0) { 94 | return -1; 95 | } 96 | maxcount = bytes_to_h64(data); 97 | uint64_t mapping_size = pre_page_align_size + KV_TABLE_SIZE(maxcount); 98 | r = mapping_setup(&table->kv_mapping, db->kv_fd, offset - pre_page_align_size, (size_t) mapping_size); 99 | if (r < 0) { 100 | return -1; 101 | } 102 | table->kv_table_start = table->kv_mapping.kv_bytes + pre_page_align_size; 103 | 104 | table->kv_items = (struct kvdb_item *) (table->kv_table_start + KV_TABLE_ITEMS_OFFSET_OFFSET(maxcount)); 105 | table->kv_next_table_offset = (uint64_t *) (table->kv_table_start + KV_TABLE_NEXT_TABLE_OFFSET_OFFSET); 106 | table->kv_count = (uint64_t *) (table->kv_table_start + KV_TABLE_COUNT_OFFSET); 107 | table->kv_bloom_filter_size = (uint64_t *) (table->kv_table_start + KV_TABLE_BLOOM_SIZE_OFFSET); 108 | table->kv_maxcount = (uint64_t *) (table->kv_table_start + KV_TABLE_MAX_COUNT_OFFSET); 109 | table->kv_bloom_filter = (uint8_t *) (table->kv_table_start + KV_TABLE_BLOOM_FILTER_OFFSET); 110 | 111 | * result = table; 112 | 113 | if (* table->kv_next_table_offset != 0) { 114 | r = map_table(db, &table->kv_next_table, ntoh64(* table->kv_next_table_offset), 0); 115 | if (r < 0) { 116 | return -1; 117 | } 118 | } 119 | else { 120 | table->kv_next_table = NULL; 121 | } 122 | 123 | return 0; 124 | } 125 | 126 | static void unmap_table(struct kvdb_table * table) 127 | { 128 | if (table == NULL) 129 
| return; 130 | 131 | struct kvdb_table * next_table = table->kv_next_table; 132 | mapping_unsetup(&table->kv_mapping); 133 | free(table); 134 | 135 | unmap_table(next_table); 136 | } 137 | 138 | static int mapping_setup(struct kvdb_mapping * mapping, int fd, off_t offset, size_t size) 139 | { 140 | mapping->kv_bytes = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset); 141 | if (mapping->kv_bytes == MAP_FAILED) { 142 | return -1; 143 | } 144 | mapping->kv_size = size; 145 | 146 | return 0; 147 | } 148 | 149 | static void mapping_unsetup(struct kvdb_mapping * mapping) 150 | { 151 | if (mapping->kv_bytes == NULL) { 152 | return; 153 | } 154 | 155 | int r; 156 | r = munmap(mapping->kv_bytes, mapping->kv_size); 157 | if (r < 0) { 158 | fprintf(stderr, "Could not unmap memory\n"); 159 | } 160 | mapping->kv_bytes = NULL; 161 | mapping->kv_size = 0; 162 | } 163 | -------------------------------------------------------------------------------- /src/kvtable.h: -------------------------------------------------------------------------------- 1 | // 2 | // kvtable.h 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/2/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 
7 | // 8 | 9 | #ifndef kvdb_kvtable_h 10 | #define kvdb_kvtable_h 11 | 12 | #include 13 | 14 | #include "kvtypes.h" 15 | #include "kvendian.h" 16 | #include "kvprime.h" 17 | 18 | int kv_table_header_write(kvdb * db, uint64_t table_start, uint64_t maxcount); 19 | uint64_t kv_table_create(kvdb * db, uint64_t size, struct kvdb_table ** result); 20 | 21 | int kv_tables_setup(kvdb * db); 22 | void kv_tables_unsetup(kvdb * db); 23 | 24 | static inline int kv_select_table(kvdb * db) 25 | { 26 | if (db->kv_current_table == NULL) { 27 | db->kv_current_table = db->kv_first_table; 28 | } 29 | 30 | //fprintf(stderr, "count %i\n", (int) (* db->kv_current_table->kv_count)); 31 | while (ntoh64(* db->kv_current_table->kv_count) > ntoh64(* db->kv_current_table->kv_maxcount) * KV_MAX_MEAN_COLLISION) { 32 | if (db->kv_current_table->kv_next_table == NULL) { 33 | uint64_t nextsize = kv_getnextprime(ntoh64(* db->kv_current_table->kv_maxcount) * 2); 34 | uint64_t offset = kv_table_create(db, nextsize, &db->kv_current_table->kv_next_table); 35 | if (offset == 0) { 36 | return -1; 37 | } 38 | * db->kv_current_table->kv_next_table_offset = hton64(offset); 39 | } 40 | 41 | db->kv_current_table = db->kv_current_table->kv_next_table; 42 | } 43 | 44 | return 0; 45 | } 46 | 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/kvtypes.h: -------------------------------------------------------------------------------- 1 | // 2 | // kvtypes.h 3 | // kvdb 4 | // 5 | // Created by DINH Viêt Hoà on 6/1/13. 6 | // Copyright (c) 2013 etpan. All rights reserved. 
7 | // 8 | 9 | #ifndef KVTYPES_H 10 | #define KVTYPES_H 11 | 12 | #include 13 | #include 14 | 15 | #include "kvdb.h" 16 | 17 | #define KV_HEADER_SIZE (4 + 4 + 8 + 1 + 8 + 64 * 8) 18 | #define KV_HEADER_MARKER_OFFSET 0 19 | #define KV_HEADER_VERSION_OFFSET 4 20 | #define KV_HEADER_FIRSTMAXCOUNT_OFFSET (4 + 4) 21 | #define KV_HEADER_FILESIZE_OFFSET (4 + 4 + 8 + 1) 22 | #define KV_HEADER_FREELIST_OFFSET (4 + 4 + 8 + 1 + 8) 23 | 24 | // 1. marker 4 bytes 25 | // 2. version 4 bytes 26 | // 3. first table max count 8 bytes 27 | // 4. storage type 1 byte 28 | // 5. recycled blocks offset (for each size) 64 * 8 bytes 29 | 30 | /* 31 | table: 32 | 1. next offset: 8 bytes 33 | 2. count: 8 bytes 34 | 3. bloom_size: 8 bytes 35 | 4. maxcount 8 bytes 36 | 5. bloom filter table BLOOM_FILTER_SIZE(size) bytes 37 | 6. offset to items (actual hash table) maxcount items of 8 bytes 38 | 39 | table mapping size: 8 + 8 + 8 + 8 + BLOOM_FILTER_SIZE(maxcount) + (maxcount * 8) 40 | */ 41 | 42 | #define KV_TABLE_NEXT_TABLE_OFFSET_OFFSET 0 43 | #define KV_TABLE_COUNT_OFFSET 8 44 | #define KV_TABLE_BLOOM_SIZE_OFFSET 16 45 | #define KV_TABLE_MAX_COUNT_OFFSET 24 46 | #define KV_TABLE_BLOOM_FILTER_OFFSET 32 47 | #define KV_TABLE_ITEMS_OFFSET_OFFSET(maxcount) (KV_TABLE_HEADER_SIZE + KV_TABLE_BLOOM_FILTER_SIZE(maxcount)) 48 | 49 | #define KV_TABLE_HEADER_SIZE (8 + 8 + 8 + 8) 50 | 51 | #define KV_TABLE_SIZE(maxcount) (KV_TABLE_HEADER_SIZE + KV_TABLE_BLOOM_FILTER_SIZE(maxcount) + maxcount * 8) 52 | #define KV_FIRST_TABLE_MAX_COUNT (1 << 17) 53 | 54 | #define KV_TABLE_BITS_FOR_BLOOM_FILTER 5 55 | #define KV_TABLE_BLOOM_FILTER_SIZE(maxcount) (KV_BYTE_ROUND_UP(kv_getnextprime(maxcount * KV_TABLE_BITS_FOR_BLOOM_FILTER)) / 8) 56 | #define KV_BLOOM_FILTER_HASH_COUNT 3 57 | 58 | #define KV_MAX_MEAN_COLLISION 3 59 | 60 | /* 61 | block: 62 | 1. next offset 8 bytes 63 | 2. hash_value 4 bytes 64 | 3. key size 8 bytes 65 | 4. key bytes variable length 66 | 5. data size 8 bytes 67 | 6. 
data bytes variable length 68 | */ 69 | 70 | #define KV_BLOCK_NEXT_OFFSET_OFFSET 0 71 | #define KV_BLOCK_HASH_VALUE_OFFSET 8 72 | #define KV_BLOCK_LOG2SIZE_OFFSET 9 73 | #define KV_BLOCK_KEY_SIZE_OFFSET 13 74 | #define KV_BLOCK_KEY_BYTES_OFFSET 21 75 | 76 | struct kvdb_mapping { 77 | char * kv_bytes; 78 | size_t kv_size; 79 | }; 80 | 81 | struct kvdb { 82 | char * kv_filename; 83 | int kv_pagesize; 84 | int kv_fd; 85 | int kv_opened; 86 | uint64_t kv_firstmaxcount; 87 | int kv_compression_type; 88 | uint64_t * kv_filesize; // host order 89 | uint64_t * kv_free_blocks; // host order 90 | struct kvdb_table * kv_first_table; 91 | struct kvdb_table * kv_current_table; 92 | }; 93 | 94 | struct kvdb_item { 95 | // host order 96 | uint64_t kv_offset; 97 | }; 98 | 99 | struct kvdb_table { 100 | struct kvdb_mapping kv_mapping; 101 | char * kv_table_start; 102 | struct kvdb_item * kv_items; 103 | uint64_t * kv_bloom_filter_size; // host order 104 | uint8_t * kv_bloom_filter; 105 | uint64_t * kv_next_table_offset; // host order 106 | uint64_t * kv_count; // host order 107 | uint64_t * kv_maxcount; // host order 108 | struct kvdb_table * kv_next_table; 109 | }; 110 | 111 | struct find_key_cb_params { 112 | const char * key; 113 | size_t key_size; 114 | uint64_t previous_offset; 115 | uint64_t current_offset; 116 | uint64_t next_offset; 117 | struct kvdb_item * item; 118 | uint64_t * table_count; 119 | size_t log2_size; 120 | }; 121 | 122 | typedef void findkey_callback(kvdb * db, struct find_key_cb_params * params, 123 | void * data); 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /src/kvunicode.c: -------------------------------------------------------------------------------- 1 | #include "kvunicode.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #if __APPLE__ 8 | #include 9 | #endif 10 | 11 | #include "ConvertUTF.h" 12 | 13 | #if !__APPLE__ 14 | // Transliteration helpers. 
15 | 16 | typedef struct XReplaceable { 17 | UChar* text; /* MUST BE null-terminated */ 18 | } XReplaceable; 19 | 20 | static void InitXReplaceable(XReplaceable* rep, const UChar* str, int length) 21 | { 22 | if (length == 0) { 23 | length = u_strlen(str); 24 | } 25 | rep->text = (UChar*) malloc(sizeof(* rep->text) * (length + 1)); 26 | rep->text[length] = 0; 27 | u_strncpy(rep->text, str, length); 28 | } 29 | 30 | static void FreeXReplaceable(XReplaceable* rep) 31 | { 32 | if (rep->text != NULL) { 33 | free(rep->text); 34 | rep->text = NULL; 35 | } 36 | } 37 | 38 | /* UReplaceableCallbacks callback */ 39 | static int32_t Xlength(const UReplaceable* rep) 40 | { 41 | const XReplaceable* x = (const XReplaceable*)rep; 42 | return u_strlen(x->text); 43 | } 44 | 45 | /* UReplaceableCallbacks callback */ 46 | static UChar XcharAt(const UReplaceable* rep, int32_t offset) 47 | { 48 | const XReplaceable* x = (const XReplaceable*)rep; 49 | return x->text[offset]; 50 | } 51 | 52 | /* UReplaceableCallbacks callback */ 53 | static UChar32 Xchar32At(const UReplaceable* rep, int32_t offset) 54 | { 55 | const XReplaceable* x = (const XReplaceable*)rep; 56 | return x->text[offset]; 57 | } 58 | 59 | /* UReplaceableCallbacks callback */ 60 | static void Xreplace(UReplaceable* rep, int32_t start, int32_t limit, 61 | const UChar* text, int32_t textLength) 62 | { 63 | XReplaceable* x = (XReplaceable*)rep; 64 | int32_t newLen = Xlength(rep) + limit - start + textLength; 65 | UChar* newText = (UChar*) malloc(sizeof(UChar) * (newLen+1)); 66 | u_strncpy(newText, x->text, start); 67 | u_strncpy(newText + start, text, textLength); 68 | u_strcpy(newText + start + textLength, x->text + limit); 69 | free(x->text); 70 | x->text = newText; 71 | } 72 | 73 | /* UReplaceableCallbacks callback */ 74 | static void Xcopy(UReplaceable* rep, int32_t start, int32_t limit, int32_t dest) 75 | { 76 | XReplaceable* x = (XReplaceable*)rep; 77 | int32_t newLen = Xlength(rep) + limit - start; 78 | UChar* newText 
= (UChar*) malloc(sizeof(UChar) * (newLen+1)); 79 | u_strncpy(newText, x->text, dest); 80 | u_strncpy(newText + dest, x->text + start, limit - start); 81 | u_strcpy(newText + dest + limit - start, x->text + dest); 82 | free(x->text); 83 | x->text = newText; 84 | } 85 | 86 | /* UReplaceableCallbacks callback */ 87 | static void Xextract(UReplaceable* rep, int32_t start, int32_t limit, UChar* dst) 88 | { 89 | XReplaceable* x = (XReplaceable*)rep; 90 | int32_t len = limit - start; 91 | u_strncpy(dst, x->text, len); 92 | } 93 | 94 | static void InitXReplaceableCallbacks(UReplaceableCallbacks* callbacks) 95 | { 96 | callbacks->length = Xlength; 97 | callbacks->charAt = XcharAt; 98 | callbacks->char32At = Xchar32At; 99 | callbacks->replace = Xreplace; 100 | callbacks->extract = Xextract; 101 | callbacks->copy = Xcopy; 102 | } 103 | 104 | 105 | // init and deinit. 106 | 107 | static UReplaceableCallbacks s_xrepVtable; 108 | static UTransliterator * s_trans = NULL; 109 | static pthread_mutex_t s_lock = PTHREAD_MUTEX_INITIALIZER; 110 | static int s_initialized = 0; 111 | static int pthread_once_t s_once = PTHREAD_ONCE_INIT; 112 | 113 | static void kv_unicode_init(void) 114 | { 115 | pthread_mutex_lock(&s_lock); 116 | if (!s_initialized) { 117 | UChar urules[1024]; 118 | UErrorCode status = U_ZERO_ERROR; 119 | u_strFromUTF8(urules, sizeof(urules), NULL, "Any-Latin; NFD; Lower; [:nonspacing mark:] remove; nfc", -1, &status); 120 | LIDX_ASSERT(status == U_ZERO_ERROR); 121 | 122 | UParseError parseError; 123 | s_trans = utrans_openU(urules, -1, UTRANS_FORWARD, 124 | NULL, -1, &parseError, &status); 125 | LIDX_ASSERT(status == U_ZERO_ERROR); 126 | 127 | InitXReplaceableCallbacks(&s_xrepVtable); 128 | s_initialized = 1; 129 | } 130 | pthread_mutex_unlock(&s_lock); 131 | } 132 | 133 | static void kv_unicode_deinit(void) 134 | { 135 | utrans_close(s_trans); 136 | } 137 | #endif 138 | 139 | unsigned int kv_u_get_length(const UChar * word) 140 | { 141 | unsigned int length = 0; 142 | 
while (* word != 0) { 143 | word ++; 144 | length ++; 145 | } 146 | return length; 147 | } 148 | 149 | // UTF <-> UTF16 150 | 151 | UChar * kv_from_utf8(const char * word) 152 | { 153 | size_t len = strlen(word); 154 | const UTF8 * source = (const UTF8 *) word; 155 | UTF16 * target = (UTF16 *) malloc((len + 1) * sizeof(* target)); 156 | UTF16 * targetStart = target; 157 | ConvertUTF8toUTF16(&source, source + len, 158 | &targetStart, targetStart + len, lenientConversion); 159 | unsigned int utf16length = (unsigned int) (targetStart - target); 160 | target[utf16length] = 0; 161 | return (UChar *) target; 162 | } 163 | 164 | char * kv_to_utf8(const UChar * word) 165 | { 166 | unsigned int len = kv_u_get_length(word); 167 | const UTF16 * source = (const UTF16 *) word; 168 | UTF8 * target = (UTF8 *) malloc(len * 6 + 1); 169 | UTF8 * targetStart = target; 170 | ConvertUTF16toUTF8(&source, source + len, 171 | &targetStart, targetStart + len * 6 + 1, lenientConversion); 172 | unsigned int utf8length = (unsigned int) (targetStart - target); 173 | target[utf8length] = 0; 174 | return (char *) target; 175 | } 176 | 177 | // transliterate to ASCII 178 | 179 | char * kv_transliterate(const UChar * text, int length) 180 | { 181 | #if __APPLE__ 182 | if (length == -1) { 183 | length = kv_u_get_length(text); 184 | } 185 | 186 | int is_ascii = 1; 187 | const UChar * p = text; 188 | for(int i = 0 ; i < length ; i ++) { 189 | if ((* p < 32) || (* p >= 127)) { 190 | //if (!isalnum(* p)) { 191 | is_ascii = 0; 192 | break; 193 | } 194 | p ++; 195 | } 196 | 197 | if (is_ascii) { 198 | char * result = malloc(length + 1); 199 | char * q = result; 200 | for(int i = 0 ; i < length ; i ++) { 201 | * q = tolower(text[i]); 202 | q ++; 203 | } 204 | * q = 0; 205 | return result; 206 | } 207 | 208 | CFMutableStringRef cfStr = CFStringCreateMutable(NULL, 0); 209 | CFStringAppendCharacters(cfStr, (const UniChar *) text, length); 210 | CFStringTransform(cfStr, NULL, CFSTR("Any-Latin; NFD; Lower; 
[:nonspacing mark:] remove; nfc"), false); 211 | CFIndex resultLength = CFStringGetLength(cfStr); 212 | char * buffer = (char *) malloc(resultLength + 1); 213 | buffer[resultLength] = 0; 214 | CFStringGetCString(cfStr, buffer, resultLength + 1, kCFStringEncodingUTF8); 215 | CFRelease(cfStr); 216 | return buffer; 217 | #else 218 | if (length == -1) { 219 | length = u_strlen(text); 220 | } 221 | 222 | pthread_once(&s_once, kv_unicode_init); 223 | 224 | XReplaceable xrep; 225 | InitXReplaceable(&xrep, text, length); 226 | UErrorCode status = U_ZERO_ERROR; 227 | 228 | int32_t limit = length; 229 | utrans_trans(s_trans, (UReplaceable *) &xrep, &s_xrepVtable, 0, &limit, &status); 230 | if (status != U_ZERO_ERROR) { 231 | goto free_xrep; 232 | } 233 | 234 | char * result = lidx_to_utf8(xrep.text); 235 | FreeXReplaceable(&xrep); 236 | 237 | return result; 238 | 239 | free_xrep: 240 | FreeXReplaceable(&xrep); 241 | err: 242 | return NULL; 243 | #endif 244 | } 245 | -------------------------------------------------------------------------------- /src/kvunicode.h: -------------------------------------------------------------------------------- 1 | #ifndef KVUNICODE_H 2 | 3 | #define KVUNICODE_H 4 | 5 | #if !__APPLE__ 6 | #include "unicode/utypes.h" 7 | #include "unicode/uloc.h" 8 | #include "unicode/utext.h" 9 | #include "unicode/localpointer.h" 10 | #include "unicode/parseerr.h" 11 | #include "unicode/ubrk.h" 12 | #include "unicode/urep.h" 13 | #include "unicode/utrans.h" 14 | #include "unicode/parseerr.h" 15 | #include "unicode/uenum.h" 16 | #include "unicode/uset.h" 17 | #include "unicode/putil.h" 18 | #include "unicode/uiter.h" 19 | #include "unicode/ustring.h" 20 | #else 21 | #if defined(__CHAR16_TYPE__) 22 | typedef __CHAR16_TYPE__ UChar; 23 | #else 24 | typedef uint16_t UChar; 25 | #endif 26 | #endif 27 | 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | unsigned int kv_u_get_length(const UChar * word); 33 | UChar * kv_from_utf8(const char * word); 34 | 
char * kv_to_utf8(const UChar * word); 35 | char * kv_transliterate(const UChar * text, int length); 36 | 37 | #ifdef __cplusplus 38 | } 39 | #endif 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /src/sfts.cpp: -------------------------------------------------------------------------------- 1 | #include "sfts.h" 2 | 3 | #include 4 | 5 | #include "kvdbo.h" 6 | 7 | #include "kvunicode.h" 8 | #include "kvserialization.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #if __APPLE__ 16 | #include 17 | #endif 18 | 19 | static int db_put(sfts * index, std::string & key, std::string & value); 20 | static int db_get(sfts * index, std::string & key, std::string * p_value); 21 | static int db_delete(sfts * index, std::string & key); 22 | static int db_flush(sfts * index); 23 | static int tokenize(sfts * index, uint64_t doc, const UChar * text); 24 | static int add_to_indexer(sfts * index, uint64_t doc, const char * word, 25 | std::set & wordsids_set); 26 | 27 | // . 
-> next word id 28 | // ,[docid] -> [words ids] 29 | // /[word id] -> word 30 | // word -> [word id], [docs ids] 31 | 32 | struct sfts { 33 | kvdbo * sfts_db; 34 | std::unordered_map sfts_buffer; 35 | std::unordered_set sfts_buffer_dirty; 36 | std::unordered_set sfts_deleted; 37 | }; 38 | 39 | sfts * sfts_new(const char * filename) 40 | { 41 | sfts * result = new sfts; 42 | result->sfts_db = kvdbo_new(filename); 43 | return result; 44 | } 45 | 46 | void sfts_free(sfts * index) 47 | { 48 | kvdbo_free(index->sfts_db); 49 | free(index); 50 | } 51 | 52 | int sfts_open(sfts * index) 53 | { 54 | kvdbo_open(index->sfts_db); 55 | 56 | return 0; 57 | } 58 | 59 | void sfts_close(sfts * index) 60 | { 61 | db_flush(index); 62 | kvdbo_close(index->sfts_db); 63 | } 64 | 65 | int sfts_flush(sfts * index) 66 | { 67 | return db_flush(index); 68 | } 69 | 70 | //int lidx_set(lidx * index, uint64_t doc, const char * text); 71 | // text -> wordboundaries -> transliterated word -> store word with new word id 72 | // word -> append doc id to docs ids 73 | // store doc id -> words ids 74 | 75 | int sfts_set(sfts * index, uint64_t doc, const char * text) 76 | { 77 | UChar * utext = kv_from_utf8(text); 78 | int r = sfts_u_set(index, doc, utext); 79 | free(utext); 80 | return r; 81 | } 82 | 83 | int sfts_set2(sfts * index, uint64_t doc, const char ** text, int count) 84 | { 85 | UChar ** utext = (UChar **) malloc(count * sizeof(* utext)); 86 | for(int i = 0 ; i < count ; i ++) { 87 | utext[i] = kv_from_utf8(text[i]); 88 | } 89 | int result = sfts_u_set2(index, doc, (const UChar **) utext, count); 90 | for(int i = 0 ; i < count ; i ++) { 91 | free((void *) utext[i]); 92 | } 93 | free((void *) utext); 94 | return result; 95 | } 96 | 97 | int sfts_u_set(sfts * index, uint64_t doc, const UChar * utext) 98 | { 99 | int r = sfts_remove(index, doc); 100 | if (r < 0) { 101 | return r; 102 | } 103 | r = tokenize(index, doc, utext); 104 | if (r < 0) { 105 | return r; 106 | } 107 | return 0; 108 | } 
109 | 110 | int sfts_u_set2(sfts * index, uint64_t doc, const UChar ** utext, int count) 111 | { 112 | int r = sfts_remove(index, doc); 113 | if (r < 0) { 114 | return r; 115 | } 116 | int result = 0; 117 | std::set wordsids_set; 118 | for(unsigned int i = 0 ; i < count ; i ++) { 119 | if (utext[i] == NULL) { 120 | continue; 121 | } 122 | char * transliterated = kv_transliterate(utext[i], kv_u_get_length(utext[i])); 123 | if (transliterated == NULL) { 124 | continue; 125 | } 126 | int r = add_to_indexer(index, doc, transliterated, wordsids_set); 127 | if (r < 0) { 128 | result = r; 129 | break; 130 | } 131 | free(transliterated); 132 | } 133 | if (result != 0) { 134 | return result; 135 | } 136 | 137 | std::string key(","); 138 | kv_encode_uint64(key, doc); 139 | 140 | std::string value_str; 141 | for(std::set::iterator wordsids_set_iterator = wordsids_set.begin() ; wordsids_set_iterator != wordsids_set.end() ; ++ wordsids_set_iterator) { 142 | kv_encode_uint64(value_str, * wordsids_set_iterator); 143 | } 144 | r = db_put(index, key, value_str); 145 | if (r < 0) { 146 | return r; 147 | } 148 | 149 | return 0; 150 | } 151 | 152 | static int tokenize(sfts * index, uint64_t doc, const UChar * text) 153 | { 154 | int result = 0; 155 | std::set wordsids_set; 156 | #if __APPLE__ 157 | unsigned int len = kv_u_get_length(text); 158 | CFStringRef str = CFStringCreateWithBytes(NULL, (const UInt8 *) text, len * sizeof(* text), kCFStringEncodingUTF16LE, false); 159 | CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(NULL, str, CFRangeMake(0, len), kCFStringTokenizerUnitWord, NULL); 160 | while (1) { 161 | CFStringTokenizerTokenType wordKind = CFStringTokenizerAdvanceToNextToken(tokenizer); 162 | if (wordKind == kCFStringTokenizerTokenNone) { 163 | break; 164 | } 165 | if (wordKind == kCFStringTokenizerTokenHasNonLettersMask) { 166 | continue; 167 | } 168 | CFRange range = CFStringTokenizerGetCurrentTokenRange(tokenizer); 169 | char * transliterated = 
kv_transliterate(&text[range.location], (int) range.length); 170 | if (transliterated == NULL) { 171 | continue; 172 | } 173 | int r = add_to_indexer(index, doc, transliterated, wordsids_set); 174 | if (r < 0) { 175 | result = r; 176 | break; 177 | } 178 | 179 | free(transliterated); 180 | } 181 | CFRelease(str); 182 | CFRelease(tokenizer); 183 | #else 184 | UErrorCode status; 185 | status = U_ZERO_ERROR; 186 | UBreakIterator * iterator = ubrk_open(UBRK_WORD, NULL, text, u_strlen(text), &status); 187 | LIDX_ASSERT(status <= U_ZERO_ERROR); 188 | 189 | int32_t left = 0; 190 | int32_t right = 0; 191 | int word_kind = 0; 192 | ubrk_first(iterator); 193 | 194 | while (1) { 195 | left = right; 196 | right = ubrk_next(iterator); 197 | if (right == UBRK_DONE) { 198 | break; 199 | } 200 | 201 | word_kind = ubrk_getRuleStatus(iterator); 202 | if (word_kind == 0) { 203 | // skip punctuation and space. 204 | continue; 205 | } 206 | 207 | char * transliterated = lidx_transliterate(&text[left], right - left); 208 | if (transliterated == NULL) { 209 | continue; 210 | } 211 | int r = add_to_indexer(index, doc, transliterated, wordsids_set); 212 | if (r < 0) { 213 | result = r; 214 | break; 215 | } 216 | 217 | free(transliterated); 218 | } 219 | ubrk_close(iterator); 220 | #endif 221 | if (result != 0) { 222 | return result; 223 | } 224 | 225 | std::string key(","); 226 | kv_encode_uint64(key, doc); 227 | 228 | std::string value_str; 229 | for(std::set::iterator wordsids_set_iterator = wordsids_set.begin() ; wordsids_set_iterator != wordsids_set.end() ; ++ wordsids_set_iterator) { 230 | kv_encode_uint64(value_str, * wordsids_set_iterator); 231 | } 232 | int r = db_put(index, key, value_str); 233 | if (r < 0) { 234 | return r; 235 | } 236 | 237 | return 0; 238 | } 239 | 240 | static int add_to_indexer(sfts * index, uint64_t doc, const char * word, 241 | std::set & wordsids_set) 242 | { 243 | std::string word_str(word); 244 | std::string value; 245 | uint64_t wordid; 246 | 247 | 
//fprintf(stderr, "adding word: %s\n", word); 248 | 249 | int r = db_get(index, word_str, &value); 250 | if (r < -1) { 251 | return -1; 252 | } 253 | if (r == 0) { 254 | // Adding doc id to existing entry. 255 | kv_decode_uint64(value, 0, &wordid); 256 | kv_encode_uint64(value, doc); 257 | int r = db_put(index, word_str, value); 258 | if (r < 0) { 259 | return r; 260 | } 261 | } 262 | else /* r == -1 */ { 263 | // Not found. 264 | 265 | // Creating an entry. 266 | // store word with new id 267 | 268 | // read next word it 269 | std::string str; 270 | std::string nextwordidkey("."); 271 | int r = db_get(index, nextwordidkey, &str); 272 | if (r == -1) { 273 | wordid = 0; 274 | } 275 | else if (r < 0) { 276 | return -1; 277 | } 278 | else { 279 | kv_decode_uint64(str, 0, &wordid); 280 | } 281 | 282 | // write next word id 283 | std::string value; 284 | uint64_t next_wordid = wordid; 285 | next_wordid ++; 286 | kv_encode_uint64(value, next_wordid); 287 | r = db_put(index, nextwordidkey, value); 288 | if (r < 0) { 289 | return r; 290 | } 291 | 292 | std::string value_str; 293 | kv_encode_uint64(value_str, wordid); 294 | kv_encode_uint64(value_str, doc); 295 | r = db_put(index, word_str, value_str); 296 | if (r < 0) { 297 | return r; 298 | } 299 | 300 | std::string key("/"); 301 | kv_encode_uint64(key, wordid); 302 | r = db_put(index, key, word_str); 303 | if (r < 0) { 304 | return r; 305 | } 306 | } 307 | 308 | wordsids_set.insert(wordid); 309 | 310 | return 0; 311 | } 312 | 313 | //int lidx_remove(lidx * index, uint64_t doc); 314 | // docid -> words ids -> remove docid from word 315 | // if docs ids for word is empty, we remove the word id 316 | 317 | static std::string get_word_for_wordid(sfts * index, uint64_t wordid); 318 | static int remove_docid_in_word(sfts * index, std::string word, uint64_t doc); 319 | static int remove_word(sfts * index, std::string word, uint64_t wordid); 320 | 321 | int sfts_remove(sfts * index, uint64_t doc) 322 | { 323 | std::string 
key(","); 324 | kv_encode_uint64(key, doc); 325 | std::string str; 326 | int r = db_get(index, key, &str); 327 | if (r == -1) { 328 | // do nothing 329 | } 330 | else if (r < 0) { 331 | return -1; 332 | } 333 | 334 | db_delete(index, key); 335 | size_t position = 0; 336 | while (position < str.size()) { 337 | uint64_t wordid; 338 | position = kv_decode_uint64(str, position, &wordid); 339 | std::string word = get_word_for_wordid(index, wordid); 340 | if (word.size() == 0) { 341 | continue; 342 | } 343 | int r = remove_docid_in_word(index, word, doc); 344 | if (r < 0) { 345 | return -1; 346 | } 347 | } 348 | 349 | return 0; 350 | } 351 | 352 | static std::string get_word_for_wordid(sfts * index, uint64_t wordid) 353 | { 354 | std::string wordidkey("/"); 355 | kv_encode_uint64(wordidkey, wordid); 356 | std::string str; 357 | int r = db_get(index, wordidkey, &str); 358 | if (r < 0) { 359 | return std::string(); 360 | } 361 | return str; 362 | } 363 | 364 | static int remove_docid_in_word(sfts * index, std::string word, uint64_t doc) 365 | { 366 | std::string str; 367 | int r = db_get(index, word, &str); 368 | if (r == -1) { 369 | return 0; 370 | } 371 | else if (r < 0) { 372 | return -1; 373 | } 374 | 375 | uint64_t wordid; 376 | std::string buffer; 377 | size_t position = 0; 378 | position = kv_decode_uint64(str, position, &wordid); 379 | while (position < str.size()) { 380 | uint64_t current_docid; 381 | position = kv_decode_uint64(str, position, ¤t_docid); 382 | if (current_docid != doc) { 383 | kv_encode_uint64(buffer, current_docid); 384 | } 385 | } 386 | if (buffer.size() == 0) { 387 | // remove word entry 388 | int r = remove_word(index, word, wordid); 389 | if (r < 0) { 390 | return -1; 391 | } 392 | } 393 | else { 394 | // update word entry 395 | int r = db_put(index, word, buffer); 396 | if (r < 0) { 397 | return r; 398 | } 399 | } 400 | 401 | return 0; 402 | } 403 | 404 | static int remove_word(sfts * index, std::string word, uint64_t wordid) 405 | { 406 | 
std::string wordidkey("/"); 407 | kv_encode_uint64(wordidkey, wordid); 408 | int r; 409 | r = db_delete(index, wordidkey); 410 | if (r < 0) { 411 | return -1; 412 | } 413 | r = db_delete(index, word); 414 | if (r < 0) { 415 | return -1; 416 | } 417 | 418 | return 0; 419 | } 420 | 421 | //int lidx_search(lidx * index, const char * token); 422 | // token -> transliterated token -> docs ids 423 | 424 | int sfts_search(sfts * index, const char * token, sfts_search_kind kind, uint64_t ** p_docsids, size_t * p_count) 425 | { 426 | int result; 427 | UChar * utoken = kv_from_utf8(token); 428 | result = sfts_u_search(index, utoken, kind, p_docsids, p_count); 429 | free((void *) utoken); 430 | return result; 431 | } 432 | 433 | int sfts_u_search(sfts * index, const UChar * utoken, sfts_search_kind kind, 434 | uint64_t ** p_docsids, size_t * p_count) 435 | { 436 | db_flush(index); 437 | 438 | char * transliterated = kv_transliterate(utoken, -1); 439 | unsigned int transliterated_length = (unsigned int) strlen(transliterated); 440 | std::set result_set; 441 | 442 | kvdbo_iterator * iterator = kvdbo_iterator_new(index->sfts_db); 443 | if (kind == sfts_search_kind_prefix) { 444 | kvdbo_iterator_seek_after(iterator, transliterated, strlen(transliterated)); 445 | } 446 | else { 447 | kvdbo_iterator_seek_first(iterator); 448 | } 449 | while (kvdbo_iterator_is_valid(iterator)) { 450 | int add_to_result = 0; 451 | 452 | const char * key; 453 | size_t key_size; 454 | kvdbo_iterator_get_key(iterator, &key, &key_size); 455 | std::string key_str(key, key_size); 456 | if (key_str.find(".") == 0 || key_str.find(",") == 0 || key_str.find("/") == 0) { 457 | kvdbo_iterator_next(iterator); 458 | continue; 459 | } 460 | if (kind == sfts_search_kind_prefix) { 461 | if (key_str.find(transliterated) != 0) { 462 | break; 463 | } 464 | add_to_result = 1; 465 | } 466 | else if (kind == sfts_search_kind_substr) { 467 | //fprintf(stderr, "matching: %s %s\n", key_str.c_str(), transliterated); 468 | if 
(key_str.find(transliterated) != std::string::npos) { 469 | add_to_result = 1; 470 | } 471 | } 472 | else if (kind == sfts_search_kind_suffix) { 473 | if ((key_str.length() >= transliterated_length) && 474 | (key_str.compare(key_str.length() - transliterated_length, transliterated_length, transliterated) == 0)) { 475 | add_to_result = 1; 476 | } 477 | } 478 | if (add_to_result) { 479 | size_t position = 0; 480 | uint64_t wordid; 481 | char * value; 482 | size_t value_size; 483 | int r = kvdbo_get(index->sfts_db, key_str.c_str(), key_str.length(), &value, &value_size); 484 | if (r != 0) { 485 | fprintf(stderr, "VALUE NOT FOUND for key %s\n", key_str.c_str()); 486 | } 487 | std::string value_str(value, value_size); 488 | free(value); 489 | position = kv_decode_uint64(value_str, position, &wordid); 490 | while (position < value_str.size()) { 491 | uint64_t docid; 492 | position = kv_decode_uint64(value_str, position, &docid); 493 | result_set.insert(docid); 494 | } 495 | } 496 | 497 | kvdbo_iterator_next(iterator); 498 | } 499 | kvdbo_iterator_free(iterator); 500 | 501 | free(transliterated); 502 | 503 | uint64_t * result = (uint64_t *) calloc(result_set.size(), sizeof(* result)); 504 | unsigned int count = 0; 505 | for(std::set::iterator set_iterator = result_set.begin() ; set_iterator != result_set.end() ; ++ set_iterator) { 506 | result[count] = * set_iterator; 507 | count ++; 508 | } 509 | 510 | * p_docsids = result; 511 | * p_count = count; 512 | 513 | return 0; 514 | } 515 | 516 | static int db_put(sfts * index, std::string & key, std::string & value) 517 | { 518 | index->sfts_deleted.erase(key); 519 | index->sfts_buffer[key] = value; 520 | index->sfts_buffer_dirty.insert(key); 521 | 522 | return 0; 523 | } 524 | 525 | static int db_get(sfts * index, std::string & key, std::string * p_value) 526 | { 527 | if (index->sfts_deleted.find(key) != index->sfts_deleted.end()) { 528 | return -1; 529 | } 530 | 531 | if (index->sfts_buffer.find(key) != 
index->sfts_buffer.end()) { 532 | * p_value = index->sfts_buffer[key]; 533 | return 0; 534 | } 535 | 536 | char * value; 537 | size_t value_size; 538 | int r = kvdbo_get(index->sfts_db, key.c_str(), key.length(), &value, &value_size); 539 | if (r != 0) { 540 | return r; 541 | } 542 | * p_value = std::string(value, value_size); 543 | index->sfts_buffer[key] = * p_value; 544 | free(value); 545 | return 0; 546 | } 547 | 548 | static int db_delete(sfts * index, std::string & key) 549 | { 550 | index->sfts_deleted.insert(key); 551 | index->sfts_buffer_dirty.erase(key); 552 | index->sfts_buffer.erase(key); 553 | return 0; 554 | } 555 | 556 | static int db_flush(sfts * index) 557 | { 558 | if ((index->sfts_buffer_dirty.size() == 0) && (index->sfts_deleted.size() == 0)) { 559 | return 0; 560 | } 561 | for(std::unordered_set::iterator set_iterator = index->sfts_buffer_dirty.begin() ; set_iterator != index->sfts_buffer_dirty.end() ; ++ set_iterator) { 562 | std::string key = * set_iterator; 563 | std::string value = index->sfts_buffer[key]; 564 | kvdbo_set(index->sfts_db, key.c_str(), key.length(), value.c_str(), value.length()); 565 | } 566 | for(std::unordered_set::iterator set_iterator = index->sfts_deleted.begin() ; set_iterator != index->sfts_deleted.end() ; ++ set_iterator) { 567 | std::string key = * set_iterator; 568 | kvdbo_delete(index->sfts_db, key.c_str(), key.length()); 569 | } 570 | kvdbo_flush(index->sfts_db); 571 | index->sfts_buffer.clear(); 572 | index->sfts_buffer_dirty.clear(); 573 | index->sfts_deleted.clear(); 574 | return 0; 575 | } 576 | -------------------------------------------------------------------------------- /src/sfts.h: -------------------------------------------------------------------------------- 1 | #ifndef LIDX_H 2 | 3 | #define LIDX_H 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #include 10 | #include 11 | 12 | // We're using the same UChar as the ICU library. 
13 | #if defined(__CHAR16_TYPE__) 14 | typedef __CHAR16_TYPE__ UChar; 15 | #else 16 | typedef uint16_t UChar; 17 | #endif 18 | 19 | typedef struct sfts sfts; 20 | 21 | // prefix provides the best performance, two other options 22 | // have poor performance. 23 | typedef enum sfts_search_kind { 24 | sfts_search_kind_prefix, // Search documents that has strings that start with the given token. 25 | sfts_search_kind_substr, // Search documents that has strings that contain the given token. 26 | sfts_search_kind_suffix, // Search documents that has strings that end the given token. 27 | } sfts_search_kind; 28 | 29 | // Create a new indexer. 30 | sfts * sfts_new(const char * filename); 31 | 32 | // Release resource of the new indexer. 33 | void sfts_free(sfts * index); 34 | 35 | // Open the indexer. 36 | int sfts_open(sfts * index); 37 | 38 | // Close the indexer. 39 | void sfts_close(sfts * index); 40 | 41 | // Adds a UTF-8 document to the indexer. 42 | // `doc`: document identifier (numerical identifier in a 64-bits range) 43 | // `text`: content of the document in UTF-8 encoding. 44 | int sfts_set(sfts * index, uint64_t doc, const char * text); 45 | 46 | // Adds an unicode document to the indexer. 47 | // `utext`: content of the document in UTF-16 encoding. 48 | int sfts_u_set(sfts * index, uint64_t doc, const UChar * utext); 49 | 50 | // Adds a UTF-8 document to the indexer. 51 | // `doc`: document identifier (numerical identifier in a 64-bits range) 52 | int sfts_set2(sfts * index, uint64_t doc, const char ** text, int count); 53 | 54 | // Adds an unicode document to the indexer. 55 | int sfts_u_set2(sfts * index, uint64_t doc, const UChar ** utext, int count); 56 | 57 | // Removes a document from the indexer. 58 | int sfts_remove(sfts * index, uint64_t doc); 59 | 60 | // Searches a UTF-8 token in the indexer. 61 | // `token`: string to search in UTF-8 encoding. 62 | // `kind`: kind of matching to perform. See `lidx_search_kind`. 
63 | // The result is an array of documents IDs. The array is stored in `*p_docsids`. 64 | // The number of items in the result array is stored in `*p_count`. 65 | // 66 | // The result array has to be freed using `free()`. 67 | int sfts_search(sfts * index, const char * token, sfts_search_kind kind, 68 | uint64_t ** p_docsids, size_t * p_count); 69 | 70 | // Searches a unicode token in the indexer. 71 | // `token`: string to search in UTF-16 encoding. 72 | int sfts_u_search(sfts * index, const UChar * utoken, sfts_search_kind kind, 73 | uint64_t ** p_docsids, size_t * p_count); 74 | 75 | // Writes changes to disk if they are still pending in memory. 76 | int sfts_flush(sfts * index); 77 | 78 | #ifdef __cplusplus 79 | } 80 | #endif 81 | 82 | #endif 83 | --------------------------------------------------------------------------------