├── LICENSE ├── README.md └── utfset.c /LICENSE: -------------------------------------------------------------------------------- 1 | MIT/X Consortium License 2 | 3 | © 2016 Connor Lane Smith 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 6 | copy of this software and associated documentation files (the "Software"), 7 | to deal in the Software without restriction, including without limitation 8 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | and/or sell copies of the Software, and to permit persons to whom the 10 | Software is furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | DEALINGS IN THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UTFSet 2 | 3 | This code implements a UTFSet, a set containing Unicode characters (‘runes’) 4 | in a tree structure that mirrors the UTF-8 encoding format. The result is an 5 | exceedingly simple data structure, implemented here in under 75 lines of C, 6 | which nevertheless takes up less space when storing smaller (and generally 7 | more common) runes, and more when storing larger ones. It may not be the most 8 | efficient, but it is kind of neat. 
/*
 * utfset.c
 *
 * See LICENSE file for copyright and licence details.
 *
 * This code implements a UTFSet, a set containing Unicode characters (`runes')
 * in a tree structure that mirrors the UTF-8 encoding format. The result is an
 * exceedingly simple data structure, implemented here in very little C, which
 * nevertheless takes up less space when storing smaller (and generally more
 * common) runes, and more when storing larger ones. It may not be the most
 * efficient, but it is kind of neat.
 *
 * As the README says: the code is very well commented; read `utfset.c` in
 * order to understand it.
 */
#include <stdint.h>
#include <stdlib.h>
#include <uchar.h>

/*
 * The structure underlying a UTFSet is a 64-ary tree (a tree whose nodes each
 * have 64 child nodes), with booleans for leaves. With the exception of the
 * root, which is special as described below, a node has homogenous children:
 * they are either all bool, all node<bool>, all node<node<bool>>, and so on.
 *
 * As an optimisation for nodes whose children are all bool, instead of storing
 * a 64-bit pointer to an array of 64 bools, we store a 64-bit bitmask in the
 * space that would have stored the pointer to the node. As a result, a node is
 * either a 64-bit pointer to an array of 64 child nodes or, in the case of a
 * leaf node, a 64-bit bitmask.
 */
struct block {
	union child {
		struct block *ptr;	/* interior node: 64 further children */
		uint64_t bits;		/* leaf node: one bit per final byte value */
	} blk[64];
};

/*
 * At the root of the tree is a node whose children are not homogenous; their
 * types depend on which index they have. The first 32 are node<bool> (and so
 * are all bitmasks), the next 16 are node<node<bool>>, the next 8 after that
 * are node<node<node<bool>>>, and so on.
 *
 * Note the correspondence here with the leading byte sequences in UTF-8. A
 * leading byte has the bit pattern 11xxxxxx, where the x bits may be 0 or 1.
 * There are 64 possible values for such a byte. The first 32, 11000000 up to
 * 11011111, have one continuation byte; the next 16, 11100000 up to 11101111,
 * have two continuation bytes; and so on.
 *
 * The tree is indexed in a similar way to a prefix tree, or `trie': the first
 * node is indexed by a 6-bit integer, comprising the x bits of the leading byte
 * 11xxxxxx. The second node is indexed by a subsequent 6-bit integer, which are
 * the x bits of the continuation byte 10xxxxxx. This continues until no more
 * continuation bytes are expected to follow the initial leading byte.
 *
 * There is no accommodation for ASCII bytes, i.e. 0xxxxxxx, which are instead
 * dealt with in a special way, described later.
 *
 * An empty set is an all-zero UTFSet (e.g. `UTFSet set = { 0 };'). Blocks below
 * the root are heap-allocated by addutf() and released by freeutf().
 */
typedef struct block UTFSet;

/*
 * This table is used to look up the number of leading ones in a 6-bit integer.
 * Given some leading byte 11xxxxxx, if there are n leading ones in the 6-bit
 * value xxxxxx, then there are n+1 continuation bytes to follow.
 *
 * Note that the final eight values, having 3 or more leading ones, describe
 * sequences of five or more bytes, which are not valid in UTF-8 according to
 * RFC 3629. They are nevertheless accepted by the code below.
 */
const char clo6[] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00xxxx */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 01xxxx */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10xxxx */
	2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6, /* 11xxxx */
};

/*
 * addutf() adds the next rune in the UTF-8 string to the UTFSet, and returns a
 * pointer to the byte following that rune.
 *
 * Returns NULL, without consuming anything, if:
 *   - the first byte is a continuation byte (10xxxxxx);
 *   - any of the bytes expected after a leading byte is not a continuation
 *     byte (this includes the string ending early, since the terminating NUL
 *     is not a continuation byte); or
 *   - a block could not be allocated.
 * In the first two cases the set is left unmodified.
 *
 * Overlong encodings and out-of-range code points are not rejected.
 *
 * Note that the caller must not pass an empty string: the NUL byte is treated
 * as the ASCII rune U+0000 and the returned pointer is past the terminator.
 */
const char *
addutf(UTFSet *set, const char *s)
{
	/* Work on unsigned bytes so the octal comparisons below are portable. */
	const unsigned char *p = (const unsigned char *)s;
	union child *tp;
	unsigned char c = *p++;

	/*
	 * We write UTF-8 bytes in octal, because their structure is clearer in
	 * this form. In octal 03xx is a leading byte, 02xx a continuation byte,
	 * and 00xx or 01xx an ASCII byte.
	 */
	if (c >= 0300) {
		/*
		 * This is a leading byte, 11xxxxxx. The number of leading ones
		 * in the lower 6 bits tells us how many continuation bytes are
		 * expected: n+1 of them.
		 */
		unsigned int n = clo6[c % 64];

		/*
		 * Check all n+1 continuation bytes up front. The scan stops at
		 * the first byte that is not 10xxxxxx, so it never reads beyond
		 * the string's terminating NUL, and nothing is allocated for a
		 * sequence that turns out to be malformed.
		 */
		for (unsigned int i = 0; i <= n; i++) {
			if ((p[i] & 0300) != 0200) {
				return NULL; /* truncated or malformed */
			}
		}

		/* Traversal starts at the xxxxxx-th child of the root. */
		tp = &set->blk[c % 64];

		/*
		 * All but the final byte (which is the bitmask index) is
		 * handled by traversing to the corresponding child. If the
		 * child doesn't point anywhere yet, a new zeroed block is
		 * allocated for it.
		 */
		for (; n > 0; n--) {
			c = *p++;
			if (!tp->ptr && !(tp->ptr = calloc(1, sizeof *tp->ptr))) {
				return NULL; /* out of memory */
			}
			tp = &tp->ptr->blk[c % 64];
		}
		c = *p++; /* the final continuation byte */
	} else if (c < 0200) {
		/*
		 * This is an ASCII byte, 0xxxxxxx. One such byte like this is
		 * equivalent to two UTF-8 bytes of the form 1100000x 10xxxxxx.
		 * This means we can take the 7th bit as the first index, and
		 * then stay on the same byte so we can later read the lower 6
		 * bits as if they were a subsequent byte.
		 */
		tp = &set->blk[c / 64];
	} else {
		/*
		 * This is a continuation byte, 10xxxxxx. This should not start
		 * a UTF-8 codepoint, so we return NULL to indicate an error.
		 */
		return NULL;
	}

	/*
	 * This is the final byte, so this node's children are boolean leaves.
	 * This means we need to set a bit in the bitmask to indicate that the
	 * rune corresponding to this byte sequence is an element of the set.
	 */
	tp->bits |= UINT64_C(1) << (c % 64);

	return (const char *)p;
}

static void foreach1(union child t, unsigned int n, char32_t r,
                     void (*fcn)(char32_t));

/*
 * foreach() takes a pointer to a function that takes a rune, and calls that
 * function for each rune in the set, in ascending order.
 *
 * It may be worth noting that if an overlong UTF-8 sequence is entered into the
 * UTFSet then the runes will not actually be in true ascending order, as the
 * overlong rune will be reached after the `true' rune. However, since overlong
 * UTF-8 sequences are illegal, this is not a problem if the input is sanitised.
 */
void
foreach(const UTFSet *set, void (*fcn)(char32_t))
{
	/*
	 * For each possible leading byte (11xxxxxx), we determine the number of
	 * continuation bytes that will follow it, and unpack the bits that will
	 * contribute to the value of the rune. That is, all of the bits after
	 * the leading ones.
	 */
	for (unsigned char c = 0; c < 64; c++) {
		unsigned int n = clo6[c];	/* there are n+1 bytes left */
		char32_t r = c % (64 >> n);	/* unpack bits for the rune */

		foreach1(set->blk[c], n, r, fcn);
	}
}

/*
 * foreach1() is the recursive workhorse used by foreach(). Whereas foreach()
 * applies to a UTFSet, foreach1() applies to an individual node in the tree.
 * As well as the node, which may be either a pointer or a bitmask, foreach1()
 * is passed: the number of bytes still to go after this one (n), the value of
 * the rune extracted from bytes so far (r), and the function to be called for
 * each rune in the set (fcn).
 */
static void
foreach1(union child t, unsigned int n, char32_t r, void (*fcn)(char32_t))
{
	if (n == 0) {
		/*
		 * This is the final byte, so this node's children are booleans,
		 * which means this is a bitmask. Call the function for each set
		 * bit, again appending the value onto that of the rune so far.
		 */
		for (unsigned char c = 0; c < 64; c++) {
			if ((t.bits & (UINT64_C(1) << c))) {
				fcn((r * 64) | c);
			}
		}
	} else if (t.ptr) {
		/*
		 * This is not the final byte, so this node's children (if it
		 * has any) are also nodes. Recurse over them for each possible
		 * continuation value (10xxxxxx). We append the byte's value
		 * onto that of the rune so far.
		 */
		for (unsigned char c = 0; c < 64; c++) {
			foreach1(t.ptr->blk[c], n - 1, (r * 64) | c, fcn);
		}
	}
}

/*
 * freeblk() releases the block a node points to, along with every block below
 * it. n is the number of pointer levels at and below this node; a node with
 * n == 0 is a bitmask and owns no memory.
 */
static void
freeblk(union child t, unsigned int n)
{
	if (n == 0 || !t.ptr) {
		return;
	}
	for (unsigned int i = 0; i < 64; i++) {
		freeblk(t.ptr->blk[i], n - 1);
	}
	free(t.ptr);
}

/*
 * freeutf() releases every block that addutf() allocated for the set and
 * resets the set to empty, so that it may be reused or safely discarded. The
 * root itself is owned by the caller and is not freed.
 */
void
freeutf(UTFSet *set)
{
	for (unsigned int i = 0; i < 64; i++) {
		unsigned int n = clo6[i];

		freeblk(set->blk[i], n);
		/* Clear through whichever union member this child uses. */
		if (n == 0) {
			set->blk[i].bits = 0;
		} else {
			set->blk[i].ptr = NULL;
		}
	}
}

/*
 * The following are examples of how this data structure is to be used.
 */
#include <inttypes.h>
#include <stdio.h>

/*
 * prune() is an example function for passing to foreach(). It prints the rune
 * it is passed, in the usual U+0000 format, to stdout.
 */
void
prune(char32_t r)
{
	printf("U+%04" PRIX32 "\n", (uint32_t)r);
}

/*
 * prunes() is an example function which, given a UTF-8 string, will print the
 * runes it contains in ascending order. Returns 0 on success, -1 on failure
 * (malformed input or out of memory), in which case nothing is printed. All
 * memory used by the temporary set is released before returning.
 */
int
prunes(const char *s)
{
	UTFSet set = { 0 };	/* empty set */
	int rc = 0;

	while (*s != '\0') {
		if (!(s = addutf(&set, s))) {
			rc = -1;	/* something went wrong */
			break;
		}
	}

	if (rc == 0) {
		foreach(&set, &prune);	/* print all runes */
	}

	freeutf(&set);
	return rc;
}