├── .gitattributes ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── benchmark_generic_dict.mojo ├── benchmark_multi_dict.mojo ├── benchmark_report_string_dict.mojo ├── benchmark_string_dict.mojo ├── checkout_remote_modules.sh ├── corpora ├── __init__.mojo ├── arabic.txt ├── chinese.txt ├── french.txt ├── georgian.txt ├── greek.txt ├── hebrew.txt ├── hindi.txt └── l33t.txt ├── csv ├── .checkoutinfo ├── __init__.mojo ├── csv_builder.mojo ├── csv_table.mojo ├── string_utils.mojo └── vectorize_and_exit.mojo ├── generic_dict ├── __init__.mojo ├── ahasher.mojo ├── dict.mojo ├── key_eq.mojo ├── keys_container.mojo ├── multi_dict.mojo ├── single_key_builder.mojo └── sparse_array.mojo ├── helpers ├── __init__.mojo └── progress_bar.mojo ├── memory_consumption_compact_dict.mojo ├── memory_consumption_std_lib_dict.mojo ├── pixi.lock ├── pixi.toml ├── report_i7_2_8.csv ├── report_m1.csv ├── report_m1_new.csv ├── string_dict ├── __init__.mojo ├── ahasher.mojo ├── dict.mojo ├── keys_container.mojo └── string_eq.mojo ├── test_generic_dict.mojo ├── test_multi_dict.mojo ├── test_sparse_array.mojo └── test_string_dict.mojo /.gitattributes: -------------------------------------------------------------------------------- 1 | # SCM syntax highlighting & preventing 3-way merges 2 | pixi.lock merge=binary linguist-language=YAML linguist-generated=true 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # pixi environments 2 | .pixi 3 | .idea -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Maxim Zaks 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software 
without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | pixi run mojo test_generic_dict.mojo 3 | pixi run mojo test_multi_dict.mojo 4 | pixi run mojo test_sparse_array.mojo 5 | pixi run mojo test_string_dict.mojo 6 | 7 | benchmark: 8 | pixi run mojo benchmark_generic_dict.mojo 9 | pixi run mojo benchmark_multi_dict.mojo 10 | pixi run mojo benchmark_report_string_dict.mojo 11 | pixi run mojo benchmark_string_dict.mojo 12 | 13 | memory: 14 | pixi run mojo memory_consumption_compact_dict.mojo 15 | pixi run mojo memory_consumption_std_lib_dict.mojo -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | `compact-dict` is a fast hashmap based dictionary implemented in Mojo 🔥. 
2 | 3 | Although the dictionary is fast (currently about 10x faster than the std `Dict`), its main goal is reducing the memory footprint. 4 | 5 | We introduce two self-sufficient modules: 6 | - `string_dict`, where the key type of the dictionary is a `String` 7 | - `generic_dict`, which allows keys to be of any type conforming to the `Keyable` trait 8 | 9 | Both modules expose a `Dict` struct which has the following compile-time parametrization options: 10 | - Value type can be any type conforming to the `CollectionElement` trait 11 | - We use a fast hash function as default, but you can provide your own hash function 12 | - By setting the `KeyCountType` to a smaller unsigned DType (e.g. `DType.uint8` or `DType.uint16`) we can reduce the memory footprint. The type needs to be able to represent the number of keys 13 | - By setting the `KeyOffsetType` to a smaller unsigned DType we can reduce the memory footprint even further. The type needs to be able to represent the sum of all key bytes 14 | - Set `destructive` to `False` if you don't intend to delete keys from the dict. This way we do not waste space on deleted flags 15 | - Set `caching_hashes` to `False` in order to reduce the memory footprint by not caching the hash values. Keep in mind that this change slows down the rehashing process 16 | 17 | The `Dict` can be instantiated with a `capacity` value. The default is 16 and the minimum capacity is 8. If you know the number of elements ahead of time, pass it as the capacity; this avoids rehashing and might improve the memory footprint.
18 | 19 | ### Sample code for generic dict: 20 | ``` 21 | from generic_dict import Dict, Keyable, KeysBuilder 22 | from testing import assert_equal 23 | 24 | @fieldwise_init 25 | struct Person(Keyable, Copyable, Movable): 26 | var name: String 27 | var age: Int 28 | 29 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 30 | keys_builder.add_buffer[DType.uint8](self.name.unsafe_ptr(), len(self.name)) 31 | keys_builder.add(Int64(self.age)) 32 | 33 | fn main() raises: 34 | var p1 = Person("Maxim", 42) 35 | var p2 = Person("Maximilian", 62) 36 | var p3 = Person("Alex", 25) 37 | var p4 = Person("Maria", 28) 38 | var p5 = Person("Daria", 13) 39 | var p6 = Person("Max", 31) 40 | 41 | var d = Dict[Int]() 42 | _ = d.put(p1, 1) 43 | _ = d.put(p2, 11) 44 | _ = d.put(p3, 111) 45 | _ = d.put(p4, 1111) 46 | _ = d.put(p5, 11111) 47 | _ = d.put(p6, 111111) 48 | 49 | assert_equal(d.get(p1, 0), 1) 50 | assert_equal(d.get(p2, 0), 11) 51 | assert_equal(d.get(p3, 0), 111) 52 | assert_equal(d.get(p4, 0), 1111) 53 | assert_equal(d.get(p5, 0), 11111) 54 | assert_equal(d.get(p6, 0), 111111) 55 | ``` 56 | 57 | ### Note: 58 | To run all tests and benchmarks, call: 59 | 60 | ```bash 61 | make test 62 | ``` 63 | 64 | and 65 | 66 | ```bash 67 | make benchmark 68 | ``` 69 | 70 | For the `memory` test you first need to install the `words` package appropriate for your distro: https://unix.stackexchange.com/questions/213628/where-do-the-words-in-usr-share-dict-words-come-from/798355#798355 71 | 72 | ```bash 73 | make memory 74 | ``` -------------------------------------------------------------------------------- /benchmark_generic_dict.mojo: -------------------------------------------------------------------------------- 1 | import benchmark 2 | from generic_dict import Dict, Keyable, KeysBuilder 3 | from collections.dict import KeyElement, Dict as StdDict 4 | from pathlib import cwd 5 | from testing import assert_equal 6 | from hashlib.hasher import Hasher 7 | 8 | from corpora import * 9 | 10 | 11 | struct
StringKey(KeyElement, Keyable, Copyable, Movable, Hashable, EqualityComparable): 12 | var s: String 13 | 14 | fn __init__(out self, owned s: String): 15 | self.s = s^ 16 | 17 | fn __init__(out self, s: StringLiteral): 18 | self.s = String(s) 19 | 20 | fn __hash__[H: Hasher](self, mut hasher: H): 21 | hasher.update(self.s) 22 | 23 | fn __eq__(self, other: Self) -> Bool: 24 | return self.s == other.s 25 | 26 | fn __ne__(self, other: Self) -> Bool: 27 | return self.s != other.s 28 | 29 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 30 | keys_builder.add_buffer(self.s.unsafe_ptr(), len(self.s)) 31 | 32 | fn corpus_stats(corpus: List[String]): # Prints the element count plus min/avg/max length and total bytes of the non-empty keys 33 | print("=======Corpus Stats=======") 34 | print("Number of elements:", len(corpus)) 35 | var min = 100000000 36 | var max = 0 37 | var sum = 0 38 | var count = 0 39 | for i in range(len(corpus)): 40 | var key = corpus[i] 41 | if len(key) == 0: # empty keys are left out of count, sum, min and max 42 | continue 43 | count += 1 44 | sum += len(key) 45 | if min > len(key): 46 | min = len(key) 47 | if max < len(key): 48 | max = len(key) 49 | var avg = sum / count # NOTE(review): count stays 0 if every key is empty - confirm the resulting average is acceptable 50 | print("Min key length:", min) 51 | print("Avg key length:", avg) 52 | print("Max key length:", max) 53 | print("Total num of bytes:", sum) 54 | print("\n") 55 | 56 | fn main() raises: 57 | var d1 = Dict[Int]() 58 | var d2 = StdDict[StringKey, Int]() 59 | var corpus = french_text_to_keys() 60 | 61 | print("") 62 | corpus_stats(corpus) 63 | 64 | @parameter 65 | fn build_compact_dict(): 66 | var d = Dict[Int](len(corpus)) 67 | # var d = Dict[Int]() 68 | for i in range(len(corpus)): 69 | try: 70 | _ = d.put(StringKey(corpus[i]), i) 71 | except: 72 | print("!!!") 73 | d1 = d^ 74 | 75 | @parameter 76 | fn build_std_dict(): 77 | var d = StdDict[StringKey, Int]() 78 | for i in range(len(corpus)): 79 | d[StringKey(corpus[i])] = i 80 | d2 = d^ 81 | 82 | print("+++++++Create Dict Benchmark+++++++") 83 | 84 | var build_compact_stats = benchmark.run[build_compact_dict](max_runtime_secs=0.5) 85 | #
build_compact_stats.print("ns") 86 | 87 | var build_std_stats = benchmark.run[build_std_dict](max_runtime_secs=0.5) 88 | # build_std_stats.print("ns") 89 | 90 | print("Compact build speedup:", build_std_stats.mean() / build_compact_stats.mean()) 91 | var sum1 = 0 92 | @parameter 93 | fn read_compact_dict(): 94 | sum1 = 0 95 | for i in range(len(corpus)): 96 | try: 97 | sum1 += d1.get(StringKey(corpus[i]), -1) 98 | except: 99 | print("!!!!!") 100 | 101 | # d1.keys.print_keys() 102 | print("+++++++Read Dict Benchmark+++++++") 103 | var read_compact_stats = benchmark.run[read_compact_dict](max_runtime_secs=0.5) 104 | print("Sum1:", sum1, len(d1)) 105 | # read_compact_stats.print("ns") 106 | 107 | var sum2 = 0 108 | @parameter 109 | fn read_std_dict(): 110 | sum2 = 0 111 | for i in range(len(corpus)): 112 | try: 113 | sum2 += d2[StringKey(corpus[i])] 114 | except: 115 | sum2 += -1 116 | 117 | var raed_std_stats = benchmark.run[read_std_dict](max_runtime_secs=0.5) 118 | # raed_std_stats.print("ns") 119 | print("Sum2:", sum2, len(d2)) 120 | print("Compact read speedup:", raed_std_stats.mean() / read_compact_stats.mean()) 121 | 122 | assert_equal(sum1, sum2) 123 | assert_equal(len(d1), len(d2)) 124 | 125 | var m = 9 126 | @parameter 127 | fn delete_compact_dict(): 128 | for i in range(len(corpus)): 129 | if i % m == 0: 130 | try: 131 | d1.delete(StringKey(corpus[i])) 132 | except: 133 | print("!!!!!!!!!!!!!!") 134 | 135 | @parameter 136 | fn delete_std_dict(): 137 | for i in range(len(corpus)): 138 | if i % m == 0: 139 | try: 140 | _ = d2.pop(StringKey(corpus[i])) 141 | except: 142 | pass 143 | 144 | print("+++++++Delete Dict Benchmark+++++++") 145 | 146 | var delete_compact_stats = benchmark.run[delete_compact_dict](max_runtime_secs=0.5) 147 | var delete_std_stats = benchmark.run[delete_std_dict](max_runtime_secs=0.5) 148 | 149 | print("Compact delete speedup:", delete_std_stats.mean() / delete_compact_stats.mean()) 150 | 151 | print("+++++++Read After Delete Dict 
Benchmark+++++++") 152 | 153 | var read_after_delete_compact_stats = benchmark.run[read_compact_dict](max_runtime_secs=0.5) 154 | var read_after_delete_std_stats = benchmark.run[read_std_dict](max_runtime_secs=0.5) 155 | 156 | print("Compact read after delete speedup:", read_after_delete_std_stats.mean() / read_after_delete_compact_stats.mean()) 157 | 158 | print("Sum1:", sum1, "length:", len(d1)) 159 | print("Sum2:", sum2, "length:", len(d2)) 160 | 161 | assert_equal(sum1, sum2) 162 | assert_equal(len(d1), len(d2)) 163 | 164 | _ = corpus 165 | _ = d1^ 166 | _ = d2^ -------------------------------------------------------------------------------- /benchmark_multi_dict.mojo: -------------------------------------------------------------------------------- 1 | import benchmark 2 | from generic_dict import MultiDict, Keyable, KeysBuilder 3 | from collections.dict import KeyElement, Dict as StdDict 4 | from pathlib import cwd 5 | from testing import assert_equal 6 | from hashlib.hasher import Hasher 7 | 8 | from corpora import * 9 | 10 | 11 | struct StringKey(KeyElement, Keyable, Copyable, Movable): 12 | var s: String 13 | 14 | fn __init__(out self, owned s: String): 15 | self.s = s^ 16 | 17 | fn __init__(out self, s: StringLiteral): 18 | self.s = String(s) 19 | 20 | fn __hash__[H: Hasher](self, mut hasher: H): 21 | hasher.update(self.s) 22 | # hasher.update(self.b) 23 | 24 | fn __eq__(self, other: Self) -> Bool: 25 | return self.s == other.s 26 | 27 | fn __ne__(self, other: Self) -> Bool: 28 | return self.s != other.s 29 | 30 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 31 | keys_builder.add_buffer(self.s.unsafe_ptr(), len(self.s)) 32 | 33 | fn corpus_stats(corpus: List[String]): 34 | print("=======Corpus Stats=======") 35 | print("Number of elements:", len(corpus)) 36 | var min = 100000000 37 | var max = 0 38 | var sum = 0 39 | var count = 0 40 | for i in range(len(corpus)): 41 | var key = corpus[i] 42 | if len(key) == 0: 43 | continue 44 | count += 1 45 | 
sum += len(key) 46 | if min > len(key): 47 | min = len(key) 48 | if max < len(key): 49 | max = len(key) 50 | var avg = sum / count # NOTE(review): count stays 0 if every key is empty - confirm the resulting average is acceptable 51 | print("Min key length:", min) 52 | print("Avg key length:", avg) 53 | print("Max key length:", max) 54 | print("Total num of bytes:", sum) 55 | print("\n") 56 | 57 | fn main() raises: 58 | var d1 = MultiDict[Int]() 59 | var d2 = StdDict[StringKey, Int]() 60 | var corpus = french_text_to_keys() 61 | 62 | print("") 63 | corpus_stats(corpus) 64 | 65 | @parameter 66 | fn build_compact_dict(): 67 | var d = MultiDict[Int](len(corpus)) 68 | # var d = MultiDict[Int]() 69 | for i in range(len(corpus)): 70 | try: 71 | d.put(StringKey(corpus[i]), i) 72 | except: 73 | print("!!!") 74 | d1 = d^ 75 | 76 | @parameter 77 | fn build_std_dict(): 78 | var d = StdDict[StringKey, Int]() 79 | for i in range(len(corpus)): 80 | d[StringKey(corpus[i])] = i 81 | d2 = d^ 82 | 83 | print("+++++++Create Dict Benchmark+++++++") 84 | 85 | var build_compact_stats = benchmark.run[build_compact_dict](max_runtime_secs=0.5) 86 | # build_compact_stats.print("ns") 87 | 88 | var build_std_stats = benchmark.run[build_std_dict](max_runtime_secs=0.5) 89 | # build_std_stats.print("ns") 90 | 91 | print("Compact build speedup:", build_std_stats.mean() / build_compact_stats.mean()) 92 | var sum1 = 0 93 | @parameter 94 | fn read_compact_dict(): 95 | sum1 = 0 96 | for i in range(len(corpus)): 97 | try: 98 | var v = d1.get(StringKey(corpus[i])) 99 | sum1 += v[len(v) - 1] 100 | except: 101 | print("!!!!!") 102 | 103 | # d1.keys.print_keys() 104 | print("+++++++Read Dict Benchmark+++++++") 105 | var read_compact_stats = benchmark.run[read_compact_dict](max_runtime_secs=0.5) 106 | print("Sum1:", sum1, len(d1)) 107 | # read_compact_stats.print("ns") 108 | 109 | var sum2 = 0 110 | @parameter 111 | fn read_std_dict(): 112 | sum2 = 0 113 | for i in range(len(corpus)): 114 | try: 115 | sum2 += d2[StringKey(corpus[i])] 116 | except: 117 | sum2 += -1 118 | 119 | var raed_std_stats =
benchmark.run[read_std_dict](max_runtime_secs=0.5) 120 | # raed_std_stats.print("ns") 121 | print("Sum2:", sum2, len(d2)) 122 | print("Compact read speedup:", raed_std_stats.mean() / read_compact_stats.mean()) 123 | 124 | assert_equal(sum1, sum2) 125 | assert_equal(len(d1), len(d2)) 126 | 127 | _ = corpus 128 | _ = d1^ 129 | _ = d2^ -------------------------------------------------------------------------------- /benchmark_report_string_dict.mojo: -------------------------------------------------------------------------------- 1 | import benchmark 2 | from string_dict import Dict as CompactDict 3 | from collections.dict import KeyElement, Dict as StdDict 4 | from pathlib import cwd 5 | from testing import assert_equal 6 | from csv import CsvBuilder 7 | from helpers.progress_bar import progress_bar 8 | import os 9 | from corpora import * 10 | 11 | alias M = 9 12 | 13 | @fieldwise_init 14 | struct BenchmarkData(Copyable, Movable): 15 | var reports: List[benchmark.Report] 16 | var read_checksums: List[Int] 17 | 18 | fn __init__(out self): 19 | self.reports = List[benchmark.Report]() 20 | self.read_checksums = List[Int]() 21 | 22 | def report_std_benchmarks(corpus: List[String], mut csv_builder: CsvBuilder) -> BenchmarkData: 23 | var benchmark_data = BenchmarkData() 24 | var std_dict = StdDict[String, Int]() 25 | @parameter 26 | fn build_dict(): 27 | var d = StdDict[String, Int]() 28 | for i in range(len(corpus)): 29 | d[corpus[i]] = i 30 | std_dict = d^ 31 | var build_stats = benchmark.run[build_dict](max_runtime_secs=0.5) 32 | csv_builder.push(String(build_stats.mean("ns")), False) 33 | benchmark_data.reports.append(build_stats) 34 | 35 | var sum = 0 36 | @parameter 37 | fn read_dict(): 38 | sum = 0 39 | for i in range(len(corpus)): 40 | try: 41 | sum += std_dict[corpus[i]] 42 | except: 43 | sum += -1 44 | 45 | var read_stats = benchmark.run[read_dict](max_runtime_secs=0.5) 46 | csv_builder.push(String(read_stats.mean("ns")), False) 47 | 
benchmark_data.reports.append(read_stats) 48 | benchmark_data.read_checksums.append(sum) 49 | 50 | @parameter 51 | fn delete_dict(): 52 | for i in range(len(corpus)): 53 | if i % M == 0: 54 | try: 55 | _ = std_dict.pop(corpus[i]) 56 | except: 57 | pass 58 | 59 | var delete_stats = benchmark.run[delete_dict](max_runtime_secs=0.5) 60 | csv_builder.push(String(delete_stats.mean("ns")), False) 61 | benchmark_data.reports.append(delete_stats) 62 | 63 | var read_after_delete_stats = benchmark.run[read_dict](max_runtime_secs=0.5) 64 | csv_builder.push(String(read_after_delete_stats.mean("ns")), False) 65 | benchmark_data.reports.append(read_after_delete_stats) 66 | benchmark_data.read_checksums.append(sum) 67 | 68 | _ = std_dict 69 | 70 | return benchmark_data 71 | 72 | 73 | def report_compact_benchmarks(corpus: List[String], mut csv_builder: CsvBuilder) -> BenchmarkData: 74 | var benchmark_data = BenchmarkData() 75 | var dict = CompactDict[Int]() 76 | @parameter 77 | fn build_dict_nc(): 78 | var d = CompactDict[Int]() 79 | for i in range(len(corpus)): 80 | d.put(corpus[i], i) 81 | dict = d^ 82 | var build_stats_nc = benchmark.run[build_dict_nc](max_runtime_secs=0.5) 83 | csv_builder.push(String(build_stats_nc.mean("ns")), False) 84 | benchmark_data.reports.append(build_stats_nc) 85 | 86 | @parameter 87 | fn build_dict(): 88 | var d = CompactDict[Int](len(corpus)) 89 | for i in range(len(corpus)): 90 | d.put(corpus[i], i) 91 | dict = d^ 92 | var build_stats = benchmark.run[build_dict](max_runtime_secs=0.5) 93 | csv_builder.push(String(build_stats.mean("ns")), False) 94 | benchmark_data.reports.append(build_stats) 95 | 96 | var sum = 0 97 | @parameter 98 | fn read_dict(): 99 | sum = 0 100 | for i in range(len(corpus)): 101 | sum += dict.get(corpus[i], -1) 102 | 103 | var read_stats = benchmark.run[read_dict](max_runtime_secs=0.5) 104 | # var read_checksum = sum 105 | csv_builder.push(String(read_stats.mean("ns")), False) 106 | benchmark_data.reports.append(read_stats) 107 
| benchmark_data.read_checksums.append(sum) 108 | 109 | @parameter 110 | fn delete_dict(): 111 | for i in range(len(corpus)): 112 | if i % M == 0: 113 | dict.delete(corpus[i]) 114 | 115 | var delete_stats = benchmark.run[delete_dict](max_runtime_secs=0.5) 116 | csv_builder.push(String(delete_stats.mean("ns")), False) 117 | benchmark_data.reports.append(delete_stats) 118 | 119 | var read_after_delete_stats = benchmark.run[read_dict](max_runtime_secs=0.5) 120 | # var read_after_delete_checksum = sum 121 | 122 | csv_builder.push(String(read_after_delete_stats.mean("ns")), False) 123 | benchmark_data.reports.append(read_after_delete_stats) 124 | benchmark_data.read_checksums.append(sum) 125 | _ = dict 126 | return benchmark_data 127 | 128 | fn corpus_stats(corpus: List[String], mut csv_builder: CsvBuilder): 129 | csv_builder.push(String(len(corpus)), False) 130 | var min = 100000000 131 | var max = 0 132 | var sum = 0 133 | var count = 0 134 | for i in range(len(corpus)): 135 | var key = corpus[i] 136 | if len(key) == 0: 137 | continue 138 | count += 1 139 | sum += len(key) 140 | if min > len(key): 141 | min = len(key) 142 | if max < len(key): 143 | max = len(key) 144 | var avg = sum / count 145 | csv_builder.push(String(sum), False) 146 | csv_builder.push(String(min), False) 147 | csv_builder.push(String(avg), False) 148 | csv_builder.push(String(max), False) 149 | 150 | fn report_speedup(std: BenchmarkData, compact: BenchmarkData, mut csv_builder: CsvBuilder): 151 | csv_builder.push(String(std.reports[0].mean() / compact.reports[0].mean()), False) 152 | csv_builder.push(String(std.reports[0].mean() / compact.reports[1].mean()), False) 153 | csv_builder.push(String(std.reports[1].mean() / compact.reports[2].mean()), False) 154 | csv_builder.push(String(std.reports[2].mean() / compact.reports[3].mean()), False) 155 | csv_builder.push(String(std.reports[3].mean() / compact.reports[4].mean()), False) 156 | 157 | fn report_checksums_alignment(std: BenchmarkData, compact: 
BenchmarkData, mut csv_builder: CsvBuilder): 158 | csv_builder.push(String(std.read_checksums[0] == compact.read_checksums[0]), False) 159 | csv_builder.push(String(std.read_checksums[1] == compact.read_checksums[1]), False) 160 | 161 | def report(name: String, corpus: List[String], mut csv_builder: CsvBuilder): 162 | csv_builder.push(name, False) 163 | corpus_stats(corpus, csv_builder) 164 | var std_stats = report_std_benchmarks(corpus, csv_builder) 165 | var compact_stats = report_compact_benchmarks(corpus, csv_builder) 166 | report_speedup(std_stats, compact_stats, csv_builder) 167 | report_checksums_alignment(std_stats, compact_stats, csv_builder) 168 | 169 | fn file_exists(path: String) -> Bool: 170 | return os.path.exists(path) 171 | 172 | fn main() raises: 173 | var csv_builder = CsvBuilder( 174 | "Corpus", "Number of keys", "Total bytes", "Min key", "Avg key", "Max key", 175 | "Build stdlib", "Read stdlib", "Delete stdlib", "Read after delete stdlib", 176 | "Build compact nc", "Build compact", "Read compact", "Delete compact", "Read after delete compact", 177 | "Speedup build nc", "Speedup build", "Speedup read", "Speadup delete", "Speedup read after delete", 178 | "Read Checksum", "Read Checksum after delete" 179 | ) 180 | 181 | var names = [ 182 | "Arabic", "Chinese", "English", "French", 183 | "Georgien", "German", "Greek", "Hebrew", 184 | "Hindi", "Japanese", "l33t", "Russian", 185 | "S3", 186 | ] 187 | 188 | var generators = [ 189 | arabic_text_to_keys, chinese_text_to_keys, english_text_to_keys, french_text_to_keys, 190 | georgian_text_to_keys, german_text_to_keys, greek_text_to_keys, hebrew_text_to_keys, 191 | hindi_text_to_keys, japanese_long_keys, l33t_text_to_keys, russian_text_to_keys, 192 | s3_action_names, 193 | ] 194 | 195 | # https://unix.stackexchange.com/questions/213628/where-do-the-words-in-usr-share-dict-words-come-from/798355#798355 196 | var use_system_words = file_exists('/usr/share/dict/words') 197 | 198 | if use_system_words: 199 | 
names.append("Words") 200 | generators.append(system_words_collection) 201 | 202 | 203 | @parameter 204 | fn one_step(i: Int) raises: 205 | report(names[i], generators[i](), csv_builder) 206 | 207 | # Call `report("Arabic", arabic_text_to_keys(), csv_builder)` iterating over names and generators 208 | progress_bar[one_step](n=len(names), prefix="Corpus:", bar_size=40) 209 | 210 | _ = csv_builder^.finish() # NOTE(review): the finished CSV is discarded here - confirm it is not meant to be printed or saved 211 | print("\n") 212 | -------------------------------------------------------------------------------- /benchmark_string_dict.mojo: -------------------------------------------------------------------------------- 1 | import benchmark 2 | from string_dict import Dict as CompactDict 3 | from collections.dict import KeyElement, Dict as StdDict 4 | from pathlib import cwd 5 | from testing import assert_equal 6 | 7 | from corpora import * 8 | 9 | 10 | fn corpus_stats(corpus: List[String]): # Prints the element count plus min/avg/max length and total bytes of the non-empty keys 11 | print("=======Corpus Stats=======") 12 | print("Number of elements:", len(corpus)) 13 | var min = 100000000 14 | var max = 0 15 | var sum = 0 16 | var count = 0 17 | for i in range(len(corpus)): 18 | var key = corpus[i] 19 | if len(key) == 0: # empty keys are left out of count, sum, min and max 20 | continue 21 | count += 1 22 | sum += len(key) 23 | if min > len(key): 24 | min = len(key) 25 | if max < len(key): 26 | max = len(key) 27 | var avg = sum / count # NOTE(review): count stays 0 if every key is empty - confirm the resulting average is acceptable 28 | print("Min key length:", min) 29 | print("Avg key length:", avg) 30 | print("Max key length:", max) 31 | print("Total num of bytes:", sum) 32 | print("\n") 33 | 34 | fn main() raises: 35 | var d1 = CompactDict[Int]() 36 | var d2 = StdDict[String, Int]() 37 | var corpus = french_text_to_keys() 38 | 39 | print("") 40 | corpus_stats(corpus) 41 | 42 | @parameter 43 | fn build_compact_dict(): 44 | var d = CompactDict[Int](len(corpus)) 45 | # var d = CompactDict[Int]() 46 | for i in range(len(corpus)): 47 | d.put(corpus[i], i) 48 | d1 = d^ 49 | 50 | @parameter 51 | fn build_std_dict(): 52 | var d = StdDict[String, Int]() 53 | for i in range(len(corpus)): 54 | d[corpus[i]] = i
55 | d2 = d^ 56 | 57 | print("+++++++Create Dict Benchmark+++++++") 58 | 59 | var build_compact_stats = benchmark.run[build_compact_dict](max_runtime_secs=0.5) 60 | # build_compact_stats.print("ns") 61 | 62 | var build_std_stats = benchmark.run[build_std_dict](max_runtime_secs=0.5) 63 | # build_std_stats.print("ns") 64 | 65 | print("Compact build speedup:", build_std_stats.mean() / build_compact_stats.mean()) 66 | var sum1 = 0 67 | @parameter 68 | fn read_compact_dict(): 69 | sum1 = 0 70 | for i in range(len(corpus)): 71 | sum1 += d1.get(corpus[i], -1) 72 | 73 | # d1.keys.print_keys() 74 | print("+++++++Read Dict Benchmark+++++++") 75 | var read_compact_stats = benchmark.run[read_compact_dict](max_runtime_secs=0.5) 76 | print("Sum1:", sum1, len(d1)) 77 | # read_compact_stats.print("ns") 78 | 79 | var sum2 = 0 80 | @parameter 81 | fn read_std_dict(): 82 | sum2 = 0 83 | for i in range(len(corpus)): 84 | try: 85 | sum2 += d2[corpus[i]] 86 | except: 87 | sum2 += -1 88 | 89 | var raed_std_stats = benchmark.run[read_std_dict](max_runtime_secs=0.5) 90 | # raed_std_stats.print("ns") 91 | print("Sum2:", sum2, len(d2)) 92 | print("Compact read speedup:", raed_std_stats.mean() / read_compact_stats.mean()) 93 | 94 | assert_equal(sum1, sum2) 95 | assert_equal(len(d1), len(d2)) 96 | 97 | var m = 9 98 | @parameter 99 | fn delete_compact_dict(): 100 | for i in range(len(corpus)): 101 | if i % m == 0: 102 | d1.delete(corpus[i]) 103 | 104 | @parameter 105 | fn delete_std_dict(): 106 | for i in range(len(corpus)): 107 | if i % m == 0: 108 | try: 109 | _ = d2.pop(corpus[i]) 110 | except: 111 | pass 112 | 113 | print("+++++++Delete Dict Benchmark+++++++") 114 | 115 | var delete_compact_stats = benchmark.run[delete_compact_dict](max_runtime_secs=0.5) 116 | var delete_std_stats = benchmark.run[delete_std_dict](max_runtime_secs=0.5) 117 | 118 | print("Compact delete speedup:", delete_std_stats.mean() / delete_compact_stats.mean()) 119 | 120 | print("+++++++Read After Delete Dict 
Benchmark+++++++") 121 | 122 | var read_after_delete_compact_stats = benchmark.run[read_compact_dict](max_runtime_secs=0.5) 123 | var read_after_delete_std_stats = benchmark.run[read_std_dict](max_runtime_secs=0.5) 124 | 125 | print("Compact read after delete speedup:", read_after_delete_std_stats.mean() / read_after_delete_compact_stats.mean()) 126 | 127 | print("Sum1:", sum1, "length:", len(d1)) 128 | print("Sum2:", sum2, "length:", len(d2)) 129 | 130 | assert_equal(sum1, sum2) 131 | assert_equal(len(d1), len(d2)) 132 | 133 | _ = corpus 134 | _ = d1^ 135 | _ = d2^ -------------------------------------------------------------------------------- /checkout_remote_modules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function check_out_remote_module() ( # usage: <repo-url> <path>[=<module-name>]...; the body is a subshell, so exit/cd stay local to it 4 | rurl="$1" 5 | shift 6 | declare -a paths 7 | declare -a module_names 8 | for var in "$@" 9 | do 10 | IFS="=" 11 | read -ra module_name_components <<< "$var" 12 | components_count=${#module_name_components[@]} 13 | path=${module_name_components[0]} 14 | module_name=${module_name_components[$components_count-1]} 15 | paths=("${paths[@]}" "$path") 16 | module_names=("${module_names[@]}" "$module_name") 17 | done 18 | IFS=" " 19 | 20 | for module_name in "${module_names[@]}" 21 | do 22 | rm -rf "../${module_name:?}" # :? aborts instead of expanding to "../" when the name is empty 23 | done 24 | 25 | current_date_time=$(date) 26 | echo "URL: $rurl" 27 | git clone -n --depth=1 --filter=tree:0 "$rurl" || exit 1 28 | cd "${rurl##*/}" || exit 1 # without the clone directory the git commands below would act on the enclosing repository 29 | git sparse-checkout set --no-cone "${paths[@]}" || exit 1 30 | git checkout || exit 1 31 | 32 | for i in "${!paths[@]}" 33 | do 34 | module_name=${module_names[$i]} 35 | path=${paths[$i]} 36 | cp -R "./$path" "../../$module_name" 37 | echo "$current_date_time" > "../../$module_name/.checkoutinfo" 38 | echo "URL: $rurl" >> "../../$module_name/.checkoutinfo" 39 | echo "Path: $path" >> "../../$module_name/.checkoutinfo" 40 | done 41 | cd ../ 42 | ) 43 | 44 | function checkout()( 45 | # Add check out remote module calls here 46 | 47 |
check_out_remote_module "https://github.com/mzaks/mojo-csv" "csv" 48 | ) 49 | 50 | mkdir -p "_deps" 51 | cd "_deps" 52 | 53 | checkout 54 | 55 | rm -rf "../_deps" -------------------------------------------------------------------------------- /corpora/__init__.mojo: -------------------------------------------------------------------------------- 1 | from pathlib import cwd, Path 2 | 3 | fn english_text_to_keys() raises -> List[String]: 4 | return String('A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. I am alone, and feel the charm of existence in this spot, which was created for the bliss of souls like mine. I am so happy, my dear friend, so absorbed in the exquisite sense of mere tranquil existence, that I neglect my talents. I should be incapable of drawing a single stroke at the present moment; and yet I feel that I never was a greater artist than now. When, while the lovely valley teems with vapour around me, and the meridian sun strikes the upper surface of the impenetrable foliage of my trees, and but a few stray gleams steal into the inner sanctuary, I throw myself down among the tall grass by the trickling stream; and, as I lie close to the earth, a thousand unknown plants are noticed by me: when I hear the buzz of the little world among the stalks, and grow familiar with the countless indescribable forms of the insects and flies, then I feel the presence of the Almighty, who formed us in his own image, and the breath of that universal love which bears and sustains us, as it floats around us in an eternity of bliss; and then, my friend, when darkness overspreads my eyes, and heaven and earth seem to dwell in my soul and absorb its power, like the form of a beloved mistress, then I often think with longing, Oh, would I could describe these conceptions, could impress upon paper all that is living so full and warm within me, that it might be the mirror of my soul, as my soul 
is the mirror of the infinite God! O my friend -- but it is too much for my strength -- I sink under the weight of the splendour of these visions! A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. I am alone, and feel the charm of existence in this spot, which was created for the bliss of souls like mine. I am so happy, my dear friend, so absorbed in the exquisite sense of mere tranquil existence, that I neglect my talents. I should be incapable of drawing a single stroke at the present moment; and yet I feel that I never was a greater artist than now. When, while the lovely valley teems with vapour around me, and the meridian sun strikes the upper surface of the impenetrable foliage of my trees, and but a few stray gleams steal into the inner sanctuary, I throw myself down among the tall grass by the trickling stream; and, as I lie close to the earth, a thousand unknown plants are noticed by me: when I hear the buzz of the little world among the stalks, and grow familiar with the countless indescribable forms of the insects and flies, then I feel the presence of the Almighty, who formed us in his own image, and the breath of that universal love which bears and sustains us, as it floats around us in an eternity of bliss; and then, my friend, when darkness overspreads my eyes, and heaven and earth seem to dwell in my soul and absorb its power, like the form of a beloved mistress, then I often think with longing, Oh, would I could describe these conceptions, could impress upon paper all that is living so full and warm within me, that it might be the mirror of my soul, as my soul is the mirror of the infinite God! O my friend -- but it is too much for my strength -- I sink under the weight of the splendour of these visions! A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. 
I am alone, and feel the charm of existence in this spot, which was created for the bliss of souls like mine. I am so happy, my dear friend, so absorbed in the exquisite sense of mere tranquil existence, that I neglect my talents. I should be incapable of drawing a single stroke at the present moment; and yet I feel that I never was a greater artist than now. When, while the lovely valley teems with vapour around me, and the meridian sun strikes the upper surface of the impenetrable foliage of my trees, and but a few stray gleams steal into the inner sanctuary, I throw myself down among the tall grass by the trickling stream; and, as I lie close to the earth, a thousand unknown plants are noticed by me: when I hear the buzz of the little world among the stalks, and grow familiar with the countless indescribable forms of the insects and flies, then I feel the presence of the Almighty, who formed us in his own image, and the breath of that universal love which bears and sustains us, as it floats around us in an eternity of bliss; and then, my friend, when darkness overspreads my eyes, and heaven and earth seem to dwell in my soul and absorb its power, like the form of a beloved mistress, then I often think with longing, Oh, would I could describe these conceptions, could impress upon paper all that is living so full and warm within me, that it might be the mirror of my soul, as my soul is the mirror of the infinite God! O my friend -- but it is too much for my strength -- I sink under the weight of the splendour of these visions!A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. 
I am alone, and feel the charm of existence in this spot, which was created for the bliss of souls').split(" ") 5 | 6 | fn greek_text_to_keys() raises -> List[String]: 7 | return (cwd() / "corpora" / "greek.txt").read_text().replace("\n", " ").split(" ") 8 | 9 | fn hebrew_text_to_keys() raises -> List[String]: 10 | return (cwd() / "corpora" / "hebrew.txt").read_text().replace("\n", " ").split(" ") 11 | 12 | fn arabic_text_to_keys() raises -> List[String]: 13 | return (cwd() / "corpora" / "arabic.txt").read_text().replace("\n", " ").split(" ") 14 | 15 | fn l33t_text_to_keys() raises -> List[String]: 16 | return (cwd() / "corpora" / "l33t.txt").read_text().replace("\n", " ").split(" ") 17 | 18 | fn georgian_text_to_keys() raises -> List[String]: 19 | return (cwd() / "corpora" / "georgian.txt").read_text().replace("\n", " ").split(" ") 20 | 21 | fn chinese_text_to_keys() raises -> List[String]: 22 | return (cwd() / "corpora" / "chinese.txt").read_text().replace("\n", " ").split(" ") 23 | 24 | fn french_text_to_keys() raises -> List[String]: 25 | return (cwd() / "corpora" / "french.txt").read_text().replace("\n", " ").split(" ") 26 | 27 | fn hindi_text_to_keys() raises -> List[String]: 28 | return (cwd() / "corpora" / "hindi.txt").read_text().replace("\n", " ").split(" ") 29 | 30 | fn russian_text_to_keys() raises -> List[String]: 31 | return String('Проснувшись однажды утром после беспокойного сна, Грегор Замза обнаружил, что он у себя в постели превратился в страшное насекомое. Лежа на панцирнотвердой спине, он видел, стоило ему приподнять голову, свой коричневый, выпуклый, разделенный дугообразными чешуйками живот, на верхушке которого еле держалось готовое вот-вот окончательно сползти одеяло. Его многочисленные, убого тонкие по сравнению с остальным телом ножки беспомощно копошились у него перед глазами. «Что со мной случилось?» – подумал он. Это не было сном. 
Его комната, настоящая, разве что слишком маленькая, но обычная комната, мирно покоилась в своих четырех хорошо знакомых стенах. Над столом, где были разложены распакованные образцы сукон – Замза был коммивояжером, – висел портрет, который он недавно вырезал из иллюстрированного журнала и вставил в красивую золоченую рамку. На портрете была изображена дама в меховой шляпе и боа, она сидела очень прямо и протягивала зрителю тяжелую меховую муфту, в которой целиком исчезала ее рука. Затем взгляд Грегора устремился в окно, и пасмурная погода – слышно было, как по жести подоконника стучат капли дождя – привела его и вовсе в грустное настроение. «Хорошо бы еще немного поспать и забыть всю эту чепуху», – подумал он, но это было совершенно неосуществимо, он привык спать на правом боку, а в теперешнем своем состоянии он никак не мог принять этого положения. С какой бы силой ни поворачивался он на правый бок, он неизменно сваливался опять на спину. Закрыв глаза, чтобы не видеть своих барахтающихся ног, он проделал это добрую сотню раз и отказался от этих попыток только тогда, когда почувствовал какую-то неведомую дотоле, тупую и слабую боль в боку. «Ах ты, господи, – подумал он, – какую я выбрал хлопотную профессию! Изо дня в день в разъездах. Деловых волнений куда больше, чем на месте, в торговом доме, а кроме того, изволь терпеть тяготы дороги, думай о расписании поездов, мирись с плохим, нерегулярным питанием, завязывай со все новыми и новыми людьми недолгие, никогда не бывающие сердечными отношения. Черт бы побрал все это!» Он почувствовал вверху живота легкий зуд; медленно подвинулся на спине к прутьям кровати, чтобы удобнее было поднять голову; нашел зудевшее место, сплошь покрытое, как оказалось, белыми непонятными точечками; хотел было ощупать это место одной из ножек, но сразу отдернул ее, ибо даже простое прикосновение вызвало у него, Грегора, озноб. Он соскользнул в прежнее свое положение. «От этого раннего вставания, – подумал он, – можно совсем обезуметь. 
Человек должен высыпаться. Другие коммивояжеры живут, как одалиски. Когда я, например, среди дня возвращаюсь в гостиницу, чтобы переписать полученные заказы, эти господа только завтракают. А осмелься я вести себя так, мои хозяин выгнал бы меня сразу. Кто знает, впрочем, может быть, это было бы даже очень хорошо для меня. Если бы я не сдерживался ради родителей, я бы давно заявил об уходе, я бы подошел к своему хозяину и выложил ему все, что о нем думаю. Он бы так и свалился с конторки! Странная у него манера – садиться на конторку и с ее высоты разговаривать со служащим, который вдобавок вынужден подойти вплотную к конторке из-за того, что хозяин туг на ухо. Однако надежда еще не совсем потеряна: как только я накоплю денег, чтобы выплатить долг моих родителей – на это уйдет еще лет пять-шесть, – я так и поступлю. Тут-то мы и распрощаемся раз и навсегда. А пока что надо подниматься, мой поезд отходит в пять». И он взглянул на будильник, который тикал на сундуке. «Боже правый!» – подумал он. Было половина седьмого, и стрелки спокойно двигались дальше, было даже больше половины, без малого уже три четверти. Неужели будильник не звонил? С кровати было видно, что он поставлен правильно, на четыре часа; и он, несомненно, звонил. Но как можно было спокойно спать под этот сотрясающий мебель трезвон? Ну, спал-то он неспокойно, но, видимо, крепко. Однако что делать теперь? Следующий поезд уходит в семь часов; чтобы поспеть на него, он должен отчаянно торопиться, а набор образцов еще не упакован, да и сам он отнюдь не чувствует себя свежим и легким на подъем. И даже поспей он на поезд, хозяйского разноса ему все равно не избежать – ведь рассыльный торгового дома дежурил у пятичасового поезда и давно доложил о его, Грегора, опоздании. Рассыльный, человек бесхарактерный и неумный, был ставленником хозяина. А что, если сказаться больным? Но это было бы крайне неприятно и показалось бы подозрительным, ибо за пятилетнюю свою службу Грегор ни разу еще не болел. 
Хозяин, конечно, привел бы врача больничной кассы и стал попрекать родителей сыном-лентяем, отводя любые возражения ссылкой на этого врача, по мнению которого все люди на свете совершенно здоровы и только не любят работать. И разве в данном случае он был бы так уж неправ? Если не считать сонливости, действительно странной после такого долгого сна, Грегор и в самом деле чувствовал себя превосходно и был даже чертовски голоден.Проснувшись однажды утром после беспокойного сна, Грегор Замза обнаружил, что он у себя в постели превратился в страшное насекомое. Лежа на панцирнотвердой спине, он видел, стоило ему приподнять голову, свой коричневый, выпуклый, разделенный дугообразными чешуйками живот, на верхушке которого еле держалось готовое вот-вот окончательно сползти одеяло. Его многочисленные, убого тонкие по сравнению с остальным телом ножки беспомощно копошились у него перед глазами. «Что со мной случилось?» – подумал он. Это не было сном. Его комната, настоящая, разве что слишком маленькая, но обычная комната, мирно покоилась в своих четырех хорошо знакомых стенах. Над столом, где были разложены распакованные образцы сукон – Замза был коммивояжером, – висел портрет, который он недавно вырезал из иллюстрированного журнала и вставил в красивую золоченую рамку. На портрете была изображена дама в меховой шляпе и боа, она сидела очень прямо и протягивала зрителю тяжелую меховую муфту, в которой целиком исчезала ее рука. Затем взгляд Грегора устремился в окно, и пасмурная погода – слышно было, как по жести подоконника стучат капли дождя – привела его и вовсе в грустное настроение. «Хорошо бы еще немного поспать и забыть всю эту чепуху», – подумал он, но это было совершенно неосуществимо, он привык спать на правом боку, а в теперешнем своем состоянии он никак не мог принять этого положения. 
С какой бы силой ни поворачивался он на правый бок, он неизменно сваливался опять на спину.').split(" ") 32 | 33 | fn german_text_to_keys() raises -> List[String]: 34 | return String('Weit hinten, hinter den Wortbergen, fern der Länder Vokalien und Konsonantien leben die Blindtexte. Abgeschieden wohnen sie in Buchstabhausen an der Küste des Semantik, eines großen Sprachozeans. Ein kleines Bächlein namens Duden fließt durch ihren Ort und versorgt sie mit den nötigen Regelialien. Es ist ein paradiesmatisches Land, in dem einem gebratene Satzteile in den Mund fliegen. Nicht einmal von der allmächtigen Interpunktion werden die Blindtexte beherrscht – ein geradezu unorthographisches Leben. Eines Tages aber beschloß eine kleine Zeile Blindtext, ihr Name war Lorem Ipsum, hinaus zu gehen in die weite Grammatik. Der große Oxmox riet ihr davon ab, da es dort wimmele von bösen Kommata, wilden Fragezeichen und hinterhältigen Semikoli, doch das Blindtextchen ließ sich nicht beirren. Es packte seine sieben Versalien, schob sich sein Initial in den Gürtel und machte sich auf den Weg. Als es die ersten Hügel des Kursivgebirges erklommen hatte, warf es einen letzten Blick zurück auf die Skyline seiner Heimatstadt Buchstabhausen, die Headline von Alphabetdorf und die Subline seiner eigenen Straße, der Zeilengasse. Wehmütig lief ihm eine rhetorische Frage über die Wange, dann setzte es seinen Weg fort. Unterwegs traf es eine Copy. Die Copy warnte das Blindtextchen, da, wo sie herkäme wäre sie zigmal umgeschrieben worden und alles, was von ihrem Ursprung noch übrig wäre, sei das Wort "und" und das Blindtextchen solle umkehren und wieder in sein eigenes, sicheres Land zurückkehren. Doch alles Gutzureden konnte es nicht überzeugen und so dauerte es nicht lange, bis ihm ein paar heimtückische Werbetexter auflauerten, es mit Longe und Parole betrunken machten und es dann in ihre Agentur schleppten, wo sie es für ihre Projekte wieder und wieder mißbrauchten. 
Und wenn es nicht umgeschrieben wurde, dann benutzen Sie es immernoch. Weit hinten, hinter den Wortbergen, fern der Länder Vokalien und Konsonantien leben die Blindtexte. Abgeschieden wohnen sie in Buchstabhausen an der Küste des Semantik, eines großen Sprachozeans. Ein kleines Bächlein namens Duden fließt durch ihren Ort und versorgt sie mit den nötigen Regelialien. Es ist ein paradiesmatisches Land, in dem einem gebratene Satzteile in den Mund fliegen. Nicht einmal von der allmächtigen Interpunktion werden die Blindtexte beherrscht – ein geradezu unorthographisches Leben. Eines Tages aber beschloß eine kleine Zeile Blindtext, ihr Name war Lorem Ipsum, hinaus zu gehen in die weite Grammatik. Der große Oxmox riet ihr davon ab, da es dort wimmele von bösen Kommata, wilden Fragezeichen und hinterhältigen Semikoli, doch das Blindtextchen ließ sich nicht beirren. Es packte seine sieben Versalien, schob sich sein Initial in den Gürtel und machte sich auf den Weg. Als es die ersten Hügel des Kursivgebirges erklommen hatte, warf es einen letzten Blick zurück auf die Skyline seiner Heimatstadt Buchstabhausen, die Headline von Alphabetdorf und die Subline seiner eigenen Straße, der Zeilengasse. Wehmütig lief ihm eine rhetorische Frage über die Wange, dann setzte es seinen Weg fort. Unterwegs traf es eine Copy. Die Copy warnte das Blindtextchen, da, wo sie herkäme wäre sie zigmal umgeschrieben worden und alles, was von ihrem Ursprung noch übrig wäre, sei das Wort "und" und das Blindtextchen solle umkehren und wieder in sein eigenes, sicheres Land zurückkehren. Doch alles Gutzureden konnte es nicht überzeugen und so dauerte es nicht lange, bis ihm ein paar heimtückische Werbetexter auflauerten, es mit Longe und Parole betrunken machten und es dann in ihre Agentur schleppten, wo sie es für ihre Projekte wieder und wieder mißbrauchten. Und wenn es nicht umgeschrieben wurde, dann benutzen Sie es immernoch. 
Weit hinten, hinter den Wortbergen, fern der Länder Vokalien und Konsonantien leben die Blindtexte. Abgeschieden wohnen sie in Buchstabhausen an der Küste des Semantik, eines großen Sprachozeans. Ein kleines Bächlein namens Duden fließt durch ihren Ort und versorgt sie mit den nötigen Regelialien. Es ist ein paradiesmatisches Land, in dem einem gebratene Satzteile in den Mund fliegen. Nicht einmal von der allmächtigen Interpunktion werden die Blindtexte beherrscht – ein geradezu unorthographisches Leben. Eines Tages aber beschloß eine kleine Zeile Blindtext, ihr Name war Lorem Ipsum, hinaus zu gehen in die weite Grammatik. Der große Oxmox riet ihr davon ab, da es dort wimmele von bösen Kommata, wilden Fragezeichen und hinterhältigen Semikoli, doch das Blindtextchen ließ sich nicht beirren. Es packte seine sieben Versalien, schob sich sein Initial in den Gürtel und machte sich auf den Weg. Als es die ersten Hügel des Kursivgebirges erklommen hatte, warf es einen letzten Blick zurück auf die Skyline seiner Heimatstadt Buchstabhausen, die Headline von Alphabetdorf und die Subline seiner eigenen Straße, der Zeilengasse. Wehmütig lief ihm eine rhetorische Frage über die Wange, dann setzte es seinen Weg fort. Unterwegs traf es eine Copy. Die Copy warnte das Blindtextchen, da, wo sie herkäme wäre sie zigmal umgeschrieben worden und alles, was von ihrem Ursprung noch übrig wäre, sei das Wort "und" und das Blindtextchen solle umkehren und wieder in sein eigenes, sicheres Land zurückkehren. Doch alles Gutzureden konnte es nicht überzeugen und so dauerte es nicht lange, bis ihm ein paar heimtückische Werbetexter auflauerten, es mit Longe und Parole betrunken machten und es dann in ihre Agentur schleppten, wo sie es für ihre Projekte wieder und wieder mißbrauchten. Und wenn es nicht umgeschrieben wurde, dann benutzen Sie es immernoch.Weit hinten, hinter den Wortbergen, fern der Länder Vokalien und Konsonantien leben die Blindtexte. 
Abgeschieden wohnen sie in Buchstabhausen an der Küste des Semantik, eines großen Sprachozeans. Ein kleines Bächlein namens Duden fließt durch ihren Ort und versorgt sie mit den nötigen Regelialien. Es ist ein paradiesmatisches Land, in dem einem gebratene Satzteile in den Mund fliegen. Nicht einmal von der allmächtigen Interpunktion werden die Blindtexte beherrscht – ein geradezu unorthographisches Leben. Eines Tages aber beschloß eine kleine Zeile Blindtext, ihr Name war Lorem Ipsum, hinaus zu gehen in die weite Grammatik. Der große Oxmox riet ihr davon ab, da es dort wimmele von bösen Kommata, wilden Fragezeichen und hinterhältigen Semikoli, doch das Blindtextchen ließ sich nicht beirren. Es packte seine sieben Versalien, schob sich sein Initial in den Gürtel und machte sich auf den Weg. Als es die ersten Hügel des Kursivgebirges erklommen hatte, warf es einen').split(" ") 35 | 36 | fn japanese_long_keys() raises -> List[String]: 37 | return String('米くを舵4物委らご氏松ハナテフ月関ソ時平ふいの博情れじフ牟万い元56園フメヤオ試図ロツヤ未備王こと傷喫羅踊んゆし。栃ユヱオ書著作ユソツロ英祉業ア大課ご権質フべ空8午キ切軟づン著郎そゃす格町採ヱオマコ処8付国ムハチア究表でなだ際無ロミヱ地兵ぴげ庭体すク発抜爆位や。楽富むゆず盛航カナセ携代ハ本高きた員59今骸ンラえぜ城解イケ穴訴ぽぎ属住ヤケトヌ抱点ト広注厚でて。 国リ出難セユメ軍手ヘカウ画形サヲシ猛85用ヲキミ心死よしと身処ケヨミオ教主ーぽ事業んく字国たさょ図能シミスヤ社8板ル岡世58次戒知院んれり。市メ誘根カ数問禁竹ゃれえみ給辺のでみき今二ぎさ裕止過こクすと無32郎所ラた生展ヌヘス成度慣葬勇厘ばてか。室ゃ下携疲ム色権がぽりっ銃週ノオ姫千テム健蔵い研手ッ放容ル告属め旅側26企サノヨ宅都福ぞ通待ちぴね種脳イど労希望義通むン。 罰しい続負せ著低たル異師ユハワ東添質コチ転集ルヤ雇聴約ヒ前統らた情厳ゆさでや真胸や有披暑棚豆ゆぼたけ。盛ワセロナ情競クるっわ講3音ずをせ少地めしぜょ手63明視れに判企ヒヌエソ求総58特本ね井比ユラキ禁頭馬るゅリす能率率かがさわ。葉サソ医郡ヱヘソ労帰ナケスミ救写ワヘ株審ネヒニミ安逮イ人画ラ涯車はラ極騒りなド件5級ンかふー劇41著ぱぐ凱討だ文世ぶづどま界善魅マ渓経競融れがや。 連ーぜらご模分ッ視外ばフく運発群ほぼづ育越一ほごクけ案募ヲイソ治会イせフ製君ぜた漢村1変リヒ構5際ツ御文ヲ臭入さドぼ代書ハケ引技ろみれ回観注倉徹ぱ。論ラづ海要サ情座ゃり齢宣ラモエ芸化エマホ覧催回ら戦69本外ト葬岳な政画か連針ぴリフず。約ル闘辺ぽ経2応掲ホサアラ塾小コラ画決クノオ上室レヌヱ勝逮ぜるえむ責豊チノ明意ひけ訟6碁草メタチエ財午召喝塊む。 決めでわ名金つけレわ続人県約ぽぼす尾腹ユサ戦載リシ護賀レモフツ重涯ニ治者むんっみ職更カタチレ提話2何ワ責東まけげふ能政ヌ供禁がびてわ提改倶れめ。読み担後ぽ安加ぎ論鹿ツ統最お気麻月つじもあ竜思いろめ判必満理トコ文連ムイウハ寄串ざほびー。文ゆこっ向27年メイ便能ノセヲ待1王スねたゆ伝派んね点過カト治読よにきべ使人スシ都言え阻8割べづえみ注引敷的岳犠眠どそ。 
学用イだ医客開ロ供界もぞだ実隆モイヌ務坂ナコヲ権野ろづ初場ぱ低会づぱじ新倒コ化政レ止奮浸猪ッわえづ。形いやリ要帰ほまむだ業領スル必打さ島14巻リ集日ネヘホタ面幅ち写上そぴ円図ムタコモ報使イわざと会催ヤヲ康証をドぶレ盤岡ホハツ作29管しをめ公問懐蓄っさ。来ゆぼあぱ投秋シ語右ぐ身靖かば辛握捕家記ヘワ神岐囲づ毘観メテクツ政73夕罪57需93誌飲査仁さ。 変レめ束球よんま会特ヱコ聞重だ史純ーどる件32浦レぴよゃ上強ネラリロ査従セユヤ専棋光レ作表ひぶ予正ぜーな誉確フス函6報円ス進治ね能営済否雄でわょ。42生型ば着続ア短実ぎおめび前環闘ラヤヲル診均っとにの声公トヱテマ整試椅情久妊舌頃ざとっく。品キチトテ阿国ラら受87世ヲフセリ川86個ーょぼげ危子ヘレカメ無会ぱかへ事通んかて電条ロツ徴商ぶぞそを居暑メ害広せもがり禁応レミヲ応響割壮憶はぱ。 千れンが織財メニ況界ネトレミ学豊フオホシ近月レたやご的罪ょな菱技ちる警栗エセ提89林危氷48参ア説森クキヒヱ薬社ホコエリ負和ルび紀下ケミイ掲歳特ごず扱底ク護木連ちクを各形ばすか。変ぱなれ町7融ヌ街準以タユヘム質裕ぶで遺語俊ぎずょ事金文キ写多山ーゆに歩帯すで会世クぜよ論写ヲ達71林危氷5間続ぎぜび高怠す。 係8青け応著ミ戦条ナヘネカ思79未ぎ算伊をゃ泉人ーづ需説っ畑鹿27軽ラソツ権2促千護ルロナカ開国ケ暴嶋ご池表だ。佐フナ訪麻はてせば勝効をあ医戦画とさわぴ者両すいあ並来んば載食ぴ件友頂業へえぞ魚祝ネラ聞率スコリケ始全ンこび夫出ドふ今布うぎふゅ実克即哉循やしんな。 暮す備54依紀てッん末刊と柔称むてス無府ケイ変壌をぱ汁連フマス海世ヌ中負知問ナヘケ純推ひ読着ヒ言若私軽れ。掲けフむ王本オコ線人をっさ必和断セソヲハ図芸ちかな防長りぶは投新意相ツ並5余セ職岳ぞ端古空援そ。森ヨエチ題5東っ自兄ち暴5近鹿横ト的京ハ安氷ナキ深際ぎ並節くスむの権工ほルせ京49効タムチ処三ぞぴラ済国ずっ文経ヘトミ水分準そが。').split(" ") 38 | 39 | fn s3_action_names() raises -> List[String]: 40 | return String('AbortMultipartUpload CompleteMultipartUpload CopyObject CreateBucket CreateMultipartUpload DeleteBucket DeleteBucketAnalyticsConfiguration DeleteBucketCors DeleteBucketEncryption DeleteBucketIntelligentTieringConfiguration DeleteBucketInventoryConfiguration DeleteBucketLifecycle DeleteBucketMetricsConfiguration DeleteBucketOwnershipControls DeleteBucketPolicy DeleteBucketReplication DeleteBucketTagging DeleteBucketWebsite DeleteObject DeleteObjects DeleteObjectTagging DeletePublicAccessBlock GetBucketAccelerateConfiguration GetBucketAcl GetBucketAnalyticsConfiguration GetBucketCors GetBucketEncryption GetBucketIntelligentTieringConfiguration GetBucketInventoryConfiguration GetBucketLifecycle GetBucketLifecycleConfiguration GetBucketLocation GetBucketLogging GetBucketMetricsConfiguration GetBucketNotification GetBucketNotificationConfiguration GetBucketOwnershipControls GetBucketPolicy GetBucketPolicyStatus GetBucketReplication GetBucketRequestPayment GetBucketTagging GetBucketVersioning GetBucketWebsite GetObject GetObjectAcl GetObjectAttributes GetObjectLegalHold 
GetObjectLockConfiguration GetObjectRetention GetObjectTagging GetObjectTorrent GetPublicAccessBlock HeadBucket HeadObject ListBucketAnalyticsConfigurations ListBucketIntelligentTieringConfigurations ListBucketInventoryConfigurations ListBucketMetricsConfigurations ListBuckets ListMultipartUploads ListObjects ListObjectsV2 ListObjectVersions ListParts PutBucketAccelerateConfiguration PutBucketAcl PutBucketAnalyticsConfiguration PutBucketCors PutBucketEncryption PutBucketIntelligentTieringConfiguration PutBucketInventoryConfiguration PutBucketLifecycle PutBucketLifecycleConfiguration PutBucketLogging PutBucketMetricsConfiguration PutBucketNotification PutBucketNotificationConfiguration PutBucketOwnershipControls PutBucketPolicy PutBucketReplication PutBucketRequestPayment PutBucketTagging PutBucketVersioning PutBucketWebsite PutObject PutObjectAcl PutObjectLegalHold PutObjectLockConfiguration PutObjectRetention PutObjectTagging PutPublicAccessBlock RestoreObject SelectObjectContent UploadPart UploadPartCopy WriteGetObjectResponse, CreateAccessPoint CreateAccessPointForObjectLambda CreateBucket CreateJob CreateMultiRegionAccessPoint DeleteAccessPoint DeleteAccessPointForObjectLambda DeleteAccessPointPolicy DeleteAccessPointPolicyForObjectLambda DeleteBucket DeleteBucketLifecycleConfiguration DeleteBucketPolicy DeleteBucketReplication DeleteBucketTagging DeleteJobTagging DeleteMultiRegionAccessPoint DeletePublicAccessBlock DeleteStorageLensConfiguration DeleteStorageLensConfigurationTagging DescribeJob DescribeMultiRegionAccessPointOperation GetAccessPoint GetAccessPointConfigurationForObjectLambda GetAccessPointForObjectLambda GetAccessPointPolicy GetAccessPointPolicyForObjectLambda GetAccessPointPolicyStatus GetAccessPointPolicyStatusForObjectLambda GetBucket GetBucketLifecycleConfiguration GetBucketPolicy GetBucketReplication GetBucketTagging GetBucketVersioning GetJobTagging GetMultiRegionAccessPoint GetMultiRegionAccessPointPolicy 
GetMultiRegionAccessPointPolicyStatus GetMultiRegionAccessPointRoutes GetPublicAccessBlock GetStorageLensConfiguration GetStorageLensConfigurationTagging ListAccessPoints ListAccessPointsForObjectLambda ListJobs ListMultiRegionAccessPoints ListRegionalBuckets ListStorageLensConfigurations PutAccessPointConfigurationForObjectLambda PutAccessPointPolicy PutAccessPointPolicyForObjectLambda PutBucketLifecycleConfiguration PutBucketPolicy PutBucketReplication PutBucketTagging PutBucketVersioning PutJobTagging PutMultiRegionAccessPointPolicy PutPublicAccessBlock PutStorageLensConfiguration PutStorageLensConfigurationTagging SubmitMultiRegionAccessPointRoutes UpdateJobPriority UpdateJobStatus').split(" ") 41 | 42 | # https://unix.stackexchange.com/questions/213628/where-do-the-words-in-usr-share-dict-words-come-from/798355#798355 43 | fn system_words_collection() raises -> List[String]: 44 | return Path("/usr/share/dict/words").read_text().split("\n") -------------------------------------------------------------------------------- /corpora/arabic.txt: -------------------------------------------------------------------------------- 1 | ثم أسر فمرّ لأداء, مع ومضى فاتّبع قبل. وزارة التّحول عن الى, كان أن وقوعها، الإطلاق الدّفاع, ودول الثالث، لتقليعة الى عن. أم عرض ميناء شواطيء استدعى, تونس بقسوة وإقامة هو نفس, ميناء بتخصيص الإنذار، أم كما. عل العدّ وإيطالي أما, ٣٠ فعل دارت وبعد. 2 | أي بالعمل والقرى قبل. ونتج الجنوب إيو أم. أن ولم الجو واحدة للإتحاد. هاربر العالم جُل بل, النفط الإحتفاظ الأوربيين ثم قام. وبغطاء الأوربيين حتى هو, أسر و شاسعة المؤلّفة, أمدها وإقامة العالم، مع يبق. تم وقامت تكاليف وباستثناء قبل, أدنى بأيدي الدولارات من حيث. 3 | كثيرة الثقيلة ٣٠ هذه. إذ قادة كنقطة الشهير غير. تمهيد مساعدة التجارية ان وقد, واُسدل بأضرار إيطاليا ثم بعض. كما مع أوسع والحزب ولكسمبورغ, جُل قامت باستخدام بـ. بتطويق والكساد والنرويج لها ان, وتم وفرنسا بالولايات ما, خيار فسقط يتبقّ قام و. بل هُزم بريطانيا الأوربيين فصل, عل بعض مئات اتفاق الربيع،, إذ الأحمر مقاومة ومن. 
حدى ويتّفق أوروبا إذ, خيار بقسوة تحرّك ٣٠ فقد. 4 | بل الأمم للمجهود الشّعبين كلّ. مئات بالرغم يتم أن. من حيث وانتهاءً الأمريكي, تم كلّ نقطة وحرمان الولايات. هو لمّ السيء عشوائية. 5 | من قتيل، وبولندا الأمريكية مكن, هُزم الأولية انتصارهم حين بـ. المنتصر الأهداف كما بل. ذات لم فكانت لعملة المتاخمة. حول بل للصين لإعادة, عرض أن وشعار الانجليزية. مكن حقول القادة كل. جيوب وبعد الشّعبين جعل أم, عل معاملة أفريقيا بعض, والفلبين ويكيبيديا فصل ٣٠. 6 | وعلى الأمور تحت أم. بين أم المضي والفلبين انتصارهم, الأرض الأراضي انه ان, عن أخر وإقامة الشتاء انتباه. وفي تشكيل فرنسا تشيكوسلوفاكيا ان, في وقرى ليركز وقدّموا مدن. بحث الباهضة الإتفاقية مع, حول ٣٠ أدوات بالفشل ويكيبيديا, أثره، بأيدي بـ وتم. مرجع العصبة الحدود على و. 7 | فصل من حصدت دأبوا الأبرياء, ان مرجع الأرواح عدد, هذا دارت الأمم في. إبّان الإقتصادية قبل بل, بين ان كرسي اكتوبر وبلجيكا،. حتى ما الثالث العمليات العالمية, عل بعد أراض الدول قائمة, من إعمار واتّجه للمجهود عدد. في وجهان تطوير ليرتفع دنو, يتم ببعض حاول المجتمع أم. 8 | أوزار المارق الشهير قد بلا, وحرمان ديسمبر إذ تحت. حدى عن شرسة إعلان الفترة. إذ دخول إستيلاء هذه, ٣٠ هذا وترك لأداء. لإعادة جزيرتي باستخدام من لكل. يقوم بالفشل بريطانيا، شيء أي, كانت بقسوة كل دول, الضغوط الصفحات نفس قد. بمحاولة الأثنان ما ذات. 9 | يتم عن واتّجه المنتصر, بل أوزار الأبرياء وبريطانيا كلا. حدى عن فسقط وصافرات, لها لم شرسة وقامت الجنود. أم الأخذ وانهاء بها, مما جيما فكان مقاومة و. المشترك شموليةً كلّ و, التّحول والمانيا بعد بـ. دفّة وبعدما جديداً فعل ثم, قبل وانهاء الهجوم ان, من وقوعها، المزيفة كلا. 10 | كثيرة بمعارضة ان ذات, الا لعملة بمعارضة الإيطالية لم. جدول المؤلّفة حدى من. قام في الأمور ديسمبر الاندونيسية, كانتا وبولندا جُل ثم. بتحدّي البشريةً كلّ بـ, مايو للصين فهرست هذا ثم, فبعد المضي اوروبا ان عدم. بعد قد ميناء وبداية واتّجه, المارق الأوروبية أي أضف. و بأيدي محاولات جُل, تحت اعلان وأكثرها إذ. لم بعد ليركز الفرنسية. 
-------------------------------------------------------------------------------- /corpora/chinese.txt: -------------------------------------------------------------------------------- 1 | 巡済思真報就少健話新能長載画。条護超館講商法何失転絶障報東民政浜六青聞。木送月定枚性校増予体労根援住話広。親氏創再信止園南質作海資著制映制。上徳次証禁込福画給査北州厳真。政噴属要式優芸花季件童門。議南補太権場検数場爺極東年了目物。背園鬱購事園税応駐小雪京安字。標事何式間若学投対継細一数常遣疑住場通。 2 | 魚申天旅連社今覚返車交社聞変。決結載面共対開全京誌約森気経報覧失日。住始問願芋火提法決断名身舞込最。廃図万読聞禁天野著技政来閲署心保見。王最通光期行米表視無校更科楽底衆円。但出感考加出不必敬奈族勢件陣市開意軽役済。市重森先式交図更再験成以極広注。理七垂上尾総信会名海宝海開日情村。無規代両明声先想内追書謙代。 3 | 捕点改現立野夫読郵育典提要龍毎権軽。性山陣禁欠近秒米長天号連非世。知張書未羊通嗅第和半教前程挑香。不並配恐著決木予家記容容日達株。露急置上趣用選魅住船座陽予共代去相能。左止朝士猶環部時書間切現沢苦案開子決個挿。極極値年昔友祉写原軽毎月現読勝説訴定件圧。評要園任治撤競記飯康下捕意登育読検地倒文。 4 | 対構哨武港載中群読病載音。聞国知売日夕処組眠木手要。録視聞早神転帯政五禁安入写出祉覧古片全。海形健読成安設陽権者男座打成素議。義賞堀発政表養効事吉題前造向冬歓立。初大転目政害定育教育日田千岩露。問上件者晴男件出代新占登祭部的学選。中著万点層愛綺載削的止党設却文蒼。式委現析会司泳厳期逆経長。米内却調暮夜大討防備品発問。 5 | 可著裏府暮才夕済断塁意証大。江終旅取方中協広指杯罪真松平開。段源有読運問載込表育日望板南容数索蹴。在急動件源辺永内原申面見語幸未真。毎優医負究対無朝止柳式旗亭祈索稿立営。劇子稿今嫁今虫年館能東球見際法名政。司供特媛方相銃望評覧能音台旅。売健誠系民半丸能橋暮堂田合際製経記題関。選南基合録止飾立連役見催五第連代。 6 | 属玲全支開手部衝関優亭国変問社環裏。転指育娘狂第権提連準前乗男連計実。力銀残住面告進更暮平護胸面係相補枝。憲舞対限容手健福年局月券談堀。島量極区明説芸豊故勝例部初都共者場。子成戦籍火北保際徳各一判交。志言香察凡飯模崎高色盤研促周問罪評噂盟歩。吾済会断全決変必重前著左検和家空境講。省特場薬受天面掲直転技部八。 7 | 士杏評公広第診謙学供氏査開式犠索攻趣義。送度毎省源場読乗選球合門表試毎再記入風嶋。月府躍針治宕号告式切試阪字課比市互速。故話紹谷位文期載水五答稿案埼安情。倉応記子初地行宏果後検祭明国訳。至増綾育除魚時外亭新価択。治官録容心国囲動止氏公券効進月句級宣論全。行野接重身妖話作済帳通際告棋。取援業症載像多文下表計時輸化車属。 8 | 雄禁読放訳済覧押悪身象真進体。文臣方事占刺俳立堂止回護公碁質。局民返自必面幻座職必質弾標陽渡計大府。休売担記問空川貯則章阻表者郡森編。化乗感想開私湖押面杯訴多怖。緊物文表野図情給端件置属真能。世参文域挙患甲作際辺強赤懇者部。五決政然児能本原表第独利区搬解。入論日講塾条満働大都別話。約載副得実後必学里場四幅芸弁示北高達。 9 | 将原放今恵該設島世稿代社。射第法件金内良無政毎城者佐見所代真弁。棋稿稿工申径走芸組合重泉山字万。頑男無拍超高童宅校全風新川。県場告国好都治中成担未為経要京切王世兵組。所業雄乗問巨図報無渡分写通論踊味転測引。歴許名績断左都席示学小将細。療他繕原謝保選域淡含誤保誤戦究択圧意。者文強弟更権浮地製市任竹斉内童殖死課。 10 | 園転掲連経取奥再異助事打人鮎。館聘掲入阪整太内也与先青並。選神情恒済級郵仁紹争施新近他。情打引治欧治型京鋭見池提象提地意読稿。応就説応判仕狙本所由影要軽記続。行置業載役世仁四界骨際桑格極再院小載再燃。義頼白列紹半社選奪試市新醸。成雲川法住下経乗康参日迎力食超施変挙朝民。聞葉性経好玲学体返渡定状容無急大代職翌味。 -------------------------------------------------------------------------------- /corpora/french.txt: -------------------------------------------------------------------------------- 1 | At hâs putànt tîmèam. 
No èssé mêis mèî, tê vïde éuismod sapërét eœs. Né qûï tàle cotidîequê, congue definïebàs çompléctîtur te qùo. Ea méà énim docendi praésent, no probo oblïqûe çœncèptam mea. Ut tàmqùam principes qûi, sàdipscing cônsectétuer çu vim. Vix në sïmul nemôrê torquâtos, stët facete përîculâ eà cum. 2 | Façëte offîçïïs atomœrùm éu prî, at tritàni delenit prô. Qùi volutpàt tîncîdunt et, îus èlît iusto œratîœ ân. Côngûê plâcerat similique sea èt, sîmùl albuçïus în pro. Ei prôbo concéptam rêprimiqùè qui, éu labôre possit his, të ius bruté vérïtus. Vivèndo tràctatos per cu, quîs œportéat ràtiônîbus êx vel. 3 | Usù lobœrtîs rectêqùè an. Pro îd primâ apêriri tractâtos, dicït salutatus mei an. Dissêntiet reprêhendunt his at, mea môvët adipisci éu, quàndô dïctas quo ex. Cû ludûs dôlôrum advêrsârium méî, in eum nèmoré impèdit, prïncîpes urbanitas vœluptatibus sea no. Eù prô mandamus périçùla mâïéstâtis. Ad çlita indôctum mel, ûrbanitas omittàntur vis nœ. Iùs ut tamquam appareat sçrîpsêrit, àn hàs tïncidunt scrïptorém, offïçîis gubergrén nëcéssitàtibùs seâ at. 4 | Dœcéndi ëlïgëndi èi séd. Véniâm nœluissé îd mël. Ad fâllî iudico sit. Facer qûodsi in sit, méi facétë similique at, his vëniam dicûnt menandri tè. Qùândo ëfficïàntur ëî mèà. 5 | Ea méis postéa graèço mei, cu stet cîbœ tamquam ius, îd pro porro assuéverit. Eum irîùre probatus id. Sed éû laùdém aetérno, âgàm omnêsque nêcessïtatibus çû ëœs, vix ex probœ mëliore. Dolœrê philosophia ët cum. No nam nèmœre nonumes. 6 | Alïi verear quî te, ridêns cœpiosaè detraxit an ius. Ad omnës ëleîfênd his, nonumés laborâmus usu et. Pertinax reprîmiqùé eà mel, êx mél quot fâbulas èpîçuri. Eros ludus învîdùnt èum nè, mùtât modus vis et. Sea ât dôctùs dispùtàtiôni, pro ad brute simul tràctâtœs, est cëtèro intéllégàm ïd. Adhuç mundï quî èi, éu veniam aperiam ïùs. 7 | Lùciliûs volùtpat mél in, legeré opœrtèàt sea ne, mêl illud rèqûê labîtur ùt. Ad altêrum vulputate nâm, séd ëripuit lâborês nô. Pri choro dôlorum cu, ut mea îûdîco cetèro cônvêniré. 
Muçiûs optiôn rèfôrmîdans ad ést, ât mél dœming qualîsqué, nec ôdio molëstîe vïtupératœribus ne. 8 | Solét fuïssét êx mêî, èï nobîs fâçilisis élaborarët ést, ut mël tantas âdvérsarium. Ne unum vœcibus prœdessêt vim, in hïs hàrum traçtatos corrûmpit. Utinam latine maiorum id duœ, eligéndi vitùperâtà disputationi no est. Të élit apériàm séd, pertïnacia omîttàntur et èst. Stet vïdît lobortis has tê, ëi nàm commùné iràcùndiâ phïlosœphîa. 9 | Vis summô êvértitur id, nàtùm fàcer percîpitur ne séa. Eàm promptâ éxpëtendâ nê, hàrum tibîqué sèâ ut. Meîs facîlisi an quî, cû latine àccusatà assuèverit mei. Doctus habémùs ne vél, mèa ét vitàé nostrud appareat, at duis çommodo hïs. In êam quôd malorûm façilïs. Ex vis essé modus çôrrumpit, çum èâ rêpudiandâé signiferumqué, nam nœ réqùè inânï dëlicata. 10 | Vis id œmnïs mâzim, in sït sàpèrét debîtis, qûâestio prodësset tè vîm. An sôléât àdipiscing meî. Sea àn dêtracto repùdiare, id dolorèm placeràt vîtûpêràtoribus sëd. Hàs cœrporâ çœncéptâm no. -------------------------------------------------------------------------------- /corpora/georgian.txt: -------------------------------------------------------------------------------- 1 | ლორემ იფსუმ დოლორ სით ამეთ, ფერრი ცოფიოსაე სედ ეუ, მოლესთიე ფერფეთუა ეუ ვის. მალორუმ ვირთუთე ინ ყუი, ყუო ეა ორნათუს ვითუფერათა. ევერთი ინერმის ან ესთ. აუთემ ნულლამ დისსენთიას ად ეუმ. თემფორ ანიმალ რეფრიმიყუე ინ ჰას, უსუ ფრიმის ელეცთრამ თე, აუთემ აუდიამ ეამ უთ. 2 | ათ მელიორე მოლესთიე დელიცათისსიმი ფრო, ვივენდუმ ინვენირე ეოს ნო. ჰის ეთ ცეთერო დისფუთათიონი, ჰის იდ ლეგერე ლაორეეთ დისფუთანდო, ყუი ეა თათიონ მუნერე ფოსთეა. იუს ეფიცურეი ფაცილისის ომითთანთურ ცუ, ერანთ თრაცთათოს ვის ეი. ურბანითას ინცორრუფთე სით ან. 3 | ნე ომნეს სემფერ ყუო. ეი ეამ მოვეთ ვენიამ არგუმენთუმ, ვერეარ თორყუათოს ელოყუენთიამ ფრი თე. ათ ირაცუნდია გუბერგრენ ფრო, ელით სფლენდიდე ად უსუ. დიცით ლაორეეთ ეხ ეოს, ეამ ათ ირიურე ფეთენთიუმ იმფერდიეთ, ეიუს ფრიმის რეფორმიდანს ვიმ თე. სუმო ნოვუმ მეი ნე. ან ეამ ალიენუმ რეცთეყუე მოლესთიაე. 
4 | მეი ნოვუმ სიმულ ეუ, დიცთა ფუთენთ ჰას ეთ. ყუი მალის ლეგერე ეთ. ჰის ცუ ინანი უბიყუე გრაეცე. ინიმიცუს ცონსეცთეთუერ ფერ ეხ, უთ ჯუსთო ფოსსე ფერ. ეა ჰას ინვიდუნთ სცრიფსერით, მეა ფაცერ ფოსსით ეთ. ფრო ათ თორყუათოს ადვერსარიუმ, ვიმ ცუ ნოვუმ ალიყუანდო. 5 | მეი ლაუდემ ყუაესთიო ინცორრუფთე ან. ალიყუიფ რეფორმიდანს ველ თე, ნო ვის ნიჰილ გრაეცი დომინგ, იდ ფოსსით თიმეამ ფერფეთუა ველ. ჰარუმ აეთერნო ფერსეყუერის ეხ ნეც. ცორრუმფით დეფინითიონეს თე ყუი, მუნერე დელიცათისსიმი ეთ ცუმ. მელ ეთ სოლეთ თანთას ინციდერინთ. 6 | ინ ლობორთის ოცურრერეთ ველ, ნო თალე სუმო მუთათ ფერ. ელეცთრამ სიმილიყუე ფრი ან. ეა დიამ აცცუსამუს ფრი. ინ ანიმალ ვულფუთათე ნეც. უთ ფერრი ფოსთეა ვის. 7 | მელ ინ იუდიცო ყუაეყუე მინიმუმ, ად ფუგით დეცორე ლეგიმუს დუო. იუსთო ფერსეცუთი ეთ იუს, ველ თაციმათეს თინციდუნთ მედიოცრითათემ უთ. ვივენდო დელიცათა ცოთიდიეყუე ფერ ცუ. ყუემ ასსუევერით ნეგლეგენთურ ვიმ ეა, მაზიმ რეცუსაბო ნეცესსითათიბუს ფერ ეხ, ეა მეა მეის მუნერე. ნო დუო დიცთა ვენიამ ინერმის. 8 | დიცამ სალუთათუს იდ უსუ, ნეც ნო დენიყუე ელაბორარეთ აფფელლანთურ. ნეც ომნესყუე ცოთიდიეყუე ცომფლეცთითურ ნო. ინ ფერ რებუმ ფერთინახ, ნონუმეს ჰაბემუს ათ ეუმ. ჯუსთო ნოსთრუდ დისფუთანდო ნო ნამ. 9 | ცუ ლაუდემ ვოლუფთათიბუს იუს. ელით ცონთენთიონეს ველ ეი, ინ ნეც ვენიამ ნონუმy. თე სით ულლუმ დესერუისსე არგუმენთუმ. ან სთეთ ოფორთერე ესთ, ფერრი ფორრო დებითის ეუ ცუმ. ეროს ოფთიონ ეხ ფრო, მელ ათ ინანი ინერმის ვივენდუმ. 10 | იდ ვიმ ილლუდ ინთელლეგებათ. ეა ჰის ცეთეროს ვოლუფთათუმ დისფუთათიონი. თე ფრობათუს თემფორიბუს ინთელლეგებათ ნეც. იდ მელ ამეთ სცაევოლა. ყუოდსი ყუაესთიო სფლენდიდე ყუო იდ, ათ უსუ სალე ინთერესსეთ ცომფრეჰენსამ. იდ ეამ დიცამ დელენითი, თე მეა ფორრო ნულლა, ყუო ალიი ენიმ ფრიმა ათ. ნეც ფუთენთ ფეუგიათ ეა, თე მედიოცრემ ფერსეყუერის ვიმ. -------------------------------------------------------------------------------- /corpora/greek.txt: -------------------------------------------------------------------------------- 1 | Λορεμ ιπσθμ δολορ σιτ αμετ, ιλλθμ φορενσιβθσ εα σιτ. Ηαβεο περιcθλα εφφιcιαντθρ cθ vιμ. 
Περπετθα αδολεσcενσ εθμ θτ, vιμ οδιο δολορεμ ομνεσqθε αν. Εστ cοπιοσαε λιβεραvισσε νε, ιδ μει ελιτ φεθγιατ μεδιοcρεμ. 2 | Ναμ ρεqθε ιμπεδιτ πρινcιπεσ αδ, ατ ναμ εραντ μθνερε ρεγιονε. Ει προ βλανδιτ φαcιλισι αccθσαμθσ, τε εαμ σαλε παρτεμ απεριαμ, εα πριμα ανιμαλ vιμ. Εοσ ετ διcο ομνισ jθστο, εξ ηασ ελιτ μελιθσ vιvενδο. Σεδ ιγνοτα cοντεντιονεσ εθ, νε ιλλθδ ποσσε εστ. Εξ μελ cλιτα ομνιθμ vολθτπατ, μεα ταcιματεσ ελαβοραρετ αδ. Εξερcι εξπετενδα ει μεα, εα ηασ περσιθσ cοτιδιεqθε. 3 | Πθρτο δεσερθντ εvερτιτθρ cθμ cθ. Cλιτα ιθδιcο qθιδαμ νεc ιδ. Μελ νθλλαμ cονcεπταμ ει, ατ προ αδιπισcι ρεπθδιαρε cονστιτθτο. Γραεcε δεβιτισ γλοριατθρ νεc ιδ. Δθο ετιαμ ρεφερρεντθρ τε. 4 | Εοσ ιν οπορτερε σπλενδιδε. Εραντ δομινγ cομπρεηενσαμ εθ σεδ, ιμπετθσ νθσqθαμ ιντερπρεταρισ ετ ηασ, περ αβηορρεαντ μοδερατιθσ νε. Προ πθρτο vιρισ δολορε τε, οπτιον vερτερεμ αδολεσcενσ ιθσ νο. Νο cθμ σθασ δεβιτισ τραcτατοσ, πλαcερατ φαcιλισισ cθ μελ, cασε ιλλθμ θρβανιτασ ετ ιθσ. Απεριρι σcριβεντθρ vελ ιν, νεc αθγθε λαβιτθρ εα. 5 | Cθ πριμα φθισσετ τινcιδθντ περ, vιμ cθ ινανι ελαβοραρετ. Ατ δθο νισλ ηομερο. Νο διcιτ εσσεντ ελειφενδ εθμ, ετ διcτα ιντερπρεταρισ ηισ. Προδεσσετ μνεσαρcηθμ μεδιοcριτατεμ σεα εα, γραεcε φαβθλασ πρι ει. Vιμ αδ vιδε πηιλοσοπηια. 6 | Εστ νε ερατ ριδενσ, προ cθ πθτεντ σεντεντιαε. Ατqθι φαcιλισ ιντελλεγατ αδ ηισ, qθο σθμμο vολθμθσ περσεqθερισ εξ. Μει ιμπεδιτ τηεοπηραστθσ cθ. Ατ αδμοδθμ δενιqθε ιθσ, αν σαεπε προπριαε ιντελλεγατ δθο, ηισ εθ αccθσατα πετεντιθμ. Ιδ σιτ μθνερε ινcορρθπτε. Ποστθλαντ ινcιδεριντ νεc αδ, λαθδεμ cονσθλατθ ιμπερδιετ vιμ τε, vιvενδθμ qθαλισqθε vολθπταρια θτ ναμ. 7 | Εθ vερι ιντερεσσετ vισ. Μοδο βρθτε ιισqθε θτ vισ. Μελ ιδ δεβετ vιτθπερατα ιντερπρεταρισ, μει εροσ qθιδαμ cομπλεcτιτθρ νε, ατ vιταε μαιορθμ cονσεcτετθερ νεc. Ιν δολορθμ φαcιλισισ εοσ, εξ ενιμ λαβοραμθσ ηισ, δοcτθσ απειριαν σεντεντιαε qθο εθ. Σονετ δελιcατισσιμι εοσ νο, σονετ vιταε νο σεα. Εξ σεδ φαcιλισι cορρθμπιτ. 8 | Πριμα τηεοπηραστθσ εα ιθσ. Ατ vελ ατqθι vιδισσε ινvιδθντ. Εαμ cαθσαε οπτιον ελιγενδι αν. 
Ετ εριπθιτ νομινατι ελαβοραρετ εαμ, τιμεαμ λεγιμθσ πατριοqθε δθο αδ, ετ δθο σενσιβθσ τεμποριβθσ περσεqθερισ. 9 | Αν vιμ οδιο ταλε, σεδ τε ιδqθε τεμποριβθσ. Ηινc γραεcι ασσθεvεριτ μει εθ, vελ ιν περσιθσ ινερμισ περτιναξ, αθτεμ τολλιτ μολεστιαε δθο ατ. Σεντεντιαε τεμποριβθσ ει σεδ, qθι νο μινιμ δολορ qθανδο. Θτ ηισ cονσθλ φθισσετ, vολθμθσ cονσθλατθ σcριβεντθρ vισ αδ. Ειρμοδ ομνιθμ σενσιβθσ ηισ νε, εαμ περσιθσ αντιοπαμ περιcθλισ εα, vισ cασε σολθμ ιντελλεγατ ατ. 10 | Ει τοτα ρεπριμιqθε δελιcατισσιμι vιξ, εξπετενδα vθλπθτατε τινcιδθντ νεc αδ, ιλλθμ σcριπτα περ θτ. Απεριαμ εριπθιτ πρι ει, ετ εοσ πθταντ εqθιδεμ vθλπθτατε. Νεc διcθντ λατινε δεσερθισσε εθ, cθ εαμ μοδθσ qθοδσι ινvιδθντ. Νο μελ μοδθσ cετεροσ νεcεσσιτατιβθσ, πριμα σονετ απειριαν εθμ εξ. Ελιγενδι τηεοπηραστθσ νε qθι, διcατ τεμπορ λαορεετ μελ εα, εα σcριπτα δοcενδι vολθπτθα σιτ. Νο σαλε λθδθσ αππελλαντθρ. -------------------------------------------------------------------------------- /corpora/hebrew.txt: -------------------------------------------------------------------------------- 1 | זאת ולחבר מדריכים את. עוד הגרפים ייִדיש גם, או שמו הרוח חינוך אספרנטו, תנך ספרדית תיקונים ביולוגיה ב. שתי גם אחרים קלאסיים בויקיפדיה, או שתי עסקים תרומה חבריכם. כלל ראשי בחירות לעריכה של, ב מוגש יכול המזנון כתב. החול בחירות בדף על, בה הטבע בקלות האטמוספירה צעד. 2 | רבה מדריכים סוציולוגיה או. גם בקלות מיזמים מועמדים קרן, גם אחר מושגי ומדעים רב־לשוני. ב מפתח זכויות התפתחות מדע, תורת רביעי גם זכר. היום לציין או כתב, שאלות והנדסה בה אנא, את דפים עיצוב כתב. אם שנורו רב־לשוני בקר, על הטבע לחיבור סדר, או כלל מוסיקה הסביבה. 3 | ובמתן והנדסה על תנך, כלים בקרבת מונחים קרן אל, חפש נבחרים אירועים פוליטיקה או. על כדי ערכים לחיבור שינויים. שער בה ברית כלשהו אנגלית. ננקטת וקשקש רבה של, או החלל מיתולוגיה שער, תנך או רוסית באגים שימושיים. 4 | זאת בה תיבת יסוד לערוך, גרמנית קישורים האנציקלופדיה מה אחר, ב נפלו העזרה אנא. והוא התפתחות ויקיפדיה בה אתה. תיבת שאלות טבלאות עזה דת. אל מתן מדעי ניווט ביולי. 
או רבה אירועים אינטרנט האטמוספירה, פיסול אודות ב כלל. 5 | או ברית דרכה מתן. כלל קהילה ומהימנה על, צ'ט אל הבהרה בקלות הספרות. אחד את החלל סטטיסטיקה, כלשהו טכניים עוד אם, היא אל מונחונים האטמוספירה. אל ציור עסקים בעברית זאת. 6 | תנך או בידור קודמות. אל בקר אחרות פולנית. ביולי ספינות שמו את, בקר שונה ומהימנה ופיתוחה אם. פיסול ערכים המשפט אחר בה. 7 | או מדעי תורת ויקימדיה תנך. עזה אל זקוק ביוטכנולוגיה, על כדור רוסית לחשבון תנך. זכר דת מיזמי ספורט בהשחתה, מה דפים מושגי אדריכלות אחר. אחר היום בגרסה את. על מדעי מדריכים מתן, רביעי לערכים מה סדר. זכר גם זכויות אקטואליה אתנולוגיה. 8 | צ'ט הטבע יכול מושגי של, אל לוח שתפו הראשי. על הסביבה מיוחדים שכל, דת שמו יכול ואמנות. מיזם לעריכת קרימינולוגיה את רבה, רבה ב הגולשות רב־לשוני. את כיצד גרמנית למאמרים רבה. 9 | שכל ב ביוני התפתחות תאולוגיה. על עזרה ותשובות כתב, בהבנה בויקיפדיה כדי בה, קרן בקרבת תקשורת או. שאלות קישורים אתה ב, בה רבה מיזם לימודים. קלאסיים טכנולוגיה מדע על. שמו לחשבון הקנאים סוציולוגיה גם. ניווט משחקים מדע אם. שנתי ספורט בקר גם, החברה אחרונים שימושיים קרן בה. 10 | דפים בארגז שמו או, דת עוד לערך אנציקלופדיה, כדי ארכיאולוגיה ביוטכנולוגיה מה. מפתח שמות מועמדים כלל אם. מתוך ציור לעריכת זאת של, הטבע בארגז ומהימנה או כתב. ביולי משפטית רומנית דת כתב, גם לוח פנאי התפתחות. 
-------------------------------------------------------------------------------- /corpora/hindi.txt: -------------------------------------------------------------------------------- 1 | कैसे संसाध ज्यादा अंग्रेजी गटकउसि हमेहो। प्रसारन सक्षम पसंद ढांचा देते उदेशीत स्वतंत्रता बाटते उद्योग उपलब्धता दिये हिंदी एछित उदेशीत अनुवाद विवरन सकती सारांश सुस्पश्ट अंग्रेजी मुक्त सारांश प्रसारन हार्डवेर एकएस हैं। लेकिन आधुनिक वर्णित बारे जानते संपादक यन्त्रालय 2 | चुनने मानसिक विकसित जिसे करता। दारी बातसमय प्रेरना दस्तावेज जागरुक है।अभी अनुवाद दौरान होगा समाजो प्रसारन सम्पर्क बीसबतेबोध विषय मेमत बनाकर जागरुक लचकनहि विभाजनक्षमता उसीएक् स्वतंत्र प्रौध्योगिकी बारे तरीके आंतरजाल अमितकुमार विकेन्द्रित खरिदे हीकम यधपि करता। हमेहो। केन्द्रिय व्याख्यान मानसिक बिन्दुओमे अर्थपुर्ण सकती बनाकर सके। उपेक्ष प्रोत्साहित माहितीवानीज्य चुनने संस्था शारिरिक सोफ़्टवेर तरीके देते चिदंश मुखय दिनांक व्यवहार 3 | माध्यम दोषसके बातसमय पसंद वर्णन उन्हे वेबजाल हुएआदि प्रोत्साहित लेने हमारी ऎसाजीस प्राधिकरन पढने पहोच हुआआदी हुएआदि ध्वनि वर्तमान केन्द्रिय पुष्टिकर्ता एवम् पहेला सदस्य आशाआपस स्थापित ७हल बाटते भारतीय हमेहो। क्षमता। मेमत समस्याओ व्याख्या प्रति जाता वैश्विक सुस्पश्ट विभाग वहहर लक्षण सुना प्रमान मेंभटृ विश्लेषण किया चिदंश वैश्विक वास्तविक मुश्किले 4 | लेने बनाना खरिदने पत्रिका कम्प्युटर सुचना नीचे विनिमय सोफ़तवेर निरपेक्ष हुआआदी व्याख्यान सहयोग अधिक उपलब्ध संपुर्ण अंग्रेजी बनाने दिये पहोच। जैसी किएलोग स्वतंत्र बीसबतेबोध दारी विकास खयालात विभाजन 5 | असक्षम प्राथमिक वर्णन बारे संस्था सभिसमज मुश्किले देते दौरान भोगोलिक केन्द्रित लेकिन कैसे कीने विशेष वर्णित अधिकांश ध्येय काम उद्योग निर्देश प्रतिबध हमारि सकते प्रव्रुति पुष्टिकर्ता विवरन भाषए पत्रिका विकेन्द्रियकरण 6 | लक्षण पासपाई प्रसारन रखते उनका आवश्यक आंतरजाल अर्थपुर्ण खरिदे बढाता चाहे सभिसमज दस्तावेज स्वतंत्रता सहयोग कीने प्रेरना मुश्किले लगती प्रेरना लाभान्वित करके(विशेष हिंदी वर्णन विनिमय प्रेरना जानकारी हीकम देने बलवान संपादक परिभाषित मुक्त २४भि ऎसाजीस तकनीकी अधिकांश कराना आपके परस्पर वातावरण सार्वजनिक समस्याए संपुर्ण विषय लेकिन विवरन तरहथा। 
हमेहो। अधिक लगती पहेला करेसाथ वैश्विक प्रति उपलब्धता विभाजनक्षमता कोहम वर्ष विश्व सम्पर्क बीसबतेबोध जिम्मे प्रव्रुति प्राधिकरन लेकिन रचना सामूहिक ढांचामात्रुभाषा बाजार उशकी 7 | एसेएवं समस्याओ लाभान्वित हैं। मुक्त डाले। किके सहायता विस्तरणक्षमता हीकम ब्रौशर ज्यादा मर्यादित प्राण वेबजाल सभीकुछ एकत्रित सेऔर विकास प्रति विषय उपेक्ष नयेलिए वर्तमान कम्प्युटर पुर्व अथवा देखने अपनि अधिकार परस्पर वास्तविक सादगि मानव असक्षम दिनांक सिद्धांत प्राधिकरन बढाता होभर प्राण लक्ष्य बिन्दुओ सुस्पश्ट सकती 8 | विनिमय कम्प्युटर वार्तालाप थातक खरिदे बहुत जानते सामूहिक निर्माता विभाजनक्षमता चाहे सारांश निरपेक्ष सीमित अथवा भाषाओ गुजरना प्राथमिक अधिकांश अन्तरराष्ट्रीयकरन उनके बनाने अत्यंत कार्यलय रचना उन्हे विषय हमारी दिशामे वेबजाल कार्यकर्ता देने हुआआदी सभिसमज बनाकर गएआप नयेलिए दर्शाता समस्याओ संदेश 9 | बाटते ब्रौशर विभाजन विकसित खण्ड मुक्त खयालात उद्योग काम विवरण अधिकार हमेहो। प्रोत्साहित पडता वातावरण समस्याओ विकेन्द्रित लिए। समजते बहुत आधुनिक अर्थपुर्ण हिंदी आवश्यक प्रदान स्वतंत्रता विकास सके। होभर चुनने मानसिक वातावरण विज्ञान किया पत्रिका मुश्किल समस्याए हिंदी 10 | सुचना आपको विज्ञान समस्याओ बेंगलूर एछित डाले। कार्यकर्ता अमितकुमार जानते सुविधा कार्यकर्ता करके(विशेष होभर एकत्रित व्याख्या देने कार्य सादगि जिम्मे वास्तव व्रुद्धि भाषा २४भि उपेक्ष तकनिकल करके(विशेष उन्हे करती परिभाषित माहितीवानीज्य बढाता खयालात एसलिये सुना जानते विवरन दोषसके सोफ़्टवेर सिद्धांत करके दोषसके आवश्यकत व्याख्यान अपनि सुचना जिम्मे आवश्यकत आंतरकार्यक्षमता बाधा चिदंश -------------------------------------------------------------------------------- /corpora/l33t.txt: -------------------------------------------------------------------------------- 1 | 5IM1LaR c0mm4ndz h4D 1T. 1n70 kl1k be 4r3. 45 n33d 1nDeX3D, 4u70m471(4lly y3r. N33d miL4R 83 70p, d@ w1ll w1+hOUT INt3r35+3d n0w. +o joo m0r3 4v41|4b|3, Up c|1ck d0(um3n7 d159l4y3d, 73h, 4|| t0 w1tH (4(h3d 51m1l4r. W17h @n33 f00l! 93t y0, 94g3 WI5h h4D da. 2 | |235u|7z 4v41|4b|3 k@n d@. @R3 da 250m \/\/3b 7|24n5|4735. @$ KWIckLy, 1nd1c473z0r w3b. Vve +H@T wh1(h c@ch3d be. 
0vvn3r z3aRc|-| 1T f4q. 7he p@93 p1cz! 4z. 3 | K@n |4unch f34tUr3 f337u|23, 0f. M1t3 b33n d0 0u7, @$ iTz f0|2 r35ul7. 4u70m471(4lly. Be @R3 f00l! De@l. |-|@v3, d@ 4r3 vv3b p|20g|24m, 5|\|4p5)-(07. D0 5peNDInG KWIckLy, 4nd. D0 fOr h7m| Wh3|\| L0Ok1NG, 3nT3R Ph13LD, Up y3r. 4 | 0n d1z b4(| d33z, CaN p1>< 5(0u7s |247h3r d0, p4g3, INt3r35+3d m4y 1T. TH4T n33d w17h +h3 83, be @R3 F1ND p49E$ pr3f3|23nc35, d3n MOr3 M4NY qu3ry 17. Why 7h15 r35u|7 0f, @$ 7he p@93 p4g3, |235u|7z, 83 t3H p4g3 f|20n7. 0f73n @8ou+ g3t 83, 47 1iNx v3|2510n pdf, p1x 83 qu3ry |\|0t 717|3z. 70 joo P4935 r35u|7z, 4rE U5Ed m0r3 g00g13 iN, d@ kl1x 73rm5 z3aRcH, c4n. 0t|-|3r k0pYr1t3d t3H d@, 717|3z r35ul7. kvv3r33, 17 f0r. 5 | 73x7 l1nx iz 937, +hO53 m1-|-3 0t|-|3r pdf 1+, LINk mIGH+ 94g3s CaN y4. Be pdf d33z l1nx c|1ck, @R n0n-3N9l1sh p4|271cu|4|2 f4q. D1z HAV3 v3r510|\| (ra\/\/1z !=. 0R w3b wh1(h d0wn|04d, @$ 93t M155In9 tHUm841|_, 4r3 0f 4cc355 4lvvAyz tHUm841|_. Da NUMbER 1nd1c473z0r 0u7, 17 h4D 5IM1LaR d0wn|04d. WI5h 7h4n 93t u5, @R3 @$ wIlL NUMbER f1|\|d. 6 | Up 7he 4|50 m47(h z3aRc|-|, 7h3y (4(h3d k0pYr1t3d c@N b3, 7h47 tR4nz|_4t10n @nD 1n. 5O f0r p4g3 h4v3 0f73n. +3's yOU, d0(um3n7 1+ fOr. IN 937 f|20n7 0t|-|3r zp33k3rz, 0u7 Th@t 4b0u7 c0mpu73|2, 83. 0n d0n't 74|<3z v3|2510n H45, 7h3 De@l. 0p710n 17. B33n w17h p@g3z y3r b3, alz0 |21gh7 3n4b|3 kUm 0n. 7 | N0n 1+ Th@t phr0|\/|, 937 15 |4unch 53rv3s |_@n9U493. 1F u\/ c|1ck 0vvn3r, d@t, f00l! 0p710n BuT y@. IN @nD De@l. 51m1l4r. D4 8Ut 94g3s |_@n9U493, No+ be 94g3 w4nN@, 1iNx w1+hOUT 1F w1t. IN +hO53 c0mpu73|2, iTz, +H@T v3ry c0nT3nT, 73h 0n. 1T z3aRcH k4cH3d k0pYr1t3d 4nd, |1nk 4lvvAyz 1T fOr, 45 BuT 51m1l4r r3l473d. 8 | Iz 4r3 73rm5 534|2ch KeYW0rD5, n33d n0n-3N9l1sh 17 +HE, 83 pHinD 5It35 4r3. PHor d159l4y3d, @$ p1x, l3tz f4m1|14r 1nDeX3D, why |7. N33d 34513r d3n y4, M155In9 c0nT3nT, 1T f0r. (0py d33z 5O c4n, 4r3 v3r510n tR@nz|_4t3d 4u70m471c4||y 47. 9 | 4nd vv1t 0f73n |_@n9U493 d0. 1F @R3 p@g3 wIlL, pdf b3 HELp m47(h. F4q h7m| l1nx r35u|7z 0R. 
struct CsvBuilder:
    """Builds CSV text cell by cell in a manually managed byte buffer.

    Cells are appended with `push`. A comma is written before every cell that
    continues a row and CR LF before every cell that starts a new row, where a
    row holds `_column_count` cells. `finish` pads the last row, terminates it
    with CR LF and returns the accumulated text.

    `_column_count` is used as a modulus in `push` and `fill_up_row`, so it
    must not be zero.
    """

    var _buffer: UnsafePointer[UInt8]
    var _capacity: Int
    var num_bytes: Int
    var _column_count: Int
    var _elements_count: Int
    var _finished: Bool

    fn __init__(out self, column_count: Int):
        """Creates an empty builder whose rows hold `column_count` cells."""
        self._capacity = 1024
        self._buffer = UnsafePointer[UInt8].alloc(self._capacity)
        self._column_count = column_count
        self._elements_count = 0
        self._finished = False
        self.num_bytes = 0

    fn __init__(out self, *coulmn_names: StaticString):
        """Creates a builder and pushes the given names as the first cells.

        The number of names becomes the column count.
        """
        self._capacity = 1024
        self._buffer = UnsafePointer[UInt8].alloc(self._capacity)
        self._elements_count = 0
        self._finished = False
        self.num_bytes = 0

        var column_name_list: VariadicList[StaticString] = coulmn_names
        self._column_count = len(column_name_list)
        for i in range(len(column_name_list)):
            self.push(coulmn_names[i])

    fn __del__(owned self):
        """Frees the buffer unless `finish` already handed it over."""
        # NOTE(review): this assumes the String returned by `finish` owns the
        # buffer; if `String(unsafe_from_utf8_ptr=...)` copies, the buffer
        # leaks after `finish` - confirm against the String API in use.
        if not self._finished:
            self._buffer.free()

    fn push[S: Stringable](mut self, value: S, consider_escaping: Bool = False):
        """Appends `String(value)` as the next cell; escaping is off by default."""
        self.push(String(value), consider_escaping)

    fn push_empty(mut self):
        """Appends an empty cell."""
        self.push("", False)

    fn fill_up_row(mut self):
        """Pads the current row with empty cells until it is complete.

        Does nothing when the row is already complete, which includes the
        case where nothing has been pushed yet.
        """
        var num_empty = self._column_count - (self._elements_count % self._column_count)
        if num_empty < self._column_count:
            for _ in range(num_empty):
                self.push_empty()

    fn push(mut self, s: String, consider_escaping: Bool = True):
        """Appends `s` as the next cell.

        When `consider_escaping` is set and `s` contains CR, LF, a comma or a
        double quote, the cell is wrapped in double quotes with inner quotes
        doubled before it is written.
        """
        if consider_escaping and contains_any_of(
            s, CR_CHAR, LF_CHAR, COMMA_CHAR, QUOTE_CHAR
        ):
            return self.push(QUOTE_CHAR + escape_quotes_in(s) + QUOTE_CHAR, False)

        var size = len(s)
        # Up to two separator bytes (CR LF) precede the cell.
        self._extend_buffer_if_needed(size + 2)
        if self._elements_count > 0:
            if self._elements_count % self._column_count == 0:
                self._buffer.offset(self.num_bytes).store(CR)
                self._buffer.offset(self.num_bytes + 1).store(LF)
                self.num_bytes += 2
            else:
                self._buffer.offset(self.num_bytes).store(COMMA)
                self.num_bytes += 1

        memcpy(self._buffer.offset(self.num_bytes), s.unsafe_ptr(), size)

        self.num_bytes += size
        self._elements_count += 1

    @always_inline
    fn _extend_buffer_if_needed(mut self, size: Int):
        """Grows the buffer by doubling until `size` more bytes fit."""
        if self.num_bytes + size < self._capacity:
            return
        var new_size = self._capacity
        while new_size < self.num_bytes + size:
            new_size *= 2
        var p = UnsafePointer[UInt8].alloc(new_size)
        memcpy(p, self._buffer, self.num_bytes)
        self._buffer.free()
        self._capacity = new_size
        self._buffer = p

    fn finish(owned self) -> String:
        """Pads the last row, appends CR LF and returns the CSV text."""
        self._finished = True
        self.fill_up_row()
        # Three more bytes are written below: CR, LF and the zero terminator
        # that `string_from_pointer` stores at `length - 1`. `push` only
        # reserves room for the cell it appends and can leave the buffer
        # exactly full, so the space has to be reserved here.
        self._extend_buffer_if_needed(3)
        self._buffer.offset(self.num_bytes).store(CR)
        self._buffer.offset(self.num_bytes + 1).store(LF)
        self.num_bytes += 3
        return string_from_pointer(self._buffer, self.num_bytes)


fn escape_quotes_in(s: String) -> String:
    """Returns `s` with every double quote doubled.

    `s` is returned unchanged when it contains no double quote.
    """
    var indices = find_indices(s, QUOTE_CHAR)
    var i_size = len(indices)
    if i_size == 0:
        return s

    var size = len(s)
    var p_current = UnsafePointer(s.unsafe_ptr())
    # One extra byte per quote plus one for the zero terminator that
    # `string_from_pointer` writes at index `size + i_size`.
    var p_result = UnsafePointer[UInt8].alloc(size + i_size + 1)
    var first_index = Int(indices[0])
    # Copy everything before the first quote, then emit the additional quote;
    # the original quote is copied as the first byte of the next segment.
    memcpy(p_result, p_current, first_index)
    p_result.offset(first_index).store(QUOTE)
    var offset = first_index + 1
    for i in range(1, len(indices)):
        var c_offset = Int(indices[i - 1])
        var length = Int(indices[i]) - c_offset
        memcpy(p_result.offset(offset), p_current.offset(c_offset), length)
        offset += length
        p_result.offset(offset).store(QUOTE)
        offset += 1

    # Tail: from the last quote (inclusive) to the end of the input.
    var last_index = Int(indices[i_size - 1])
    memcpy(p_result.offset(offset), p_current.offset(last_index), size - last_index)
    return string_from_pointer(p_result, size + i_size + 1)
struct CsvTable[sep: Int = COMMA]:
    """Indexes the fields of a CSV text so they can be read by row and column.

    Parsing records the start and end byte offset of every field. The number
    of fields in the first row becomes `column_count`, and `get` locates a
    field as `column_count * row + column`. `sep` is the separator byte.
    """

    var _inner_string: String
    var _starts: List[Int]
    var _ends: List[Int]
    var column_count: Int

    fn __init__(out self, owned s: String, with_simd: Bool = True):
        """Takes the CSV text and indexes it, with the SIMD scanner by default.

        `column_count` stays -1 when no field was indexed (empty input).
        """
        self._inner_string = s
        self._starts = List[Int](capacity=10)
        self._ends = List[Int](capacity=10)
        self.column_count = -1
        if with_simd:
            self._simd_parse()
        else:
            self._parse()

    @always_inline
    fn _parse(mut self):
        """Indexes the fields one byte at a time.

        Separators and line breaks (LF or CR LF) inside double quotes do not
        end a field. A CR that is not followed by LF is ordinary content.
        """
        var length = len(self._inner_string)
        if length == 0:
            return
        var offset = 0
        var in_double_quotes = False
        self._starts.append(offset)
        while offset < length:
            var c = Int(self._inner_string.unsafe_ptr().load[width=1](offset))
            if c == QUOTE:
                in_double_quotes = not in_double_quotes
                offset += 1
            elif not in_double_quotes and c == sep:
                self._ends.append(offset)
                offset += 1
                self._starts.append(offset)
            elif not in_double_quotes and c == LF:
                self._ends.append(offset)
                if self.column_count == -1:
                    self.column_count = len(self._ends)
                offset += 1
                self._starts.append(offset)
            elif (
                not in_double_quotes
                and c == CR
                and length > offset + 1
                and Int(self._inner_string.unsafe_ptr().load[width=1](offset + 1)) == LF
            ):
                self._ends.append(offset)
                if self.column_count == -1:
                    self.column_count = len(self._ends)
                offset += 2
                self._starts.append(offset)
            else:
                offset += 1

        if self._inner_string[length - 1] == "\n":
            # The start recorded after the final line break opens no field.
            _ = self._starts.pop()
        else:
            self._ends.append(length)
            # Input without any line break never reached the branches that
            # set the column count; the single row defines it.
            if self.column_count == -1:
                self.column_count = len(self._ends)

    @always_inline
    fn _simd_parse(mut self):
        """Indexes the fields by scanning the text in SIMD-width chunks."""
        var p = UnsafePointer(self._inner_string.unsafe_ptr())
        var string_byte_length = len(self._inner_string)
        if string_byte_length == 0:
            return
        var in_quotes = False
        var last_chunk__ends_on_cr = False
        self._starts.append(0)

        @always_inline
        @parameter
        fn find_indicies[simd_width: Int](offset: Int):
            var chars = p.load[width=simd_width](offset)
            var quotes = chars == QUOTE
            var separators = chars == sep
            var lfs = chars == LF
            var all_bits = quotes | separators | lfs
            var crs = chars == CR

            # Compact the in-chunk positions of all interesting bytes into
            # a scratch buffer so that only those positions are visited.
            var offsets = iota[DType.uint8, simd_width]()
            var sp: UnsafePointer[UInt8] = UnsafePointer[UInt8].alloc(simd_width)
            compressed_store[DType.uint8, simd_width](offsets, sp, all_bits)
            var all_len = all_bits.reduce_bit_count()

            for i in range(all_len):
                var index = Int(sp.load(i))
                if quotes[index]:
                    in_quotes = not in_quotes
                    continue
                if in_quotes:
                    continue
                var current_offset = index + offset
                # An LF directly preceded by CR ends the field one byte
                # earlier; the CR may be the last byte of the previous chunk.
                var rs_compensation: Int
                if index > 0:
                    rs_compensation = Int(lfs[index] & crs[index - 1])
                else:
                    rs_compensation = Int(lfs[index] & last_chunk__ends_on_cr)
                self._ends.append(current_offset - rs_compensation)
                self._starts.append(current_offset + 1)
                if self.column_count == -1 and lfs[index]:
                    self.column_count = len(self._ends)
            # The scratch buffer is allocated once per chunk and must be
            # released here, otherwise every chunk leaks `simd_width` bytes.
            sp.free()
            last_chunk__ends_on_cr = crs[simd_width - 1]

        vectorize[find_indicies, simd_width_u8](string_byte_length)
        if self._inner_string[string_byte_length - 1] == "\n":
            # The start recorded after the final line break opens no field.
            _ = self._starts.pop()
        else:
            self._ends.append(string_byte_length)
            # Input without any line break: the single row defines the
            # column count.
            if self.column_count == -1:
                self.column_count = len(self._ends)

    fn get(self, row: Int, column: Int) -> String:
        """Returns the field at `row` / `column`, or "" when out of range.

        A field enclosed in double quotes is returned without the enclosing
        quotes, and doubled quotes inside it are collapsed to one. If the
        inner text holds an odd number of quotes it is returned as is.
        """
        if column >= self.column_count:
            return ""

        var index = self.column_count * row + column
        if index >= len(self._ends):
            return ""

        var field_start = self._starts[index]
        var field_end = self._ends[index]
        # A quoted field needs at least the two enclosing quotes. Checking
        # the length first keeps an empty trailing field from being indexed
        # past the end of the text, and a field that is a single quote from
        # yielding a negative inner length.
        if (
            field_end - field_start >= 2
            and self._inner_string[field_start] == '"'
            and self._inner_string[field_end - 1] == '"'
        ):
            var start = field_start + 1
            var length = (field_end - 1) - start
            var p1 = UnsafePointer[UInt8].alloc(length + 1)
            memcpy(p1, UnsafePointer(self._inner_string.unsafe_ptr()).offset(start), length)
            var unquoted = string_from_pointer(p1, length + 1)
            var quote_indices = find_indices(unquoted, '"')
            var quotes_count = len(quote_indices)
            if quotes_count == 0 or quotes_count & 1 == 1:
                return unquoted

            # Assumes the quotes come as adjacent pairs (escaped quotes):
            # the first quote of every pair is dropped, the second is kept.
            var p = UnsafePointer(unquoted.unsafe_ptr())
            var length2 = length - (quotes_count >> 1)
            var p2 = UnsafePointer[UInt8].alloc(length2 + 1)
            var offset2 = 0
            memcpy(p2, p, Int(quote_indices[0]))
            offset2 += Int(quote_indices[0])

            for i in range(2, quotes_count, 2):
                var segment_start = Int(quote_indices[i - 1])
                var size = Int(quote_indices[i]) - segment_start
                memcpy(p2.offset(offset2), p.offset(segment_start), size)
                offset2 += size
            var last = Int(quote_indices[quotes_count - 1])
            memcpy(p2.offset(offset2), p.offset(last), length - last)
            return string_from_pointer(p2, length2 + 1)

        return self._inner_string[field_start:field_end]

    fn row_count(self) -> Int:
        """Returns the number of indexed field starts divided by `column_count`."""
        return len(self._starts) // self.column_count
fn occurrence_count(s: String, *c: String) -> Int:
    """Counts the bytes of `s` that equal any of the characters in `c`.

    Each entry of `c` is converted with `ord` and narrowed to one byte.
    """
    var size = len(s)
    var result = 0
    var chars = List[UInt8](capacity=len(c))
    for i in range(len(c)):
        chars.append(UInt8(ord(c[i])))
    var p = UnsafePointer(s.unsafe_ptr())

    @parameter
    fn find[simd_width: Int](offset: Int):
        @parameter
        if simd_width == 1:
            for i in range(len(chars)):
                var char = chars[i]
                if p.offset(offset).load() == char:
                    # Count the byte once, even if `c` lists it twice.
                    result += 1
                    return
        else:
            var chunk = p.load[width=simd_width](offset)

            var occurrence = SIMD[DType.bool, simd_width](False)
            for i in range(len(chars)):
                occurrence |= chunk == chars[i]
            result += occurrence.reduce_bit_count()

    vectorize[find, simd_width_i8](size)
    return result


fn contains_any_of(s: String, *c: String) -> Bool:
    """Returns True when `s` contains at least one of the characters in `c`.

    Scanning stops at the first chunk that holds a match.
    """
    var size = len(s)
    var chars = List[UInt8](capacity=len(c))

    for i in range(len(c)):
        chars.append(UInt8(ord(c[i])))
    var p = UnsafePointer(s.unsafe_ptr())
    var flag = False

    @parameter
    fn find[simd_width: Int](offset: Int) -> Bool:
        # Read at the offset supplied by the driver instead of advancing a
        # captured pointer, so the result does not depend on the order or
        # width of earlier calls.
        var chunk = p.load[width=simd_width](offset)
        for i in range(len(chars)):
            var occurrence = chunk == chars[i]
            if occurrence.reduce_or():
                flag = True
                return flag
        return False

    vectorize_and_exit[simd_width_i8, find](size)

    return flag


@always_inline
fn string_from_pointer(p: UnsafePointer[UInt8], length: Int) -> String:
    """Zero-terminates the buffer at `p` and builds a String from it.

    `length` counts the terminator: the zero byte is stored at index
    `length - 1`, so the buffer must hold at least `length` bytes and
    `length` must be at least 1.
    """
    p.store(length - 1, 0)
    return String(unsafe_from_utf8_ptr=p)


fn print_v(v: List[UInt64]):
    """Prints the element count followed by the bracketed elements of `v`."""
    print("(" + String(len(v)) + ")[")
    if len(v) == 0:
        # The loop below prints the closing bracket together with the last
        # element; an empty list has none, so close the bracket here.
        print("]")
        return
    for i in range(len(v)):
        var end = ", " if i < len(v) - 1 else "]\n"
        print(v[i], end=end)
@always_inline
fn folded_multiply(s: UInt64, by: UInt64) -> UInt64:
    """Mixes `s` and `by` with two byte-swapped 64-bit multiplications."""
    var b1 = s * byte_swap(by)
    var b2 = byte_swap(s) * (~by)
    return b1 ^ byte_swap(b2)


@always_inline
fn read_small(data: UnsafePointer[UInt8], length: Int) -> U128:
    """Packs up to eight bytes at `data` into two 64-bit lanes.

    The two reads overlap for lengths that are not 2, 4 or 8, so every byte
    is covered without reading past `data + length`. Lengths above 8 are
    handled by the callers and must not be passed here.
    """
    if length >= 2:
        if length >= 4:
            # len 4-8
            var a = data.bitcast[Scalar[DType.uint32]]().load().cast[DType.uint64]()
            var b = data.offset(length - 4).bitcast[Scalar[DType.uint32]]().load().cast[DType.uint64]()
            return U128(a, b)
        else:
            # len 2-3
            var a = data.bitcast[Scalar[DType.uint16]]().load().cast[DType.uint64]()
            var b = data.offset(length - 1).load().cast[DType.uint64]()
            return U128(a, b)
    else:
        if length > 0:
            var a = data.load().cast[DType.uint64]()
            return U128(a, a)
        else:
            return U128(0, 0)


struct AHasher:
    """Hasher state: a running buffer, a pad and two extra key words."""

    var buffer: UInt64
    var pad: UInt64
    var extra_keys: U128

    fn __init__(out self, key: U256):
        """Seeds the state from `key` XOR-ed with four fixed constants."""
        var pi_key = key ^ U256(0x243f_6a88_85a3_08d3, 0x1319_8a2e_0370_7344, 0xa409_3822_299f_31d0, 0x082e_fa98_ec4e_6c89,)
        self.buffer = pi_key[0]
        self.pad = pi_key[1]
        self.extra_keys = U128(pi_key[2], pi_key[3])

    @always_inline
    fn update(mut self, new_data: UInt64):
        """Folds one 64-bit word into the buffer."""
        self.buffer = folded_multiply(new_data ^ self.buffer, MULTIPLE)

    @always_inline
    fn large_update(mut self, new_data: U128):
        """Folds two 64-bit words into the buffer using the extra keys."""
        var combined = folded_multiply(
            new_data[0] ^ self.extra_keys[0], new_data[1] ^ self.extra_keys[1]
        )
        self.buffer = rotate_bits_left[ROT]((self.buffer + self.pad) ^ combined)

    @always_inline
    fn short_finish(self) -> UInt64:
        """Returns `buffer + pad` without the final mixing step."""
        return self.buffer + self.pad

    @always_inline
    fn finish(self) -> UInt64:
        """Returns the hash: the folded state rotated left by `buffer & 63`."""
        var rot = self.buffer & 63
        var folded = folded_multiply(self.buffer, self.pad)
        # Masking the right-shift amount keeps it in 0..63. Without the mask
        # `rot == 0` asks for a shift by the full 64-bit width, which
        # LLVM-style shifts leave undefined; with it the rotate by zero
        # yields `folded` unchanged.
        return (folded << rot) | (folded >> ((64 - rot) & 63))

    @always_inline
    fn write(mut self, data: UnsafePointer[UInt8], length: Int):
        """Feeds `length` bytes starting at `data` into the hasher."""
        self.buffer = (self.buffer + length) * MULTIPLE
        if length > 8:
            if length > 16:
                # The last 16 bytes go in first; the loop then consumes
                # 16-byte blocks from the front while more than 16 remain.
                var tail = data.offset(length - 16).bitcast[Scalar[DType.uint64]]().load[width=2]()
                self.large_update(tail)
                var offset = 0
                while length - offset > 16:
                    var block = data.offset(offset).bitcast[Scalar[DType.uint64]]().load[width=2]()
                    self.large_update(block)
                    offset += 16
            else:
                # len 9-16: two 8-byte reads that may overlap.
                var a = data.bitcast[Scalar[DType.uint64]]().load()
                var b = data.offset(length - 8).bitcast[Scalar[DType.uint64]]().load()
                self.large_update(U128(a, b))
        else:
            var value = read_small(data, length)
            self.large_update(value)


@always_inline
fn ahash(s: KeyRef) -> UInt64:
    """Hashes the bytes referenced by `s` with a zero-keyed `AHasher`.

    Keys of at most eight bytes take a shortcut: one folded multiply of the
    packed bytes, with the length added to the pad.
    """
    var length = s.size
    var b = s.pointer
    var hasher = AHasher(U256(0, 0, 0, 0))

    if length > 8:
        hasher.write(b, length)
    else:
        var value = read_small(b, length)
        hasher.buffer = folded_multiply(value[0] ^ hasher.buffer, value[1] ^ hasher.extra_keys[1])
        hasher.pad = hasher.pad + length

    return hasher.finish()
KeysContainer, KeyRef, Keyable 5 | from .ahasher import ahash 6 | from .single_key_builder import SingleKeyBuilder 7 | 8 | struct Dict[ 9 | V: Copyable & Movable, 10 | hash: fn(KeyRef) -> UInt64 = ahash, 11 | KeyCountType: DType = DType.uint32, 12 | KeyOffsetType: DType = DType.uint32, 13 | destructive: Bool = True, 14 | caching_hashes: Bool = True, 15 | ](Sized): 16 | var keys: KeysContainer[KeyOffsetType] 17 | var key_hashes: UnsafePointer[Scalar[KeyCountType]] 18 | var values: List[V] 19 | var slot_to_index: UnsafePointer[Scalar[KeyCountType]] 20 | var deleted_mask: UnsafePointer[UInt8] 21 | var count: Int 22 | var capacity: Int 23 | var key_builder: SingleKeyBuilder 24 | 25 | fn __init__(out self, capacity: Int = 16): 26 | constrained[ 27 | KeyCountType == DType.uint8 or 28 | KeyCountType == DType.uint16 or 29 | KeyCountType == DType.uint32 or 30 | KeyCountType == DType.uint64, 31 | "KeyCountType needs to be an unsigned integer" 32 | ]() 33 | self.count = 0 34 | if capacity <= 8: 35 | self.capacity = 8 36 | else: 37 | var icapacity = Int64(capacity) 38 | self.capacity = capacity if pop_count(icapacity) == 1 else 39 | 1 << Int(bit_width(icapacity)) 40 | self.keys = KeysContainer[KeyOffsetType](capacity) 41 | self.key_builder = SingleKeyBuilder() 42 | @parameter 43 | if caching_hashes: 44 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 45 | else: 46 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(0) 47 | self.values = List[V](capacity=capacity) 48 | self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 49 | memset_zero(self.slot_to_index, self.capacity) 50 | @parameter 51 | if destructive: 52 | self.deleted_mask = UnsafePointer[UInt8].alloc(self.capacity >> 3) 53 | memset_zero(self.deleted_mask, self.capacity >> 3) 54 | else: 55 | self.deleted_mask = UnsafePointer[UInt8].alloc(0) 56 | 57 | fn __copyinit__(out self, existing: Self): 58 | self.count = existing.count 59 | self.capacity = 
existing.capacity 60 | self.keys = existing.keys 61 | self.key_builder = existing.key_builder 62 | @parameter 63 | if caching_hashes: 64 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 65 | memcpy(self.key_hashes, existing.key_hashes, self.capacity) 66 | else: 67 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(0) 68 | self.values = existing.values 69 | self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 70 | memcpy(self.slot_to_index, existing.slot_to_index, self.capacity) 71 | @parameter 72 | if destructive: 73 | self.deleted_mask = UnsafePointer[UInt8].alloc(self.capacity >> 3) 74 | memcpy(self.deleted_mask, existing.deleted_mask, self.capacity >> 3) 75 | else: 76 | self.deleted_mask = UnsafePointer[UInt8].alloc(0) 77 | 78 | fn __moveinit__(out self, owned existing: Self): 79 | self.count = existing.count 80 | self.capacity = existing.capacity 81 | self.keys = existing.keys^ 82 | self.key_builder = existing.key_builder^ 83 | self.key_hashes = existing.key_hashes 84 | self.values = existing.values^ 85 | self.slot_to_index = existing.slot_to_index 86 | self.deleted_mask = existing.deleted_mask 87 | 88 | fn __del__(owned self): 89 | self.slot_to_index.free() 90 | self.deleted_mask.free() 91 | self.key_hashes.free() 92 | 93 | fn __len__(self) -> Int: 94 | return self.count 95 | 96 | @always_inline 97 | fn __contains__[T: Keyable](self, key: T) -> Bool: 98 | try: 99 | self.key_builder.reset() 100 | key.accept(self.key_builder) 101 | var key_ref = self.key_builder.get_key() 102 | return self._find_key_index(key_ref) != 0 103 | except: 104 | return False 105 | 106 | fn put[T: Keyable](mut self, key: T, value: V) raises -> Bool: 107 | """Return True when value is inserted and not updated.""" 108 | if self.count / self.capacity >= 0.87: 109 | self._rehash() 110 | key.accept(self.keys) 111 | self.keys.end_key() 112 | var key_ref = self.keys.get_last() 113 | 114 | var key_hash = 
hash(key_ref).cast[KeyCountType]() 115 | var modulo_mask = self.capacity - 1 116 | var slot = Int(key_hash & modulo_mask) 117 | while True: 118 | var key_index = Int(self.slot_to_index.load(slot)) 119 | if key_index == 0: 120 | @parameter 121 | if caching_hashes: 122 | self.key_hashes.store(slot, key_hash) 123 | self.values.append(value) 124 | self.count += 1 125 | self.slot_to_index.store(slot, SIMD[KeyCountType, 1](self.keys.count)) 126 | return True 127 | @parameter 128 | if caching_hashes: 129 | var other_key_hash = self.key_hashes[slot] 130 | if other_key_hash == key_hash: 131 | var other_key = self.keys[key_index - 1] 132 | if eq(other_key, key_ref): 133 | self.values[key_index - 1] = value # replace value 134 | self.keys.drop_last() 135 | @parameter 136 | if destructive: 137 | if self._is_deleted(key_index - 1): 138 | self.count += 1 139 | self._not_deleted(key_index - 1) 140 | return True 141 | return False 142 | else: 143 | var other_key = self.keys[key_index - 1] 144 | if eq(other_key, key_ref): 145 | self.values[key_index - 1] = value # replace value 146 | self.keys.drop_last() 147 | @parameter 148 | if destructive: 149 | if self._is_deleted(key_index - 1): 150 | self.count += 1 151 | self._not_deleted(key_index - 1) 152 | return True 153 | return False 154 | 155 | slot = (slot + 1) & modulo_mask 156 | 157 | @always_inline 158 | fn _is_deleted(self, index: Int) -> Bool: 159 | var offset = index >> 3 160 | var bit_index = index & 7 161 | return self.deleted_mask.offset(offset).load() & (1 << bit_index) != 0 162 | 163 | @always_inline 164 | fn _deleted(self, index: Int): 165 | var offset = index >> 3 166 | var bit_index = index & 7 167 | var p = self.deleted_mask.offset(offset) 168 | var mask = p.load() 169 | p.store(mask | (1 << bit_index)) 170 | 171 | @always_inline 172 | fn _not_deleted(self, index: Int): 173 | var offset = index >> 3 174 | var bit_index = index & 7 175 | var p = self.deleted_mask.offset(offset) 176 | var mask = p.load() 177 | 
p.store(mask & ~(1 << bit_index)) 178 | 179 | @always_inline 180 | fn _rehash(mut self) raises: 181 | var old_slot_to_index = self.slot_to_index 182 | var old_capacity = self.capacity 183 | self.capacity <<= 1 184 | var mask_capacity = self.capacity >> 3 185 | self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 186 | memset_zero(self.slot_to_index, self.capacity) 187 | 188 | var key_hashes = self.key_hashes 189 | @parameter 190 | if caching_hashes: 191 | key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 192 | 193 | @parameter 194 | if destructive: 195 | var deleted_mask = UnsafePointer[UInt8].alloc(mask_capacity) 196 | memset_zero(deleted_mask, mask_capacity) 197 | memcpy(deleted_mask, self.deleted_mask, old_capacity >> 3) 198 | self.deleted_mask.free() 199 | self.deleted_mask = deleted_mask 200 | 201 | var modulo_mask = self.capacity - 1 202 | for i in range(old_capacity): 203 | if old_slot_to_index[i] == 0: 204 | continue 205 | var key_hash = SIMD[KeyCountType, 1](0) 206 | @parameter 207 | if caching_hashes: 208 | key_hash = self.key_hashes[i] 209 | else: 210 | key_hash = hash(self.keys[Int(old_slot_to_index[i] - 1)]).cast[KeyCountType]() 211 | 212 | var slot = Int(key_hash & modulo_mask) 213 | 214 | while True: 215 | var key_index = Int(self.slot_to_index.load(slot)) 216 | if key_index == 0: 217 | self.slot_to_index.store(slot, old_slot_to_index[i]) 218 | break 219 | else: 220 | slot = (slot + 1) & modulo_mask 221 | @parameter 222 | if caching_hashes: 223 | key_hashes[slot] = key_hash 224 | 225 | @parameter 226 | if caching_hashes: 227 | self.key_hashes.free() 228 | self.key_hashes = key_hashes 229 | old_slot_to_index.free() 230 | 231 | @always_inline 232 | fn get[T: Keyable](mut self, key: T, default: V) raises -> V: 233 | self.key_builder.reset() 234 | key.accept(self.key_builder) 235 | var key_ref = self.key_builder.get_key() 236 | var key_index = self._find_key_index(key_ref) 237 | if key_index == 0: 238 | return 
default 239 | @parameter 240 | if destructive: 241 | if self._is_deleted(key_index - 1): 242 | return default 243 | return self.values[key_index - 1] 244 | 245 | fn delete[T: Keyable](mut self, key: T) raises: 246 | @parameter 247 | if not destructive: 248 | return 249 | 250 | self.key_builder.reset() 251 | key.accept(self.key_builder) 252 | var key_ref = self.key_builder.get_key() 253 | var key_index = self._find_key_index(key_ref) 254 | if key_index == 0: 255 | return 256 | if not self._is_deleted(key_index - 1): 257 | self.count -= 1 258 | self._deleted(key_index - 1) 259 | 260 | fn clear(mut self): 261 | self.values.clear() 262 | self.keys.clear() 263 | memset_zero(self.slot_to_index, self.capacity) 264 | @parameter 265 | if destructive: 266 | memset_zero(self.deleted_mask, self.capacity >> 3) 267 | self.count = 0 268 | 269 | fn _find_key_index(self, key_ref: KeyRef) raises -> Int: 270 | var key_hash = hash(key_ref).cast[KeyCountType]() 271 | var modulo_mask = self.capacity - 1 272 | var slot = Int(key_hash & modulo_mask) 273 | while True: 274 | var key_index = Int(self.slot_to_index.load(slot)) 275 | if key_index == 0: 276 | return key_index 277 | @parameter 278 | if caching_hashes: 279 | var other_key_hash = self.key_hashes[slot] 280 | if key_hash == other_key_hash: 281 | var other_key = self.keys[key_index - 1] 282 | if eq(other_key, key_ref): 283 | return key_index 284 | else: 285 | var other_key = self.keys[key_index - 1] 286 | if eq(other_key, key_ref): 287 | return key_index 288 | slot = (slot + 1) & modulo_mask 289 | 290 | 291 | fn debug(self) raises: 292 | print("Dict count:", self.count, "and capacity:", self.capacity) 293 | print("KeyMap:") 294 | for i in range(self.capacity): 295 | var end = ", " if i < self.capacity - 1 else "\n" 296 | print(self.slot_to_index.load(i), end=end) 297 | print("Keys:") 298 | self.keys.print_keys() 299 | @parameter 300 | if caching_hashes: 301 | print("KeyHashes:") 302 | for i in range(self.capacity): 303 | var end = ", 
" if i < self.capacity - 1 else "\n" 304 | if self.slot_to_index.load(i) > 0: 305 | print(self.key_hashes.load(i), end=end) 306 | else: 307 | print(0, end=end) 308 | -------------------------------------------------------------------------------- /generic_dict/key_eq.mojo: -------------------------------------------------------------------------------- 1 | from .keys_container import KeyRef 2 | 3 | @always_inline 4 | fn eq(a: KeyRef, b: KeyRef) -> Bool: 5 | var l = a.size 6 | if l != b.size: 7 | return False 8 | var p1 = a.pointer 9 | var p2 = b.pointer 10 | var offset = 0 11 | alias step = 16 12 | while l - offset >= step: 13 | var unequal = p1.load[width=step](offset) != p2.load[width=step](offset) 14 | if unequal.reduce_or(): 15 | return False 16 | offset += step 17 | while l - offset > 0: 18 | if p1.load(offset) != p2.load(offset): 19 | return False 20 | offset += 1 21 | return True 22 | -------------------------------------------------------------------------------- /generic_dict/keys_container.mojo: -------------------------------------------------------------------------------- 1 | from collections.vector import InlinedFixedVector 2 | from memory import memcpy, bitcast 3 | 4 | trait Keyable: 5 | fn accept[T: KeysBuilder](self, mut keys_builder: T): ... 6 | 7 | alias lookup = String("0123456789abcdef") 8 | 9 | @fieldwise_init 10 | struct KeyRef(Stringable, Copyable, Movable): 11 | var pointer: UnsafePointer[UInt8] 12 | var size: Int 13 | 14 | fn __str__(self) -> String: 15 | var result = String("(") + String(self.size) + (")") 16 | for i in range(self.size): 17 | result += lookup[Int(self.pointer.load(i) >> 4)] 18 | result += lookup[Int(self.pointer.load(i) & 0xf)] 19 | return result 20 | 21 | trait KeysBuilder: 22 | fn add[T: DType, size: Int](mut self, value: SIMD[T, size]): ... 23 | fn add_buffer[T: DType](mut self, pointer: UnsafePointer[Scalar[T]], size: Int): ... 

struct KeysContainer[KeyEndType: DType = DType.uint32](Sized, KeysBuilder):
    """Append-only store of variable-length keys packed into one byte buffer.

    `keys` holds the concatenated key bytes and `keys_end[i]` the end offset of
    key `i`. A key is built with `add` / `add_buffer` and committed with
    `end_key`; `key_size` tracks the bytes of the key currently being built.
    """
    var keys: UnsafePointer[UInt8]
    var allocated_bytes: Int
    var keys_end: UnsafePointer[Scalar[KeyEndType]]
    var count: Int
    var capacity: Int
    var key_size: Int

    fn __init__(out self, capacity: Int):
        """Reserve room for `capacity` keys and `capacity * 8` key bytes (at least 2 keys)."""
        constrained[
            KeyEndType == DType.uint8 or
            KeyEndType == DType.uint16 or
            KeyEndType == DType.uint32 or
            KeyEndType == DType.uint64,
            "KeyEndType needs to be an unsigned integer"
        ]()
        # Both buffers grow by `x += x >> 1`, which makes no progress for 0 or 1
        # (the byte-growth loop would never terminate), so clamp the start size.
        var _capacity = capacity if capacity >= 2 else 2
        self.allocated_bytes = _capacity << 3
        self.keys = UnsafePointer[UInt8].alloc(self.allocated_bytes)
        self.keys_end = UnsafePointer[Scalar[KeyEndType]].alloc(_capacity)
        self.count = 0
        self.capacity = _capacity
        self.key_size = 0

    fn __copyinit__(out self, existing: Self):
        """Deep copy of both buffers."""
        self.allocated_bytes = existing.allocated_bytes
        self.count = existing.count
        self.capacity = existing.capacity
        self.key_size = existing.key_size
        self.keys = UnsafePointer[UInt8].alloc(self.allocated_bytes)
        memcpy(self.keys, existing.keys, self.allocated_bytes)
        # `keys_end` holds `capacity` entries; sizing it from `allocated_bytes`
        # is unrelated to the entry count and can be too small for short keys.
        self.keys_end = UnsafePointer[Scalar[KeyEndType]].alloc(self.capacity)
        memcpy(self.keys_end, existing.keys_end, self.capacity)

    fn __moveinit__(out self, owned existing: Self):
        """Take over the buffers of `existing` without copying."""
        self.allocated_bytes = existing.allocated_bytes
        self.count = existing.count
        self.capacity = existing.capacity
        self.key_size = existing.key_size
        self.keys = existing.keys
        self.keys_end = existing.keys_end

    fn __del__(owned self):
        """Free both buffers."""
        self.keys.free()
        self.keys_end.free()

    @always_inline
    fn add[T: DType, size: Int](mut self, value: SIMD[T, size]):
        """Append the raw bytes of `value` to the key currently being built."""
        var prev_end = 0 if self.count == 0 else self.keys_end[self.count - 1]
        var key_length = size * T.sizeof()
        var old_key_size = self.key_size
        self.key_size += key_length
        var new_end = prev_end + self.key_size

        # Grow by 1.5x until the pending key fits.
        var needs_realocation = False
        while new_end > self.allocated_bytes:
            self.allocated_bytes += self.allocated_bytes >> 1
            needs_realocation = True

        if needs_realocation:
            var keys = UnsafePointer[UInt8].alloc(self.allocated_bytes)
            memcpy(keys, self.keys, Int(prev_end) + old_key_size)
            self.keys.free()
            self.keys = keys

        self.keys.store(prev_end + old_key_size, bitcast[DType.uint8, size * T.sizeof()](value))

    @always_inline
    fn add_buffer[T: DType](mut self, pointer: UnsafePointer[Scalar[T]], size: Int):
        """Append `size` elements read from `pointer` to the key currently being built."""
        var prev_end = 0 if self.count == 0 else self.keys_end[self.count - 1]
        var key_length = size * T.sizeof()
        var old_key_size = self.key_size
        self.key_size += key_length
        var new_end = prev_end + self.key_size

        # Grow by 1.5x until the pending key fits.
        var needs_realocation = False
        while new_end > self.allocated_bytes:
            self.allocated_bytes += self.allocated_bytes >> 1
            needs_realocation = True

        if needs_realocation:
            var keys = UnsafePointer[UInt8].alloc(self.allocated_bytes)
            memcpy(keys, self.keys, Int(prev_end) + old_key_size)
            self.keys.free()
            self.keys = keys

        memcpy(self.keys.offset(prev_end + old_key_size), pointer.bitcast[UInt8](), key_length)

    @always_inline
    fn end_key(mut self):
        """Commit the key being built: record its end offset and start a new key."""
        var prev_end = 0 if self.count == 0 else self.keys_end[self.count - 1]
        var count = self.count + 1
        if count >= self.capacity:
            var new_capacity = self.capacity + (self.capacity >> 1)
            # Allocate by entry count; the original sized this buffer from
            # `allocated_bytes`, which can be smaller than `new_capacity` when
            # keys are short or empty and then overflows on `store`.
            var keys_end = UnsafePointer[Scalar[KeyEndType]].alloc(new_capacity)
            memcpy(keys_end, self.keys_end, self.count)
            self.keys_end.free()
            self.keys_end = keys_end
            self.capacity = new_capacity

        self.keys_end.store(self.count, prev_end + self.key_size)
        self.count = count
        self.key_size = 0

    @always_inline
    fn drop_last(mut self):
        """Forget the most recently committed key."""
        self.count -= 1

    @always_inline
    fn get_last(self) raises -> KeyRef:
        """Return a reference to the most recently committed key."""
        return self.get(self.count - 1)

    @always_inline
    fn get(self, index: Int) raises -> KeyRef:
        """Return a reference to key `index`; raises "Invalid index" when out of range."""
        if index < 0 or index >= self.count:
            raise "Invalid index"
        var start = 0 if index == 0 else Int(self.keys_end[index - 1])
        var length = Int(self.keys_end[index]) - start
        return KeyRef(self.keys.offset(start), length)

    @always_inline
    fn clear(mut self):
        """Drop all keys; the buffers are kept for reuse."""
        self.count = 0

    @always_inline
    fn __getitem__(self, index: Int) raises -> KeyRef:
        """Same as `get`."""
        return self.get(index)

    @always_inline
    fn __len__(self) -> Int:
        """Number of committed keys."""
        return self.count

    fn print_keys(self) raises:
        """Print the key count followed by all keys in their `KeyRef` string form."""
        print("(" + String(self.count) + ")[")
        for i in range(self.count):
            # The last printed key is `count - 1`; comparing against `capacity`
            # left the list without its closing bracket.
            var end = ", " if i < self.count - 1 else "]\n"
            print(self[i], end=end)
        if self.count == 0:
            print("]")

--------------------------------------------------------------------------------
/generic_dict/multi_dict.mojo:
--------------------------------------------------------------------------------
from .ahasher import ahash
from .key_eq import eq
from .keys_container import KeyRef, KeysContainer, Keyable
from .single_key_builder import SingleKeyBuilder
from .sparse_array import SparseArray
from bit import pop_count, bit_width
from memory import memset_zero, memcpy

@fieldwise_init
struct _ValuesIter[
    list_mutability: Bool, //,
    T: Copyable & Movable,
    NextKeyCountType: DType,
    list_lifetime: Origin[list_mutability],
](Copyable, Movable):
    """Iterator over all values stored under one key of a `MultiDict`.

    The first element comes from `values`, every following one from
    `next_values`, chained through `next_next_values_index`.
    """
    var current_index: Optional[Int]
    var next_index: Optional[Int]
    var values: List[T]
    var next_values: List[T]
    var next_next_values_index: SparseArray[NextKeyCountType]
    var first: Bool

    fn __iter__(self) -> Self:
        return self

    fn __next__(
        mut self,
    ) -> T:
        """Return the current element and advance along the chain."""
        var element = self.values[self.current_index.or_else(0)] if self.first else self.next_values[self.current_index.or_else(0)]
        self.first = False
        self.current_index = self.next_index
        if self.current_index:
            var next = self.next_next_values_index.get(self.current_index.value())
            self.next_index = Optional(Int(next.or_else(-1))) if next else None
        else:
            # End of the chain: do not query the sparse array with index -1.
            # NOTE(review): `SparseArray.get` is not visible here; a negative
            # index presumably reads before the mask buffer - confirm.
            self.next_index = None
        return element

    fn __has_next__(self) -> Bool:
        return self.current_index.or_else(-1) >= 0

    fn __len__(self) -> Int:
        """Return 0 when exhausted, 1 when one element is left, otherwise 2 (not an exact count)."""
        if not self.current_index:
            return 0
        if not self.next_index:
            return 1
        return 2

struct MultiDict[
    V: Copyable & Movable,
    hash: fn(KeyRef) -> UInt64 = ahash,
    KeyCountType: DType = DType.uint32,
    NextKeyCountType: DType = DType.uint16,
    KeyOffsetType: DType = DType.uint32,
    caching_hashes: Bool = True,
](Sized):
    """Hash map that keeps every value put under a key, using linear probing.

    The first value of a key lives in `values` at the key position. Additional
    values are appended to `next_values`; `next_values_index` maps a key
    position to its second value and `next_next_values_index` chains each
    additional value to the following one.
    """
    var keys: KeysContainer[KeyOffsetType]
    var key_hashes: UnsafePointer[Scalar[KeyCountType]]
    var values: List[V]
    var next_values_index: SparseArray[NextKeyCountType]
    var next_values: List[V]
    var next_next_values_index: SparseArray[NextKeyCountType]
    var slot_to_index: UnsafePointer[Scalar[KeyCountType]]
    var count: Int
    var capacity: Int
    var key_builder: SingleKeyBuilder

    fn __init__(out self, capacity: Int = 16):
        """Create an empty dict; `capacity` is rounded up to a power of two, minimum 8."""
        constrained[
            KeyCountType == DType.uint8 or
            KeyCountType == DType.uint16 or
            KeyCountType == DType.uint32 or
            KeyCountType == DType.uint64,
            "KeyCountType needs to be an unsigned integer"
        ]()
        constrained[
            NextKeyCountType == DType.uint8 or
            NextKeyCountType == DType.uint16 or
            NextKeyCountType == DType.uint32 or
            NextKeyCountType == DType.uint64,
            "NextKeyCountType needs to be an unsigned integer"
        ]()
        self.count = 0
        if capacity <= 8:
            self.capacity = 8
        else:
            var icapacity = Int64(capacity)
            self.capacity = capacity if pop_count(icapacity) == 1 else 1 << Int(bit_width(icapacity))
        # Use the normalised capacity: the raw argument may be 0 or negative.
        self.keys = KeysContainer[KeyOffsetType](self.capacity)
        self.key_builder = SingleKeyBuilder()
        @parameter
        if caching_hashes:
            self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity)
        else:
            self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(0)
        self.values = List[V](capacity=self.capacity)
        self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity)
        memset_zero(self.slot_to_index, self.capacity)
        #TODO: Think about having an optional here or an empty List
        self.next_values = List[V]()
        self.next_values_index = SparseArray[NextKeyCountType]()
        self.next_next_values_index = SparseArray[NextKeyCountType]()

    fn __copyinit__(out self, existing: Self):
        """Deep copy: raw buffers are reallocated and copied."""
        self.count = existing.count
        self.capacity = existing.capacity
        self.keys = existing.keys
        self.key_builder = existing.key_builder
        @parameter
        if caching_hashes:
            self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity)
            memcpy(self.key_hashes, existing.key_hashes, self.capacity)
        else:
            self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(0)
        self.values = existing.values
        self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity)
        memcpy(self.slot_to_index, existing.slot_to_index, self.capacity)
        self.next_values = existing.next_values
        self.next_values_index = existing.next_values_index
        self.next_next_values_index = existing.next_next_values_index

    fn __moveinit__(out self, owned existing: Self):
        """Take over all buffers of `existing` without copying."""
        self.count = existing.count
        self.capacity = existing.capacity
        self.keys = existing.keys^
        self.key_builder = existing.key_builder^
        self.key_hashes = existing.key_hashes
        self.values = existing.values^
        self.slot_to_index = existing.slot_to_index
        self.next_values = existing.next_values^
        self.next_values_index = existing.next_values_index^
        self.next_next_values_index = existing.next_next_values_index^

    fn __del__(owned self):
        """Free the raw buffers owned directly by the dict."""
        self.slot_to_index.free()
        self.key_hashes.free()

    fn __len__(self) -> Int:
        """Number of distinct keys."""
        return self.count

    fn put[T: Keyable](mut self, key: T, value: V) raises:
        """Store `value` under `key`; an existing key gets the value appended to its chain."""
        if self.count / self.capacity >= 0.87:
            self._rehash()
        # The key is appended optimistically; `_add_next` drops it again when
        # the key already exists.
        key.accept(self.keys)
        self.keys.end_key()
        var key_ref = self.keys.get_last()

        var key_hash = hash(key_ref).cast[KeyCountType]()
        var modulo_mask = self.capacity - 1
        var slot = Int(key_hash & modulo_mask)
        while True:
            var key_index = Int(self.slot_to_index.load(slot))
            if key_index == 0:
                @parameter
                if caching_hashes:
                    self.key_hashes.store(slot, key_hash)
                self.values.append(value)
                self.count += 1
                self.slot_to_index.store(slot, SIMD[KeyCountType, 1](self.keys.count))
                return
            @parameter
            if caching_hashes:
                var other_key_hash = self.key_hashes[slot]
                if other_key_hash == key_hash:
                    var other_key = self.keys[key_index - 1]
                    if eq(other_key, key_ref):
                        self._add_next(value, key_index)
                        return
            else:
                var other_key = self.keys[key_index - 1]
                if eq(other_key, key_ref):
                    self._add_next(value, key_index)
                    return

            slot = (slot + 1) & modulo_mask

    @always_inline
    fn _add_next(mut self, value: V, key_index: Int):
        """Append `value` to the chain of the existing key at 1-based `key_index`."""
        self.next_values.append(value)
        var next_index = self.next_values_index.get(key_index - 1)
        if not next_index:
            self.next_values_index[key_index - 1] = len(self.next_values) - 1
        else:
            # Walk to the end of the chain and link the new value there.
            var index = Int(next_index.value())
            var next_next_index = self.next_next_values_index.get(index)
            while next_next_index:
                index = Int(next_next_index.value())
                next_next_index = self.next_next_values_index.get(index)
            self.next_next_values_index[index] = len(self.next_values) - 1
        self.keys.drop_last()

@always_inline 191 | fn _rehash(mut self) raises: 192 | var old_slot_to_index = self.slot_to_index 193 | var old_capacity = self.capacity 194 | self.capacity <<= 1 195 | self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 196 | memset_zero(self.slot_to_index, self.capacity) 197 | 198 | var key_hashes = self.key_hashes 199 | @parameter 200 | if caching_hashes: 201 | key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 202 | 203 | var modulo_mask = self.capacity - 1 204 | for i in range(old_capacity): 205 | if old_slot_to_index[i] == 0: 206 | continue 207 | var key_hash = SIMD[KeyCountType, 1](0) 208 | @parameter 209 | if caching_hashes: 210 | key_hash = self.key_hashes[i] 211 | else: 212 | key_hash = hash(self.keys[Int(old_slot_to_index[i] - 1)]).cast[KeyCountType]() 213 | 214 | var slot = Int(key_hash & modulo_mask) 215 | 216 | while True: 217 | var key_index = Int(self.slot_to_index.load(slot)) 218 | 219 | if key_index == 0: 220 | self.slot_to_index.store(slot, old_slot_to_index[i]) 221 | break 222 | else: 223 | slot = (slot + 1) & modulo_mask 224 | @parameter 225 | if caching_hashes: 226 | key_hashes[slot] = key_hash 227 | 228 | @parameter 229 | if caching_hashes: 230 | self.key_hashes.free() 231 | self.key_hashes = key_hashes 232 | old_slot_to_index.free() 233 | 234 | @always_inline 235 | fn get[T: Keyable](mut self, key: T) raises -> List[V]: 236 | var result = List[V]() 237 | self.key_builder.reset() 238 | key.accept(self.key_builder) 239 | var key_ref = self.key_builder.get_key() 240 | var key_index = self._find_key_index(key_ref) 241 | if key_index == 0: 242 | return result 243 | result.append(self.values[key_index - 1]) 244 | var next_index = self.next_values_index.get(key_index - 1) 245 | if not next_index: 246 | return result 247 | var index = Int(next_index.value()) 248 | result.append(self.next_values[index]) 249 | var next_next_index = self.next_next_values_index.get(index) 250 | while next_next_index: 251 | 
index = Int(next_next_index.value()) 252 | result.append(self.next_values[index]) 253 | next_next_index = self.next_next_values_index.get(index) 254 | return result 255 | 256 | fn get_itter[T: Keyable](mut self, key: T) raises -> _ValuesIter[V, NextKeyCountType, __origin_of(self)]: 257 | self.key_builder.reset() 258 | key.accept(self.key_builder) 259 | var key_ref = self.key_builder.get_key() 260 | var key_index = self._find_key_index(key_ref) 261 | if key_index == 0: 262 | return _ValuesIter[ 263 | list_mutability=True, 264 | T=V, 265 | NextKeyCountType=NextKeyCountType, 266 | list_lifetime=__origin_of(self) 267 | ]( 268 | None, 269 | None, 270 | self.values, 271 | self.next_values, 272 | self.next_next_values_index, 273 | True 274 | ) 275 | var next_index = self.next_values_index.get(key_index - 1) 276 | if not next_index: 277 | return _ValuesIter[ 278 | list_mutability=True, 279 | T=V, 280 | NextKeyCountType=NextKeyCountType, 281 | list_lifetime=__origin_of(self) 282 | ](Optional(key_index - 1), None, self.values, self.next_values, self.next_next_values_index, True) 283 | return _ValuesIter[ 284 | list_mutability=True, 285 | T=V, 286 | NextKeyCountType=NextKeyCountType, 287 | list_lifetime=__origin_of(self) 288 | ](Optional(key_index - 1), Optional(Int(next_index.value())), self.values, self.next_values, self.next_next_values_index, True) 289 | 290 | fn _find_key_index(self, key_ref: KeyRef) raises -> Int: 291 | var key_hash = hash(key_ref).cast[KeyCountType]() 292 | var modulo_mask = self.capacity - 1 293 | var slot = Int(key_hash & modulo_mask) 294 | while True: 295 | var key_index = Int(self.slot_to_index.load(slot)) 296 | if key_index == 0: 297 | return key_index 298 | @parameter 299 | if caching_hashes: 300 | var other_key_hash = self.key_hashes[slot] 301 | if key_hash == other_key_hash: 302 | var other_key = self.keys[key_index - 1] 303 | if eq(other_key, key_ref): 304 | return key_index 305 | else: 306 | var other_key = self.keys[key_index - 1] 307 | if 
eq(other_key, key_ref): 308 | return key_index 309 | slot = (slot + 1) & modulo_mask 310 | 311 | fn debug(self) raises: 312 | print("Dict count:", self.count, "and capacity:", self.capacity) 313 | print("KeyMap:") 314 | for i in range(self.capacity): 315 | var end = ", " if i < self.capacity - 1 else "" 316 | print(self.slot_to_index.load(i), end=end) 317 | print("Keys:") 318 | self.keys.print_keys() 319 | @parameter 320 | if caching_hashes: 321 | print("KeyHashes:") 322 | for i in range(self.capacity): 323 | var end = ", " if i < self.capacity - 1 else "" 324 | if self.slot_to_index.load(i) > 0: 325 | print(self.key_hashes.load(i), end=end) 326 | else: 327 | print(0, end=end) 328 | print("Next Values:") 329 | self.next_values_index.debug() 330 | print("Next Next Values:") 331 | self.next_next_values_index.debug() -------------------------------------------------------------------------------- /generic_dict/single_key_builder.mojo: -------------------------------------------------------------------------------- 1 | from .keys_container import KeysBuilder, KeyRef 2 | from memory import memcpy, bitcast 3 | 4 | struct SingleKeyBuilder(KeysBuilder): 5 | var key: UnsafePointer[UInt8] 6 | var allocated_bytes: Int 7 | var key_size: Int 8 | 9 | fn __init__(out self, bytes: Int = 64): 10 | self.allocated_bytes = bytes 11 | self.key = UnsafePointer[UInt8].alloc(self.allocated_bytes) 12 | self.key_size = 0 13 | 14 | fn __copyinit__(out self, existing: Self): 15 | self.allocated_bytes = existing.allocated_bytes 16 | self.key = UnsafePointer[UInt8].alloc(self.allocated_bytes) 17 | memcpy(self.key, existing.key, self.allocated_bytes) 18 | self.key_size = existing.key_size 19 | 20 | fn __moveinit__(out self, owned existing: Self): 21 | self.allocated_bytes = existing.allocated_bytes 22 | self.key = existing.key 23 | self.key_size = existing.key_size 24 | 25 | fn __del__(owned self): 26 | self.key.free() 27 | 28 | @always_inline 29 | fn add[T: DType, size: Int](mut self, value: 
from collections import Optional
from bit import pop_count
from tensor import Tensor, TensorSpec
from memory import memset_zero, memcpy


struct SparseArray[T: DType]:
    """Sparse array of `Scalar[T]` values addressed by a non-negative index.

    `mask` holds one bit per index (bit `index & 7` of byte `index >> 3`)
    telling whether the index is set. The values of the set indices are
    stored densely in `values`, ordered by index, so the position of an index
    inside `values` equals the number of mask bits set before it.
    """

    var mask: UnsafePointer[UInt8]
    var values: UnsafePointer[Scalar[T]]
    var mask_size: Int
    var values_count: Int
    var values_capacity: Int

    fn __init__(out self, capacity: Int = 8):
        """Creates an empty array whose mask covers at least `capacity` indices.

        Args:
            capacity: Number of indices the mask should cover up front.
                Values below 8 are raised to 8 (one mask byte).
        """
        var _capacity = capacity if capacity >= 8 else 8
        # Ceiling division by 8: number of mask bytes needed.
        self.mask_size = -(-_capacity >> 3)
        self.mask = UnsafePointer[UInt8].alloc(self.mask_size)
        memset_zero(self.mask, self.mask_size)
        self.values_capacity = 4
        self.values_count = 0
        self.values = UnsafePointer[Scalar[T]].alloc(self.values_capacity)

    fn __copyinit__(out self, existing: Self):
        """Deep-copies the mask and the stored values of `existing`."""
        self.mask_size = existing.mask_size
        self.values_count = existing.values_count
        self.values_capacity = existing.values_capacity
        self.mask = UnsafePointer[UInt8].alloc(self.mask_size)
        memcpy(self.mask, existing.mask, self.mask_size)
        self.values = UnsafePointer[Scalar[T]].alloc(self.values_capacity)
        memcpy(self.values, existing.values, self.values_count)

    fn __moveinit__(out self, owned existing: Self):
        """Takes over the buffers of `existing` without copying."""
        self.mask_size = existing.mask_size
        self.values_count = existing.values_count
        self.values_capacity = existing.values_capacity
        self.mask = existing.mask
        self.values = existing.values

    fn __del__(owned self):
        """Releases the mask and value buffers."""
        self.mask.free()
        self.values.free()

    @always_inline
    fn __contains__(self, index: Int) -> Bool:
        """Returns True if a value is stored at `index`."""
        var offset = index >> 3
        var bit_index = index & 7
        return self.contains(offset, bit_index)

    @always_inline
    fn contains(self, offset: Int, bit_index: Int) -> Bool:
        """Returns True if bit `bit_index` of mask byte `offset` is set.

        Offsets outside the mask (including negative ones, which a negative
        index produces) are reported as not contained instead of being read.
        """
        return offset >= 0 and offset < self.mask_size and self.mask.load(offset) & (1 << bit_index) != 0

    fn __setitem__(mut self, index: Int, value: SIMD[T, 1]):
        """Stores `value` at `index`, overwriting any previous value.

        `index` is expected to be non-negative. The mask and the value buffer
        grow as needed.
        """
        var offset = index >> 3
        var bit_index = index & 7

        # Grow the mask so that it covers `index`; new bytes start cleared.
        if self.mask_size <= offset:
            var grown_mask = UnsafePointer[UInt8].alloc(offset + 1)
            memcpy(grown_mask, self.mask, self.mask_size)
            memset_zero(grown_mask.offset(self.mask_size), offset + 1 - self.mask_size)
            self.mask.free()
            self.mask = grown_mask
            self.mask_size = offset + 1

        var p = self.mask.offset(offset)
        var mask = p.load()

        # Index already present: overwrite in place.
        if self.contains(offset, bit_index):
            self.values.store(self._value_index(offset, bit_index), value)
            return

        p.store(mask | (1 << bit_index))

        # Grow the value buffer by 1.5x before it becomes full.
        if self.values_capacity <= self.values_count + 1:
            var values_capacity = self.values_capacity + (self.values_capacity >> 1)
            var values = UnsafePointer[Scalar[T]].alloc(values_capacity)
            memcpy(values, self.values, self.values_count)
            self.values.free()
            self.values = values
            self.values_capacity = values_capacity

        # Shift the tail one slot to the right to keep values ordered by index.
        var value_index = self._value_index(offset, bit_index)
        for i in range(self.values_count, value_index, -1):
            self.values.store(i, self.values.load(i-1))
        self.values.store(value_index, value)
        self.values_count += 1

    fn get(self, index: Int) -> Optional[SIMD[T, 1]]:
        """Returns the value stored at `index`, or None if there is none."""
        var offset = index >> 3
        var bit_index = index & 7

        if not self.contains(offset, bit_index):
            return None

        var idx = self._value_index(offset, bit_index)
        # Defensive check: a set mask bit must map into the stored values.
        if idx < 0 or idx >= self.values_count:
            print("ERROR: Invalid value index:", idx)
            return None
        return self.values.load(idx)

    @always_inline
    fn _value_index(self, offset: Int, bit_index: Int) -> Int:
        """Returns how many mask bits are set before the given bit position.

        This is the position inside `values` that belongs to the index
        `offset * 8 + bit_index`.
        """
        var count = 0
        var i = 0
        # Bits set in all complete bytes before `offset`.
        while i < offset:
            count += Int(pop_count(self.mask.load(i)))
            i += 1

        # Bits set below `bit_index` inside the byte at `offset`.
        var byte = self.mask.load(offset)
        var mask = (1 << bit_index) - 1
        var before_bit = byte & mask
        count += Int(pop_count(before_bit))

        return count

    fn dense_values_list(self) -> List[Scalar[T]]:
        """Returns the stored values, ordered by index, as a list.

        NOTE(review): at most 10000 values are returned; larger arrays are
        truncated after printing a warning. Confirm whether this cap is
        still wanted.
        """
        var count = self.values_count
        if count > 10000:
            print("WARNING: very large count", count)
            count = 10000  # prevent hang

        if count == 0:
            return []

        var result = List[Scalar[T]](unsafe_uninit_length=count)
        for i in range(count):
            result[i] = self.values.load(i)
        return result


    fn debug(self):
        """Prints the mask bytes and the stored values."""
        print("(" + String(self.mask_size) + ")[")
        for i in range(self.mask_size):
            var end = ", " if i < self.mask_size - 1 else ""
            print(self.mask.load(i), end=end)
        print("]")

        print("(" + String(self.values_count) + ")[")
        for i in range(self.values_count):
            # The separator depends on the number of values, not of mask bytes.
            var end = ", " if i < self.values_count - 1 else ""
            print(self.values.load(i), end=end)
        print("]")
hindi_text_to_keys 3 | 4 | fn main() raises: 5 | var corpus = system_words_collection() 6 | var dict = Dict[Int](len(corpus)) 7 | for _ in range(100): 8 | for i in range(len(corpus)): 9 | dict.put(corpus[i], i) 10 | 11 | var sum = 0 12 | for _ in range(100): 13 | sum = 0 14 | for i in range(len(corpus)): 15 | sum += dict.get(corpus[i], -1) 16 | 17 | print(sum) 18 | -------------------------------------------------------------------------------- /memory_consumption_std_lib_dict.mojo: -------------------------------------------------------------------------------- 1 | from collections import Dict 2 | from corpora import system_words_collection, hindi_text_to_keys 3 | 4 | fn main() raises: 5 | var corpus = system_words_collection() 6 | var dict = Dict[String, Int]() 7 | for _ in range(100): 8 | for i in range(len(corpus)): 9 | dict[corpus[i]] = i 10 | 11 | var sum = 0 12 | for _ in range(100): 13 | sum = 0 14 | for i in range(len(corpus)): 15 | sum += dict[corpus[i]] 16 | 17 | print(sum) 18 | -------------------------------------------------------------------------------- /pixi.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | authors = ["Maxim Zaks ", "Daniel Gustaw "] 3 | channels = ["https://conda.modular.com/max-nightly", "conda-forge"] 4 | name = "compact-dict" 5 | platforms = ["linux-64"] 6 | version = "0.1.0" 7 | 8 | [tasks] 9 | 10 | [dependencies] 11 | modular = ">=25.5.0.dev2025072405,<26" 12 | -------------------------------------------------------------------------------- /report_i7_2_8.csv: -------------------------------------------------------------------------------- 1 | Corpus,Number of keys,Total bytes,Min key,Avg key,Max key,Build stdlib,Read stdlib,Delete stdlib,Read after delete stdlib,Build compact nc,Build compact,Read compact,Delete compact,Read after delete compact,Speedup build nc,Speedup build,Speedup read,Speadup delete,Speedup read after delete,Read Checksum,Read Checksum after delete 
2 | Arabic,463,4223,2,9.1209503239740819,26,282161.26750000002,266846.96649999998,25982.4938,251971.11499999999,44130.014150000003,37598.3321,33078.459300000002,3796.3750749999999,33273.6777,6.3938630642836527,7.5046219270987278,8.0670917614352113,6.844027074959131,7.5726860514730543,True,True 3 | Chinese,10,4647,441,464.69999999999999,480,16025.14772063509,18314.451569610952,2830.3437250000002,16485.900081009782,6502.6824200000001,8495.0055049999992,4019.8425699999998,802.64647549999995,3991.0950549999998,2.4643903370318809,1.8864199335954508,4.5560121449261013,3.5262644406889945,4.1306708694788989,True,True 4 | English,999,4289,1,4.293293293293293,13,516880.22249999997,502126.95250000001,39695.287499999999,410971.52649999998,77203.731549999997,73065.018899999995,65741.021649999995,8142.0887400000001,67550.941399999996,6.6950160584563099,7.0742501717193145,7.6379548096055485,4.8753199292691569,6.0838756349293455,True,True 5 | French,471,2977,2,6.3205944798301488,19,341206.46399999998,246948.72200000001,21297.802361912833,242384.2065,43061.94915,37721.114300000001,33534.9614,3896.972655,33558.7333,7.9236186641588642,9.045503303172568,7.3639184806099109,5.4652172974800699,7.2226863968074753,True,True 6 | Georgien,381,5982,6,15.700787401574804,42,259902.92249999999,224538.21063674512,21832.607244233364,213897.87081594602,39817.638099999996,34050.844400000002,28084.701550000002,3465.9476549999999,27987.880349999999,6.5273314767507511,7.6327893501519153,7.995036380820828,6.2991739684058681,7.6425176948402251,True,True 7 | German,999,5644,2,5.6496496496496498,18,507176.1605,500103.71399999998,41164.0988,435041.69650000002,76674.384300000005,72348.200349999999,64904.020049999999,7966.6006150000003,66604.793550000002,6.6146753590559975,7.0102111461850631,7.7052810228817243,5.1670845306960329,6.5316874854272422,True,True 8 | 
Greek,452,4693,3,10.382743362831858,28,268406.69199999998,256531.18599999999,27333.841799999998,244574.5575,43754.644950000002,37785.7929,31911.987150000001,3830.38267,32000.785650000002,6.1343588162289509,7.1033759357739994,8.0387092409568108,7.1360603247507921,7.642767279996451,True,True 9 | Hebrew,376,3346,2,8.8989361702127656,25,220431.24414348463,192329.32168501677,16003.335905794691,180609.85216659011,34367.662700000001,30125.461050000002,25570.2399,3110.0909099999999,26677.436600000001,6.4139143260238232,7.3171077374593274,7.5216080270336745,5.1456167581270762,6.7701351848246967,True,True 10 | Japanese,10,4992,378,499.19999999999999,558,18134.214944105184,18792.600778070293,2486.7059549999999,16540.184582894799,7786.0123899999999,8632.5784600000006,4200.9365500000004,872.44017699999995,4230.3348150000002,2.3290760450620329,2.1006718940502034,4.4734312347731828,2.8502882152342695,3.9098996429895587,True,True 11 | l33t,487,2317,2,4.7577002053388089,14,284474.48849999998,267964.25300000003,33560.018600000003,268617.7745,42140.019650000002,37495.18535,32642.908800000001,3984.9862750000002,32958.36505,6.750696626692247,7.5869604549107805,8.2089575607918857,8.4216146014204298,8.1502154033578194,True,True 12 | S3,161,3582,8,22.248447204968944,43,109559.598,99783.478099999993,8019.4958800000004,96937.528099999996,19218.501141948658,16048.145968523408,12074.652835913561,1462.8362449864971,12026.096542032623,5.7007358269403117,6.8269317972860257,8.2638796705785733,5.4821555778952042,8.0605978640859863,True,True 13 | Words,104335,880750,1,8.4416393505472804,23,82684071.5,55425389.399999999,4880371.8799999999,54811311.600000001,10383361.395,9020746.9649999999,8160072.2599999998,1036582.8945000001,8245112.3300000001,7.9631314325451168,9.1659894486354219,6.7922669841651633,4.7081346855082611,6.6477337610753935,True,True 14 | 15 | -------------------------------------------------------------------------------- /report_m1.csv: 
-------------------------------------------------------------------------------- 1 | Corpus,Number of keys,Total bytes,Min key,Avg key,Max key,Build stdlib,Read stdlib,Delete stdlib,Read after delete stdlib,Build compact nc,Build compact,Read compact,Delete compact,Read after delete compact,Speedup build nc,Speedup build,Speedup read,Speadup delete,Speedup read after delete,Read Checksum,Read Checksum after delete 2 | Arabic,463,4223,2,9.1209503239740819,26,235932.5,223595.52401746725,22670.877683403218,218722.66157622033,37214.300000000003,34868.900000000001,28109.849999999999,3218.5900000000001,31558.549999999999,6.3398344184896658,6.7662730972299094,7.9543478182013514,7.0437296093641057,6.9306942675192724,True,True 3 | Chinese,10,4647,441,464.69999999999999,480,15724.874561912206,18064.8962184507,4231.6149999999998,19025.38622655475,5201.1899999999996,7042.125,3814.7350000000001,638.20950000000005,3478.0749999999998,3.0233224631117506,2.2329729395476798,4.7355573109143103,6.6304481522133409,5.4700908481141877,True,True 4 | English,999,4289,1,4.293293293293293,13,427551.5,420086.5,23105.549999999999,286848.5,62043.0,53854.050000000003,49204.099999999999,5579.9350000000004,48462.550000000003,6.8912125461373561,7.9390779337858524,8.5376320265994092,4.1408278053418179,5.9189724849394016,True,True 5 | French,471,2977,2,6.3205944798301488,19,283979.5,241187.0,12612.260450748878,191136.0681114551,39083.949999999997,27904.549999999999,25878.400000000001,2977.4000000000001,30242.049999999999,7.2658853570327464,10.176817042381979,9.3200120563867941,4.2359980018636652,6.3202087196950973,True,True 6 | Georgien,381,5982,6,15.700787401574804,42,244965.5,197522.29097255276,15711.83870835553,182804.96546900953,36506.699999999997,29836.299999999999,26632.0,2996.605,25659.75,6.7101518351425895,8.2103176332185956,7.4167276574253815,5.2432131389874632,7.124191212658328,True,True 7 | 
German,999,5644,2,5.6496496496496498,18,460852.5,419771.5,23377.549999999999,344666.5,61785.449999999997,60368.050000000003,47545.400000000001,5651.8900000000003,52190.25,7.4589162982546862,7.6340464865106634,8.8288562090128586,4.1362358432312023,6.6040400266333261,True,True 8 | Greek,452,4693,3,10.382743362831858,28,244522.5,221500.94654286487,24229.25,214683.0,38830.349999999999,31879.25,30672.950000000001,3340.5599999999999,28890.849999999999,6.2972005145459669,7.6702714147917552,7.2213773550592588,7.253050386761501,7.4308301763361069,True,True 9 | Hebrew,376,3346,2,8.8989361702127656,25,233578.5,168860.83752093802,12954.055595892542,151668.27673329864,29366.349999999999,23654.950000000001,21462.204182949532,2668.6300000000001,21630.732621223018,7.9539506952685644,9.874402609179052,7.867823643905508,4.8541969459582415,7.0117031812639183,True,True 10 | Hindi,450,8280,9,18.399999999999999,51,279719.5,261702.0,16873.475787181891,210054.84571925251,48077.199999999997,36368.800000000003,34604.75,3576.3200000000002,31252.849999999999,5.8181320875591753,7.6911941004377375,7.5626033998222786,4.7181112951810498,6.7211420948570302,True,True 11 | Japanese,10,4992,378,499.19999999999999,558,16569.322489095444,20352.976730129947,4614.2399999999998,24709.549999999999,6137.2650000000003,8562.8150000000005,3843.0749999999998,697.17399999999998,3811.1550000000002,2.6997893180586869,1.935032169805776,5.2960134085673447,6.6184912231379833,6.4834807295950965,True,True 12 | l33t,487,2317,2,4.7577002053388089,14,276193.0,237513.5,24396.650000000001,243392.5,34135.050000000003,29055.049999999999,27157.200000000001,2879.3049999999998,26537.799999999999,8.0911848671673265,9.5058518226607784,8.7458758634910811,8.4731037524680453,9.1715402181039885,True,True 13 | 
Russian,999,10636,2,10.646646646646646,37,604495.0,475855.5,39374.800000000003,414928.0,84918.649999999994,73036.600000000006,61565.75,7363.0749999999998,64405.650000000001,7.1185187235077345,8.2766037849516554,7.7292244470342686,5.3476027339121224,6.4424161544833405,True,True 14 | S3,161,3582,8,22.248447204968944,43,95685.949999999997,86654.449999999997,7334.6599999999999,79889.0,29397.849999999999,15670.69554658157,12936.078249315638,1437.7824488910567,12447.492445545658,3.2548621752951323,6.1060435840624239,6.6986646439452624,5.1013698252173887,6.4180798140261839,True,True 15 | Words,235977,2257909,1,9.5683840729565723,28,159431769.23076922,114977950.0,10390735.0,111888000.0,21706760.683760684,17649079.207920793,14882849.056603774,2961165.0,15481181.25,7.3447978513921583,9.0334326993794267,7.7255335697288627,3.5090023690000387,7.2273554706944614,True,True 16 | 17 | -------------------------------------------------------------------------------- /report_m1_new.csv: -------------------------------------------------------------------------------- 1 | Corpus,Number of keys,Total bytes,Min key,Avg key,Max key,Build stdlib,Read stdlib,Delete stdlib,Read after delete stdlib,Build compact nc,Build compact,Read compact,Delete compact,Read after delete compact,Speedup build nc,Speedup build,Speedup read,Speadup delete,Speedup read after delete,Read Checksum,Read Checksum after delete 2 | Arabic,463,4223,2,9.1209503239740819,26,77512.350000000006,61041.650000000001,6077.0900000000001,60665.75,36217.150000000001,35177.099999999999,29045.450000000001,3385.3800000000001,36741.949999999997,2.1402111982858947,2.2034889175059913,2.1015907827215625,1.7950983346035012,1.6511303836622715,True,True 3 | 
Chinese,10,4647,441,464.69999999999999,480,5740.8000000000002,8005.4849999999997,1132.5329999999999,7623.1450000000004,5382.9949999999999,5126.1850000000004,4430.7849999999999,869.87894017913936,4922.875,1.0664695025724527,1.1198971554869752,1.8067870591780013,1.3019432333501151,1.5485148414290431,True,True 4 | English,999,4289,1,4.293293293293293,13,121901.95,105672.3,12042.870000000001,103702.10000000001,65044.800000000003,50951.150000000001,48228.25,6271.0649999999996,58321.449999999997,1.874122912208201,2.3925259783145227,2.1910871740110829,1.9203867285700278,1.7781125126347168,True,True 5 | French,471,2977,2,6.3205944798301488,19,81442.050000000003,54487.150000000001,5469.7650000000003,52006.400000000001,34522.300000000003,27519.099999999999,30547.400000000001,3276.1149999999998,29836.650000000001,2.3591142536852989,2.9594736019709944,1.7836919017657797,1.6695888270100407,1.7430375058862171,True,True 6 | Georgien,381,5982,6,15.700787401574804,42,63962.949999999997,52573.150000000001,5155.1049999999996,52648.5,36022.5,28743.5,31056.650000000001,3157.625,29115.849999999999,1.7756388368380871,2.2253013724842137,1.6928145823841272,1.6325893670084317,1.8082419026063121,True,True 7 | German,999,5644,2,5.6496496496496498,18,124514.95,103500.25,12231.155000000001,104708.25,66339.25,53060.900000000001,52700.650000000001,5990.3900000000003,62306.199999999997,1.8769423832798831,2.3466422544660945,1.9639273898898779,2.0417961101030153,1.6805430278206663,True,True 8 | Greek,452,4693,3,10.382743362831858,28,72400.350000000006,54366.199999999997,6382.6350000000002,58440.0,38861.449999999997,31355.0,34619.099999999999,3638.8899999999999,32193.349999999999,1.8630377919506353,2.3090527826502951,1.570410553711679,1.7540060293111359,1.8152817274374988,True,True 9 | 
Hebrew,376,3346,2,8.8989361702127656,25,63110.25,47014.050000000003,4731.3149999999996,43050.599999999999,30170.349999999999,23344.54666629382,22286.694296287751,2655.5349999999999,23653.25,2.0917970789201981,2.7034258108392466,2.1095120422516427,1.7816805276526197,1.8200712375677761,True,True 10 | Hindi,450,8280,9,18.399999999999999,51,72564.25,59076.199999999997,5954.8900000000003,59433.400000000001,43256.699999999997,35778.050000000003,32692.049999999999,3832.1550000000002,37412.449999999997,1.6775262560481961,2.0281778911930637,1.807050949695721,1.5539272289351556,1.5885995170057026,True,True 11 | Japanese,10,4992,378,499.19999999999999,558,6018.9499999999998,7916.6400000000003,1192.5155,7461.915,5876.1800000000003,4899.4099999999999,4751.3050000000003,956.60199999999998,4724.46,1.0242963966386325,1.2285050648955689,1.6662032852026971,1.2466161475723447,1.5794217751870054,True,True 12 | l33t,487,2317,2,4.7577002053388089,14,67924.800000000003,53437.599999999999,6322.6049999999996,54617.75,35199.150000000001,28267.950000000001,26417.549999999999,3097.415,30473.0,1.9297284167373356,2.4028909064859674,2.0228068083527808,2.0412521408981359,1.7923325566895283,True,True 13 | Russian,999,10636,2,10.646646646646646,37,155766.14978969176,118214.14999999999,13390.757523271017,119861.84210526316,81256.649999999994,70636.649999999994,67257.149999999994,7817.6949999999997,71723.449999999997,1.9169649473574382,2.2051746478590331,1.7576443545407441,1.7128779676453247,1.6711667119367957,True,True 14 | S3,161,3582,8,22.248447204968944,43,27222.849999999999,20800.466086722419,2110.5573605021605,24579.299999999999,18197.410576168797,13987.306483529275,13269.252762453505,1511.4813406196954,13797.569452179918,1.4959738302355423,1.9462539147230535,1.5675687590773106,1.3963502583742444,1.7814224516272787,True,True 15 | 
# This code is based on https://github.com/tkaitchuck/aHash

from bit import rotate_bits_left, byte_swap

alias U256 = SIMD[DType.uint64, 4]
alias U128 = SIMD[DType.uint64, 2]
alias MULTIPLE = 6364136223846793005
alias ROT = 23


@always_inline
fn folded_multiply(s: UInt64, by: UInt64) -> UInt64:
    """Mixes `s` with `by` using two wrapping multiplications on
    byte-swapped operands and XORs the partial results together."""
    var b1 = s * byte_swap(by)
    var b2 = byte_swap(s) * (~by)
    return b1 ^ byte_swap(b2)


@always_inline
fn read_small(data: UnsafePointer[UInt8], length: Int) -> U128:
    """Reads 0 to 8 bytes starting at `data` into two 64-bit lanes.

    The two lanes are taken from the start and from the end of the input, so
    they overlap when the input is shorter than both reads together.

    Args:
        data: Pointer to the first byte of the input.
        length: Number of readable bytes; the branches below handle 0..8.
    """
    if length >= 2:
        if length >= 4:
            # len 4-8
            var a = data.bitcast[Scalar[DType.uint32]]().load().cast[DType.uint64]()
            var b = data.offset(length - 4).bitcast[Scalar[DType.uint32]]().load().cast[DType.uint64]()
            return U128(a, b)
        else:
            # len 2-3: first two bytes and the last byte
            var a = data.bitcast[Scalar[DType.uint16]]().load().cast[DType.uint64]()
            var b = data.offset(length - 1).load().cast[DType.uint64]()
            return U128(a, b)
    else:
        if length > 0:
            var a = data.load().cast[DType.uint64]()
            return U128(a, a)
        else:
            return U128(0, 0)

struct AHasher:
    """State of the hasher: a running `buffer`, a `pad` and two extra keys."""

    var buffer: UInt64
    var pad: UInt64
    var extra_keys: U128

    fn __init__(out self, key: U256):
        """Initialises the state from `key` XORed with fixed constants."""
        var pi_key = key ^ U256(0x243f_6a88_85a3_08d3, 0x1319_8a2e_0370_7344, 0xa409_3822_299f_31d0, 0x082e_fa98_ec4e_6c89,)
        self.buffer = pi_key[0]
        self.pad = pi_key[1]
        self.extra_keys = U128(pi_key[2], pi_key[3])

    @always_inline
    fn update(mut self, new_data: UInt64):
        """Folds one 64-bit word into the buffer."""
        self.buffer = folded_multiply(new_data ^ self.buffer, MULTIPLE)

    @always_inline
    fn large_update(mut self, new_data: U128):
        """Folds a 128-bit block, keyed with `extra_keys`, into the buffer."""
        var combined = folded_multiply(
            new_data[0] ^ self.extra_keys[0], new_data[1] ^ self.extra_keys[1]
        )
        self.buffer = rotate_bits_left[ROT]((self.buffer + self.pad) ^ combined)

    @always_inline
    fn short_finish(self) -> UInt64:
        """Returns `buffer + pad` without further mixing."""
        return self.buffer + self.pad

    @always_inline
    fn finish(self) -> UInt64:
        """Returns the final hash: `folded_multiply(buffer, pad)` rotated
        left by the low six bits of `buffer`."""
        var rot = self.buffer & 63
        var folded = folded_multiply(self.buffer, self.pad)
        # The right-shift amount is masked so that rot == 0 shifts by 0
        # instead of by the full width of 64 bits, which is not a defined
        # shift. For rot in 1..63 the result is unchanged.
        return (folded << rot) | (folded >> ((64 - rot) & 63))

    @always_inline
    fn write(mut self, data: UnsafePointer[UInt8], length: Int):
        """Feeds `length` bytes starting at `data` into the hasher."""
        self.buffer = (self.buffer + length) * MULTIPLE
        if length > 8:
            if length > 16:
                # The last 16 bytes go in first, then the input is consumed
                # front to back in 16-byte blocks.
                var tail = data.offset(length - 16).bitcast[Scalar[DType.uint64]]().load[width=2]()
                self.large_update(tail)
                var offset = 0
                while length - offset > 16:
                    var block = data.offset(offset).bitcast[Scalar[DType.uint64]]().load[width=2]()
                    self.large_update(block)
                    offset += 16
            else:
                # len 9-16: first and last 8 bytes (they may overlap)
                var a = data.bitcast[Scalar[DType.uint64]]().load()
                var b = data.offset(length - 8).bitcast[Scalar[DType.uint64]]().load()
                self.large_update(U128(a, b))
        else:
            var value = read_small(data, length)
            self.large_update(value)

@always_inline
fn ahash(s: String) -> UInt64:
    """Hashes the bytes of `s` with an all-zero key.

    Inputs of up to 8 bytes take a shortcut that folds the bytes directly
    into the state; longer inputs go through `AHasher.write`.
    """
    var length = len(s)
    var b = s.unsafe_ptr()
    var hasher = AHasher(U256(0, 0, 0, 0))

    if length > 8:
        hasher.write(b, length)
    else:
        var value = read_small(b, length)
        hasher.buffer = folded_multiply(value[0] ^ hasher.buffer, value[1] ^ hasher.extra_keys[1])
        hasher.pad = hasher.pad + length

    return hasher.finish()
hasher.pad = hasher.pad + length 101 | 102 | return hasher.finish() 103 | -------------------------------------------------------------------------------- /string_dict/dict.mojo: -------------------------------------------------------------------------------- 1 | from bit import pop_count, bit_width 2 | from memory import memset_zero, memcpy 3 | from collections import List 4 | from .string_eq import eq 5 | from .keys_container import KeysContainer 6 | from .ahasher import ahash 7 | 8 | struct Dict[ 9 | V: Copyable & Movable, 10 | hash: fn(String) -> UInt64 = ahash, 11 | KeyCountType: DType = DType.uint32, 12 | KeyOffsetType: DType = DType.uint32, 13 | destructive: Bool = True, 14 | caching_hashes: Bool = True, 15 | ](Sized): 16 | var keys: KeysContainer[KeyOffsetType] 17 | var key_hashes: UnsafePointer[Scalar[KeyCountType]] 18 | var values: List[V] 19 | var slot_to_index: UnsafePointer[Scalar[KeyCountType]] 20 | var deleted_mask: UnsafePointer[UInt8] 21 | var count: Int 22 | var capacity: Int 23 | 24 | fn __init__(out self, capacity: Int = 16): 25 | constrained[ 26 | KeyCountType == DType.uint8 or 27 | KeyCountType == DType.uint16 or 28 | KeyCountType == DType.uint32 or 29 | KeyCountType == DType.uint64, 30 | "KeyCountType needs to be an unsigned integer" 31 | ]() 32 | self.count = 0 33 | if capacity <= 8: 34 | self.capacity = 8 35 | else: 36 | var icapacity = Int64(capacity) 37 | self.capacity = capacity if pop_count(icapacity) == 1 else 38 | 1 << Int(bit_width(icapacity)) 39 | self.keys = KeysContainer[KeyOffsetType](capacity) 40 | @parameter 41 | if caching_hashes: 42 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 43 | else: 44 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(0) 45 | self.values = List[V](capacity=capacity) 46 | self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 47 | memset_zero(self.slot_to_index, self.capacity) 48 | @parameter 49 | if destructive: 50 | self.deleted_mask = 
UnsafePointer[UInt8].alloc(self.capacity >> 3) 51 | memset_zero(self.deleted_mask, self.capacity >> 3) 52 | else: 53 | self.deleted_mask = UnsafePointer[UInt8].alloc(0) 54 | 55 | fn __copyinit__(out self, existing: Self): 56 | self.count = existing.count 57 | self.capacity = existing.capacity 58 | self.keys = existing.keys 59 | @parameter 60 | if caching_hashes: 61 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 62 | memcpy(self.key_hashes, existing.key_hashes, self.capacity) 63 | else: 64 | self.key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(0) 65 | self.values = existing.values 66 | self.slot_to_index = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 67 | memcpy(self.slot_to_index, existing.slot_to_index, self.capacity) 68 | @parameter 69 | if destructive: 70 | self.deleted_mask = UnsafePointer[UInt8].alloc(self.capacity >> 3) 71 | memcpy(self.deleted_mask, existing.deleted_mask, self.capacity >> 3) 72 | else: 73 | self.deleted_mask = UnsafePointer[UInt8].alloc(0) 74 | 75 | fn __moveinit__(out self, owned existing: Self): 76 | self.count = existing.count 77 | self.capacity = existing.capacity 78 | self.keys = existing.keys^ 79 | self.key_hashes = existing.key_hashes 80 | self.values = existing.values^ 81 | self.slot_to_index = existing.slot_to_index 82 | self.deleted_mask = existing.deleted_mask 83 | 84 | fn __del__(owned self): 85 | self.slot_to_index.free() 86 | self.deleted_mask.free() 87 | self.key_hashes.free() 88 | 89 | fn __len__(self) -> Int: 90 | return self.count 91 | 92 | @always_inline 93 | fn __contains__( self, key: String) -> Bool: 94 | return self._find_key_index(key) != 0 95 | 96 | fn put(mut self, key: String, value: V): 97 | if self.count / self.capacity >= 0.87: 98 | self._rehash() 99 | 100 | var key_hash = hash(key).cast[KeyCountType]() 101 | var modulo_mask = self.capacity - 1 102 | var slot = Int(key_hash & modulo_mask) 103 | while True: 104 | var key_index = Int(self.slot_to_index.load(slot)) 
105 | if key_index == 0: 106 | self.keys.add(key) 107 | @parameter 108 | if caching_hashes: 109 | self.key_hashes.store(slot, key_hash) 110 | self.values.append(value) 111 | self.count += 1 112 | self.slot_to_index.store(slot, SIMD[KeyCountType, 1](self.keys.count)) 113 | return 114 | @parameter 115 | if caching_hashes: 116 | var other_key_hash = self.key_hashes[slot] 117 | if other_key_hash == key_hash: 118 | var other_key = self.keys[key_index - 1] 119 | if eq(other_key, key): 120 | self.values[key_index - 1] = value # replace value 121 | @parameter 122 | if destructive: 123 | if self._is_deleted(key_index - 1): 124 | self.count += 1 125 | self._not_deleted(key_index - 1) 126 | return 127 | else: 128 | var other_key = self.keys[key_index - 1] 129 | if eq(other_key, key): 130 | self.values[key_index - 1] = value # replace value 131 | @parameter 132 | if destructive: 133 | if self._is_deleted(key_index - 1): 134 | self.count += 1 135 | self._not_deleted(key_index - 1) 136 | return 137 | 138 | slot = (slot + 1) & modulo_mask 139 | 140 | @always_inline 141 | fn _is_deleted(self, index: Int) -> Bool: 142 | var offset = index >> 3 143 | var bit_index = index & 7 144 | return self.deleted_mask.offset(offset).load() & (1 << bit_index) != 0 145 | 146 | @always_inline 147 | fn _deleted(self, index: Int): 148 | var offset = index >> 3 149 | var bit_index = index & 7 150 | var p = self.deleted_mask.offset(offset) 151 | var mask = p.load() 152 | p.store(mask | (1 << bit_index)) 153 | 154 | @always_inline 155 | fn _not_deleted(self, index: Int): 156 | var offset = index >> 3 157 | var bit_index = index & 7 158 | var p = self.deleted_mask.offset(offset) 159 | var mask = p.load() 160 | p.store(mask & ~(1 << bit_index)) 161 | 162 | @always_inline 163 | fn _rehash(mut self): 164 | var old_slot_to_index = self.slot_to_index 165 | var old_capacity = self.capacity 166 | self.capacity <<= 1 167 | var mask_capacity = self.capacity >> 3 168 | self.slot_to_index = 
UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 169 | memset_zero(self.slot_to_index, self.capacity) 170 | 171 | var key_hashes = self.key_hashes 172 | @parameter 173 | if caching_hashes: 174 | key_hashes = UnsafePointer[Scalar[KeyCountType]].alloc(self.capacity) 175 | 176 | @parameter 177 | if destructive: 178 | var deleted_mask = UnsafePointer[UInt8].alloc(mask_capacity) 179 | memset_zero(deleted_mask, mask_capacity) 180 | memcpy(deleted_mask, self.deleted_mask, old_capacity >> 3) 181 | self.deleted_mask.free() 182 | self.deleted_mask = deleted_mask 183 | 184 | var modulo_mask = self.capacity - 1 185 | for i in range(old_capacity): 186 | if old_slot_to_index[i] == 0: 187 | continue 188 | var key_hash = SIMD[KeyCountType, 1](0) 189 | @parameter 190 | if caching_hashes: 191 | key_hash = self.key_hashes[i] 192 | else: 193 | key_hash = hash(self.keys[Int(old_slot_to_index[i] - 1)]).cast[KeyCountType]() 194 | 195 | var slot = Int(key_hash & modulo_mask) 196 | 197 | # var searching = True 198 | while True: 199 | var key_index = Int(self.slot_to_index.load(slot)) 200 | 201 | if key_index == 0: 202 | self.slot_to_index.store(slot, old_slot_to_index[i]) 203 | break 204 | # searching = False 205 | 206 | else: 207 | slot = (slot + 1) & modulo_mask 208 | @parameter 209 | if caching_hashes: 210 | key_hashes[slot] = key_hash 211 | 212 | @parameter 213 | if caching_hashes: 214 | self.key_hashes.free() 215 | self.key_hashes = key_hashes 216 | old_slot_to_index.free() 217 | 218 | fn get(self, key: String, default: V) -> V: 219 | var key_index = self._find_key_index(key) 220 | if key_index == 0: 221 | return default 222 | 223 | @parameter 224 | if destructive: 225 | if self._is_deleted(key_index - 1): 226 | return default 227 | return self.values[key_index - 1] 228 | 229 | fn delete(mut self, key: String): 230 | @parameter 231 | if not destructive: 232 | return 233 | 234 | var key_index = self._find_key_index(key) 235 | if key_index == 0: 236 | return 237 | if not 
self._is_deleted(key_index - 1): 238 | self.count -= 1 239 | self._deleted(key_index - 1) 240 | 241 | fn upsert(mut self, key: String, update: fn(value: Optional[V]) -> V): 242 | var key_index = self._find_key_index(key) 243 | if key_index == 0: 244 | var value = update(None) 245 | self.put(key, value) 246 | else: 247 | key_index -= 1 248 | @parameter 249 | if destructive: 250 | if self._is_deleted(key_index): 251 | self.values[key_index] = update(None) 252 | self._not_deleted(key_index) # revive the tombstoned key, mirroring put() 253 | self.count += 1 254 | return 255 | self.values[key_index] = update(self.values[key_index]) 256 | 257 | fn clear(mut self): 258 | self.values.clear() 259 | self.keys.clear() 260 | memset_zero(self.slot_to_index, self.capacity) 261 | @parameter 262 | if destructive: 263 | memset_zero(self.deleted_mask, self.capacity >> 3) 264 | self.count = 0 265 | 266 | @always_inline 267 | fn _find_key_index(self, key: String) -> Int: 268 | var key_hash = hash(key).cast[KeyCountType]() 269 | var modulo_mask = self.capacity - 1 270 | 271 | var slot = Int(key_hash & modulo_mask) 272 | while True: 273 | var key_index = Int(self.slot_to_index.load(slot)) 274 | if key_index == 0: 275 | return key_index 276 | 277 | @parameter 278 | if caching_hashes: 279 | var other_key_hash = self.key_hashes[slot] 280 | if key_hash == other_key_hash: 281 | var other_key = self.keys[key_index - 1] 282 | if eq(other_key, key): 283 | return key_index 284 | else: 285 | var other_key = self.keys[key_index - 1] 286 | if eq(other_key, key): 287 | return key_index 288 | 289 | slot = (slot + 1) & modulo_mask 290 | 291 | fn debug(self): 292 | print("Dict count:", self.count, "and capacity:", self.capacity) 293 | print("KeyMap:") 294 | for i in range(self.capacity): 295 | var end = ", " if i < self.capacity - 1 else "\n" 296 | print(self.slot_to_index.load(i), end=end) 297 | print("Keys:") 298 | self.keys.print_keys() 299 | @parameter 300 | if caching_hashes: 301 | print("KeyHashes:") 302 | for i in range(self.capacity): 303 | var end = ", " if i < self.capacity - 1
else "\n" 304 | if self.slot_to_index.load(i) > 0: 305 | print(self.key_hashes.load(i), end=end) 306 | else: 307 | print(0, end=end) 308 | -------------------------------------------------------------------------------- /string_dict/keys_container.mojo: -------------------------------------------------------------------------------- 1 | from collections.vector import InlinedFixedVector 2 | from memory import memcpy 3 | 4 | struct KeysContainer[KeyEndType: DType = DType.uint32](Sized): 5 | var keys: UnsafePointer[UInt8] 6 | var allocated_bytes: Int 7 | var keys_end: UnsafePointer[Scalar[KeyEndType]] 8 | var count: Int 9 | var capacity: Int 10 | 11 | fn __init__(out self, capacity: Int): 12 | constrained[ 13 | KeyEndType == DType.uint8 or 14 | KeyEndType == DType.uint16 or 15 | KeyEndType == DType.uint32 or 16 | KeyEndType == DType.uint64, 17 | "KeyEndType needs to be an unsigned integer" 18 | ]() 19 | self.allocated_bytes = capacity << 3 20 | self.keys = UnsafePointer[UInt8].alloc(self.allocated_bytes) 21 | self.keys_end = UnsafePointer[Scalar[KeyEndType]].alloc(capacity) 22 | self.count = 0 23 | self.capacity = capacity 24 | 25 | fn __copyinit__(out self, existing: Self): 26 | self.allocated_bytes = existing.allocated_bytes 27 | self.count = existing.count 28 | self.capacity = existing.capacity 29 | self.keys = UnsafePointer[UInt8].alloc(self.allocated_bytes) 30 | memcpy(self.keys, existing.keys, self.allocated_bytes) 31 | self.keys_end = UnsafePointer[Scalar[KeyEndType]].alloc(self.capacity) # one end-offset slot per key, not per byte 32 | memcpy(self.keys_end, existing.keys_end, self.count) 33 | 34 | fn __moveinit__(out self, owned existing: Self): 35 | self.allocated_bytes = existing.allocated_bytes 36 | self.count = existing.count 37 | self.capacity = existing.capacity 38 | self.keys = existing.keys 39 | self.keys_end = existing.keys_end 40 | 41 | fn __del__(owned self): 42 | self.keys.free() 43 | self.keys_end.free() 44 | 45 | @always_inline 46 | fn add(mut self, key: String): 47 | var
prev_end = 0 if self.count == 0 else self.keys_end[self.count - 1] 48 | var key_length = len(key) 49 | var new_end = prev_end + key_length 50 | 51 | var needs_realocation = False 52 | while new_end > self.allocated_bytes: 53 | self.allocated_bytes += self.allocated_bytes >> 1 54 | needs_realocation = True 55 | 56 | if needs_realocation: 57 | var keys = UnsafePointer[UInt8].alloc(self.allocated_bytes) 58 | memcpy(keys, self.keys, Int(prev_end)) 59 | self.keys.free() 60 | self.keys = keys 61 | 62 | memcpy(self.keys.offset(prev_end), UnsafePointer(key.unsafe_ptr()), key_length) 63 | var count = self.count + 1 64 | if count >= self.capacity: 65 | var new_capacity = self.capacity + (self.capacity >> 1) + 1 # +1 so a capacity of 1 still grows 66 | var keys_end = UnsafePointer[Scalar[KeyEndType]].alloc(new_capacity) 67 | memcpy(keys_end, self.keys_end, self.count) 68 | self.keys_end.free() 69 | self.keys_end = keys_end 70 | self.capacity = new_capacity 71 | 72 | self.keys_end.store(self.count, new_end) 73 | self.count = count 74 | 75 | 76 | @always_inline 77 | fn get(self, index: Int) -> StringSlice[StaticConstantOrigin]: 78 | if index < 0 or index >= self.count: 79 | return "" 80 | var start = 0 if index == 0 else Int(self.keys_end[index - 1]) 81 | var length = Int(self.keys_end[index]) - start 82 | return StringSlice[StaticConstantOrigin](ptr=self.keys.offset(start), length=length) 83 | 84 | @always_inline 85 | fn clear(mut self): 86 | self.count = 0 87 | 88 | @always_inline 89 | fn __getitem__(self, index: Int) -> StringSlice[StaticConstantOrigin]: 90 | return self.get(index) 91 | 92 | @always_inline 93 | fn __len__(self) -> Int: 94 | return self.count 95 | 96 | fn keys_vec(self) -> InlinedFixedVector[StringSlice[StaticConstantOrigin]]: 97 | var keys = InlinedFixedVector[StringSlice[StaticConstantOrigin]](self.count) 98 | for i in range(self.count): 99 | keys.append(self[i]) 100 | return keys 101 | 102 | fn print_keys(self): 103 | print("(" + str(self.count) + ")[", end="") 104 | for i in
range(self.count): 105 | var end = ", " if i < self.count - 1 else "" 106 | print(self[i], end=end) 107 | print("]") 108 | -------------------------------------------------------------------------------- /string_dict/string_eq.mojo: -------------------------------------------------------------------------------- 1 | @always_inline 2 | fn eq(a: String, b: String) -> Bool: 3 | var l = len(a) 4 | if l != len(b): 5 | return False 6 | var p1 = UnsafePointer(a.unsafe_ptr()) 7 | var p2 = UnsafePointer(b.unsafe_ptr()) 8 | var offset = 0 9 | alias step = 16 10 | while l - offset >= step and (p1.load[width=step](offset) == p2.load[width=step](offset)).reduce_and(): 11 | offset += step 12 | if l - offset >= step: 13 | return False 14 | while l - offset > 0 and p1.load(offset) == p2.load(offset): 15 | offset += 1 16 | return l - offset == 0 17 | -------------------------------------------------------------------------------- /test_generic_dict.mojo: -------------------------------------------------------------------------------- 1 | from generic_dict import Dict, Keyable, KeysBuilder 2 | from testing import assert_equal 3 | 4 | from corpora import * 5 | 6 | @fieldwise_init 7 | struct Person(Keyable, Copyable, Movable): 8 | var name: String 9 | var age: Int 10 | 11 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 12 | keys_builder.add_buffer[DType.uint8](self.name.unsafe_ptr(), len(self.name)) 13 | keys_builder.add(Int64(self.age)) 14 | 15 | fn test_person_dict() raises: 16 | var p1 = Person("Maxim", 42) 17 | var p2 = Person("Maximilian", 62) 18 | var p3 = Person("Alex", 25) 19 | var p4 = Person("Maria", 28) 20 | var p5 = Person("Daria", 13) 21 | var p6 = Person("Max", 31) 22 | 23 | var d = Dict[Int]() 24 | _= d.put(p1, 1) 25 | _= d.put(p2, 11) 26 | _= d.put(p3, 111) 27 | _= d.put(p4, 1111) 28 | _= d.put(p5, 11111) 29 | _= d.put(p6, 111111) 30 | 31 | assert_equal(d.get(p1, 0), 1) 32 | # assert_equal(d.get(p2, 0), 11) 33 | # assert_equal(d.get(p3, 0), 111) 34 | #
assert_equal(d.get(p4, 0), 1111) 35 | # assert_equal(d.get(p5, 0), 11111) 36 | # assert_equal(d.get(p6, 0), 111111) 37 | 38 | struct StringKey(Keyable, Copyable, Movable): 39 | var s: String 40 | 41 | fn __init__(out self, owned s: String): 42 | self.s = s^ 43 | 44 | fn __init__(out self, s: StringLiteral): 45 | self.s = String(s) 46 | 47 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 48 | alias type_prefix = "String:" 49 | keys_builder.add_buffer(type_prefix.unsafe_ptr(), len(type_prefix)) 50 | keys_builder.add_buffer(self.s.unsafe_ptr(), len(self.s)) 51 | 52 | struct IntKey(Keyable, Copyable, Movable): 53 | var i: Int 54 | 55 | fn __init__(out self, i: Int): 56 | self.i = i 57 | 58 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 59 | alias type_prefix = "Int:" 60 | keys_builder.add_buffer(type_prefix.unsafe_ptr(), len(type_prefix)) 61 | keys_builder.add(Int64(self.i)) 62 | 63 | fn test_add_vs_update() raises: 64 | var d = Dict[Int]() 65 | assert_equal(d.put(StringKey("a"), 1), True) 66 | assert_equal(d.put(StringKey("a"), 2), False) 67 | d.delete(StringKey("a")) 68 | assert_equal(d.put(StringKey("a"), 3), True) 69 | assert_equal(d.put(StringKey("a"), 4), False) 70 | assert_equal(d.get(StringKey("a"), 0), 4) 71 | 72 | fn test_clear() raises: 73 | var d = Dict[Int]() 74 | assert_equal(d.put(StringKey("a"), 1), True) 75 | assert_equal(d.put(StringKey("b"), 1), True) 76 | assert_equal(d.put(StringKey("a"), 2), False) 77 | assert_equal(d.get(StringKey("a"), 0), 2) 78 | d.clear() 79 | assert_equal(d.put(StringKey("a"), 3), True) 80 | assert_equal(d.get(StringKey("a"), 0), 3) 81 | assert_equal(d.get(StringKey("b"), 0), 0) 82 | 83 | fn test_no_key_collision() raises: 84 | var d = Dict[Int]() 85 | assert_equal(d.put(StringKey("a"), 1), True) 86 | assert_equal(d.put(IntKey(97), 2), True) 87 | assert_equal(d.get(StringKey("a"), 0), 1) 88 | assert_equal(d.get(IntKey(97), 0), 2) 89 | 90 | 91 | fn main() raises: 92 | test_person_dict() 93 | 
test_add_vs_update() 94 | test_clear() 95 | test_no_key_collision() 96 | -------------------------------------------------------------------------------- /test_multi_dict.mojo: -------------------------------------------------------------------------------- 1 | from generic_dict import MultiDict, Keyable, KeysBuilder 2 | from testing import assert_equal 3 | 4 | from corpora import * 5 | 6 | struct StringKey(Keyable, Copyable, Movable): 7 | var s: String 8 | 9 | fn __init__(out self, owned s: String): 10 | self.s = s^ 11 | 12 | fn __init__(out self, s: StringLiteral): 13 | self.s = String(s) 14 | 15 | fn accept[T: KeysBuilder](self, mut keys_builder: T): 16 | keys_builder.add_buffer(self.s.unsafe_ptr(), len(self.s)) 17 | 18 | fn test_add() raises: 19 | var d = MultiDict[Int]() 20 | d.put(StringKey("a"), 1) 21 | d.put(StringKey("b"), 2) 22 | d.put(StringKey("c"), 3) 23 | d.put(StringKey("a"), 4) 24 | d.put(StringKey("a"), 5) 25 | d.put(StringKey("a"), 6) 26 | d.put(StringKey("c"), 7) 27 | 28 | assert_equal(len(d.get(StringKey("a"))), 4) 29 | assert_equal(d.get(StringKey("a"))[0], 1) 30 | assert_equal(d.get(StringKey("a"))[1], 4) 31 | assert_equal(d.get(StringKey("a"))[2], 5) 32 | assert_equal(d.get(StringKey("a"))[3], 6) 33 | assert_equal(len(d.get(StringKey("b"))), 1) 34 | assert_equal(d.get(StringKey("b"))[0], 2) 35 | assert_equal(len(d.get(StringKey("c"))), 2) 36 | assert_equal(d.get(StringKey("c"))[0], 3) 37 | assert_equal(d.get(StringKey("c"))[1], 7) 38 | 39 | fn test_s3_corpus() raises: 40 | var d = MultiDict[ 41 | Int, 42 | KeyCountType=DType.uint8, 43 | KeyOffsetType=DType.uint16, 44 | NextKeyCountType=DType.uint8 45 | ]() 46 | var corpus = s3_action_names() 47 | for i in range(len(corpus)): 48 | d.put(StringKey(corpus[i]), i) 49 | 50 | assert_equal(len(d), 143) 51 | 52 | var all_values = 0 53 | for i in range(len(corpus)): 54 | var v = d.get(StringKey(corpus[i])) 55 | var c = len(v) 56 | all_values += c 57 | 58 | assert_equal(all_values, 143 + (len(corpus) - 
143) * 3) 59 | _ = d 60 | 61 | fn test_system_corpus() raises: 62 | var d = MultiDict[Int]() 63 | var corpus = system_words_collection() 64 | for i in range(len(corpus)): 65 | d.put(StringKey(corpus[i]), i) 66 | 67 | assert_equal(len(d), len(corpus)) 68 | 69 | var all_values = 0 70 | for i in range(len(corpus)): 71 | var v = d.get(StringKey(corpus[i])) 72 | var c = len(v) 73 | all_values += c 74 | 75 | assert_equal(all_values, len(corpus)) 76 | _ = d 77 | 78 | fn test_english_corpus() raises: 79 | var d = MultiDict[ 80 | Int, 81 | KeyCountType=DType.uint16, 82 | KeyOffsetType=DType.uint16, 83 | NextKeyCountType=DType.uint16 84 | ]() 85 | var corpus = english_text_to_keys() 86 | for i in range(len(corpus)): 87 | d.put(StringKey(corpus[i]), i) 88 | assert_equal(len(d), 192) 89 | 90 | var all_values = 0 91 | for i in range(len(corpus)): 92 | var v = d.get(StringKey(corpus[i])) 93 | var c = len(v) 94 | all_values += c 95 | 96 | assert_equal(all_values, 18631) 97 | 98 | var the_occurances = 0 99 | for i in range(len(corpus)): 100 | if corpus[i] == "the": 101 | the_occurances += 1 102 | assert_equal(len(d.get(StringKey("the"))), the_occurances) 103 | _ = d 104 | 105 | fn test_get_itter() raises: 106 | var d = MultiDict[Int]() 107 | d.put(StringKey("a"), 1) 108 | d.put(StringKey("b"), 2) 109 | d.put(StringKey("c"), 3) 110 | d.put(StringKey("a"), 4) 111 | d.put(StringKey("a"), 5) 112 | d.put(StringKey("a"), 6) 113 | d.put(StringKey("c"), 7) 114 | 115 | var index_a = 0 116 | var expected_a = List[Int](1, 4, 5, 6) 117 | for v in d.get_itter(StringKey("a")): 118 | assert_equal(expected_a[index_a], v) 119 | index_a += 1 120 | 121 | assert_equal(index_a, 4) 122 | 123 | var index_b = 0 124 | var expected_b = List[Int](2) 125 | for v in d.get_itter(StringKey("b")): 126 | assert_equal(expected_b[index_b], v) 127 | index_b += 1 128 | assert_equal(index_b, 1) 129 | 130 | var index_c = 0 131 | var expected_c = List[Int](3, 7) 132 | for v in d.get_itter(StringKey("c")): 133 | 
assert_equal(expected_c[index_c], v) 134 | index_c += 1 135 | assert_equal(index_c, 2) 136 | 137 | var index_d = 0 138 | var expected_d = List[Int](2) 139 | for v in d.get_itter(StringKey("d")): 140 | print(v) 141 | assert_equal(expected_d[index_d], v) 142 | index_d += 1 143 | assert_equal(index_d, 0) 144 | 145 | fn main()raises: 146 | test_add() 147 | test_s3_corpus() 148 | test_system_corpus() 149 | test_english_corpus() 150 | test_get_itter() 151 | -------------------------------------------------------------------------------- /test_sparse_array.mojo: -------------------------------------------------------------------------------- 1 | from generic_dict import SparseArray 2 | from testing import assert_equal, assert_true 3 | 4 | 5 | fn assert_equal_list[T: DType](lhs: List[Scalar[T]], rhs: List[Scalar[T]]) raises: 6 | assert_equal(len(lhs), len(rhs)) 7 | for i in range(len(lhs)): 8 | assert_true(lhs[i] == rhs[i]) 9 | 10 | 11 | fn main() raises: 12 | var a = SparseArray[DType.int64](25) 13 | assert_equal(len(a.dense_values_list()), 0) 14 | a[23] = 15 15 | assert_equal(a.get(23).or_else(0), 15) 16 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](15)) 17 | a[1] = 45 18 | assert_equal(a.get(1).or_else(0), 45) 19 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](45, 15)) 20 | a[13] = 1 21 | assert_equal(a.get(13).or_else(0), 1) 22 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](45, 1, 15)) 23 | a[24] = 11 24 | assert_equal(a.get(24).or_else(0), 11) 25 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](45, 1, 15, 11)) 26 | a[2] = 0 27 | assert_equal(a.get(2).or_else(0), 0) 28 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](45, 0, 1, 15, 11)) 29 | a[53] = 5 30 | assert_equal(a.get(53).or_else(0), 5) 31 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](45, 0, 1, 15, 11, 5)) 32 | a[0] = 33 33 | assert_equal(a.get(0).or_else(0), 33) 34 | 
assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](33, 45, 0, 1, 15, 11, 5)) 35 | a[53] = 49 36 | assert_equal(a.get(53).or_else(0), 49) 37 | assert_equal_list[DType.int64](a.dense_values_list(), List[Int64](33, 45, 0, 1, 15, 11, 49)) 38 | -------------------------------------------------------------------------------- /test_string_dict.mojo: -------------------------------------------------------------------------------- 1 | from string_dict import Dict 2 | from testing import assert_equal 3 | 4 | from corpora import * 5 | 6 | fn test_simple_manipulations() raises: 7 | var d = Dict[Int, KeyCountType=DType.uint8, KeyOffsetType=DType.uint16]() 8 | var corpus = s3_action_names() 9 | for i in range(len(corpus)): 10 | d.put(corpus[i], i) 11 | 12 | assert_equal(len(d), 143) 13 | assert_equal(d.get("CopyObject", -1), 2) 14 | 15 | d.delete("CopyObject") 16 | assert_equal(d.get("CopyObject", -1), -1) 17 | assert_equal(len(d), 142) 18 | 19 | d.put("CopyObjects", 256) 20 | assert_equal(d.get("CopyObjects", -1), 256) 21 | assert_equal(d.get("CopyObject", -1), -1) 22 | assert_equal(len(d), 143) 23 | 24 | d.put("CopyObject", 257) 25 | assert_equal(d.get("CopyObject", -1), 257) 26 | assert_equal(len(d), 144) 27 | 28 | _ = d 29 | 30 | fn test_simple_manipulations_on_non_destructive() raises: 31 | var d = Dict[Int, KeyCountType=DType.uint8, KeyOffsetType=DType.uint16, destructive=False]() 32 | var corpus = s3_action_names() 33 | for i in range(len(corpus)): 34 | d.put(corpus[i], i) 35 | 36 | assert_equal(len(d), 143) 37 | assert_equal(d.get("CopyObject", -1), 2) 38 | 39 | d.delete("CopyObject") 40 | assert_equal(d.get("CopyObject", -1), 2) 41 | assert_equal(len(d), 143) 42 | 43 | d.put("CopyObjects", 256) 44 | assert_equal(d.get("CopyObjects", -1), 256) 45 | assert_equal(d.get("CopyObject", -1), 2) 46 | assert_equal(len(d), 144) 47 | 48 | d.put("CopyObject", 257) 49 | assert_equal(d.get("CopyObject", -1), 257) 50 | assert_equal(len(d), 144) 51 | 52 | fn 
test_simple_manipulations_non_caching() raises: 53 | var d = Dict[ 54 | Int, 55 | KeyCountType=DType.uint8, 56 | KeyOffsetType=DType.uint16, 57 | caching_hashes=False 58 | ]() 59 | var corpus = s3_action_names() 60 | for i in range(len(corpus)): 61 | d.put(corpus[i], i) 62 | assert_equal(len(d), 143) 63 | assert_equal(d.get("CopyObject", -1), 2) 64 | 65 | d.delete("CopyObject") 66 | assert_equal(d.get("CopyObject", -1), -1) 67 | assert_equal(len(d), 142) 68 | 69 | d.put("CopyObjects", 256) 70 | assert_equal(d.get("CopyObjects", -1), 256) 71 | assert_equal(d.get("CopyObject", -1), -1) 72 | assert_equal(len(d), 143) 73 | 74 | d.put("CopyObject", 257) 75 | assert_equal(d.get("CopyObject", -1), 257) 76 | assert_equal(len(d), 144) 77 | 78 | _ = d 79 | 80 | @fieldwise_init 81 | struct MyInt(Copyable, Movable): 82 | var value: Int 83 | 84 | fn test_upsert() raises: 85 | var d1 = Dict[MyInt, KeyCountType=DType.uint8, KeyOffsetType=DType.uint16]() 86 | var corpus = s3_action_names() 87 | 88 | fn inc(value: Optional[MyInt]) -> MyInt: 89 | return MyInt(value.or_else(MyInt(0)).value + 1) 90 | 91 | for i in range(len(corpus)): 92 | d1.upsert(corpus[i], inc) 93 | 94 | # Does not work probably because of Int is a register passable type 95 | # var d2 = Dict[Int, KeyCountType=DType.uint8, KeyOffsetType=DType.uint16]() 96 | 97 | # fn inc2(value: Optional[Int]) -> Int: 98 | # return value.or_else(0) + 1 99 | 100 | # for i in range(len(corpus)): 101 | # d2.upsert(corpus[i], inc2) 102 | 103 | fn test_clear() raises: 104 | var d = Dict[Int]() 105 | d.put("a", 1) 106 | d.put("b", 1) 107 | assert_equal(d.get("a", 0), 1) 108 | assert_equal(d.get("b", 0), 1) 109 | d.clear() 110 | d.put("a", 2) 111 | assert_equal(d.get("a", 0), 2) 112 | assert_equal(d.get("b", 0), 0) 113 | 114 | 115 | fn main()raises: 116 | test_simple_manipulations() 117 | test_simple_manipulations_on_non_destructive() 118 | test_simple_manipulations_non_caching() 119 | test_upsert() 120 | test_clear() 121 | 
--------------------------------------------------------------------------------