├── .clang-format
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── include
│   ├── adjusting
│   │   ├── adjusting.hpp
│   │   └── adjusting_writer.hpp
│   ├── comparators.hpp
│   ├── configuration.hpp
│   ├── constants.hpp
│   ├── counter.hpp
│   ├── counting
│   │   ├── counting.hpp
│   │   ├── counting_common.hpp
│   │   ├── counting_reader.hpp
│   │   ├── counting_writer.hpp
│   │   ├── hash_utils.hpp
│   │   ├── ngrams_hash_block.hpp
│   │   ├── parallel_radix_sort.hpp
│   │   └── sliding_window.hpp
│   ├── estimation.hpp
│   ├── front_coding.hpp
│   ├── last
│   │   ├── estimation_builder.hpp
│   │   ├── index_types.hpp
│   │   ├── last.hpp
│   │   └── write.hpp
│   ├── merge_utils.hpp
│   ├── merging
│   │   ├── merging.hpp
│   │   └── merging_writer.hpp
│   ├── ngrams_block.hpp
│   ├── statistics.hpp
│   ├── stream.hpp
│   ├── tmp.hpp
│   ├── util.hpp
│   ├── util_types.hpp
│   └── vocabulary.hpp
├── src
│   ├── count.cpp
│   └── estimate.cpp
└── test_data
    └── 1Billion.1M.gz
--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  IndentBraces: false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeComma
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: true
BreakConstructorInitializers: BeforeComma
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
  - foreach
  - Q_FOREACH
  - BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
  - Regex: '^<ext/.*\.h>'
    Priority: 2
  - Regex: '^<.*\.h>'
    Priority: 1
  - Regex: '^<.*'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
    BasedOnStyle: google
  - Language: TextProto
    Delimiters:
      - pb
      - PB
      - proto
      - PROTO
    EnclosingFunctions:
      - EqualsProto
      - EquivToProto
      - PARSE_PARTIAL_TEXT_PROTO
      - PARSE_TEST_PROTO
      - PARSE_TEXT_PROTO
      - ParseTextOrDie
      - ParseTextProtoOrDie
    CanonicalDelimiter: ''
    BasedOnStyle: google
ReflowComments: true
SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
StatementMacros:
  - Q_UNUSED
  - QT_REQUIRE_VERSION
TabWidth: 8
UseTab: Never
...
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build/
.DS_Store
python/files.txt
python/build
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "external/tongrams"]
	path = external/tongrams
	url = https://github.com/jermp/tongrams.git
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.8)
project(TONGRAMS_ESTIMATION)

if(CMAKE_BUILD_TYPE MATCHES Debug)
  MESSAGE(STATUS "DEBUG defined")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG")
endif()

if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG")
endif()

if(LSD_RADIX_SORT)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLSD_RADIX_SORT")
  MESSAGE(STATUS "Sorting with LSD_RADIX_SORT")
endif()

MESSAGE(STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE})

if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
endif ()

if (UNIX AND NOT APPLE)
  MESSAGE(STATUS "Compiling with openmp")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") # for __gnu_parallel::sort
endif()

if (UNIX)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces")

  if(TONGRAMS_USE_SANITIZERS)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
  endif()

endif()

find_package(Boost COMPONENTS iostreams filesystem thread REQUIRED)
include_directories(${Boost_INCLUDE_DIRS})
link_directories(${Boost_LIBRARY_DIRS})

include_directories(${TONGRAMS_ESTIMATION_SOURCE_DIR}/include)
include_directories(${TONGRAMS_ESTIMATION_SOURCE_DIR}/external/tongrams/include)

add_subdirectory(external/tongrams)

file(GLOB SRC_SOURCES src/*.cpp)
foreach(SRC ${SRC_SOURCES})
  get_filename_component (SRC_NAME ${SRC} NAME_WE) # without extension
  add_executable(${SRC_NAME} ${SRC})
  target_link_libraries(${SRC_NAME} ${Boost_LIBRARIES})
endforeach(SRC)
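
The `LSD_RADIX_SORT` option above only defines a preprocessor symbol, which selects the parallel LSD radix sorter in `include/counting/parallel_radix_sort.hpp` instead of comparison-based sorting. As a hedged example (plain CMake usage, nothing project-specific assumed), a build with it enabled would be configured as:

    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DLSD_RADIX_SORT=On
    make -j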
all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 | OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Tongrams Estimation 2 | =================== 3 | 4 | Modified [Kneser-Ney](https://en.wikipedia.org/wiki/Kneser%E2%80%93Ney_smoothing) language model estimation powered by [Tongrams](https://github.com/jermp/tongrams). 5 | 6 | This C++ library implements the 1-Sort algorithm described in the paper 7 | [*Handling Massive N-Gram Datasets Efficiently*](http://pages.di.unipi.it/pibiri/papers/TOIS19.pdf) by Giulio Ermanno Pibiri and Rossano Venturini, published in ACM TOIS, 2019 [1]. 8 | 9 | ### Compiling the code 10 | 11 | git clone --recursive https://github.com/jermp/tongrams_estimation.git 12 | mkdir -p build; cd build 13 | cmake .. 14 | make -j 15 | 16 | ### Sample usage 17 | 18 | After installation of dependencies and compilation of the code, you can use 19 | the sample text (first 1M lines from the 1Billion corpus; see the paper for dataset 20 | information) in the directory 21 | `test_data`. The text is gzipped, so it must be first uncompressed. 22 | 23 | cd build 24 | gunzip ../test_data/1Billion.1M.gz 25 | 26 | ##### 1. Estimation 27 | 28 | Then you can estimate a Kneser-Ney language model of order 5 (using 25% of RAM and whose index is serialized to the file `index.bin`) as follows. 29 | 30 | ./estimate ../test_data/1Billion.1M 5 --tmp tmp_dir --ram 0.25 --out index.bin 31 | 32 | ##### 2. Computing Perplexity 33 | 34 | With the index built and serialized to `index.bin` you can compute 35 | the perplexity score with: 36 | 37 | ./external/tongrams/score index.bin ../test_data/1Billion.1M 38 | 39 | ##### 3. Counting N-Grams 40 | 41 | You can also extract n-gram counts. An example follows below, for 3-grams. 42 | 43 | ./count ../test_data/1Billion.1M 3 --tmp tmp_dir --ram 0.25 --out 3-grams 44 | 45 | The output file `3-grams` will list all extracted 3-grams sorted lexicographically 46 | in the following standard format: 47 | 48 | 49 | 50 | 51 | 52 | ... 53 | 54 | where each `` is a sequence of words separated by a whitespace character. 55 | 56 | ### Dependencies 57 | 58 | 1. [boost](https://www.boost.org/) 59 | 2. [sparsehash](https://github.com/sparsehash/sparsehash) 60 | 61 | ### Bibliography 62 | 63 | [1] Pibiri, Giulio Ermanno, and Rossano Venturini. "Handling Massive N-Gram Datasets Efficiently." ACM Transactions on Information Systems (TOIS) 37.2 (2019): 1-41. 
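For instance, with hypothetical grams and counts, the file could begin with:

    the cat sat\t27
    the cat saw\t14
    the dog ran\t31
    ...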
### Dependencies

1. [boost](https://www.boost.org/)
2. [sparsehash](https://github.com/sparsehash/sparsehash)

### Bibliography

[1] Pibiri, Giulio Ermanno, and Rossano Venturini. "Handling Massive N-Gram Datasets Efficiently." ACM Transactions on Information Systems (TOIS) 37.2 (2019): 1-41.
--------------------------------------------------------------------------------
/include/adjusting/adjusting.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "util.hpp"
#include "constants.hpp"
#include "stream.hpp"
#include "statistics.hpp"
#include "merge_utils.hpp"
#include "adjusting_writer.hpp"

namespace tongrams {

template <typename StreamGenerator>
struct adjusting {
    typedef cursor_comparator<context_order_comparator_type>
        cursor_comparator_type;

    adjusting(configuration const& config, tmp::data& tmp_data,
              tmp::statistics& tmp_stats, statistics& stats)
        : m_config(config)
        , m_tmp_data(tmp_data)
        , m_stats(stats)
        , m_stats_builder(config, tmp_data, tmp_stats)
        , m_writer(config, constants::file_extension::merged)
        , m_comparator(config.max_order)
        , m_cursors(cursor_comparator_type(config.max_order))
        , m_CPU_time(0.0)
        , m_I_time(0.0)
        , m_O_time(0.0)
        , m_total_smooth_time(0.0)
        , m_total_time_waiting_for_disk(0.0) {
        auto start = clock_type::now();
        size_t vocab_size = m_stats.num_ngrams(1);
        if (!vocab_size) {
            throw std::runtime_error("vocabulary size must not be 0");
        }
        std::cerr << "vocabulary size: " << vocab_size << std::endl;
        tmp_stats.resize(1, vocab_size);
        m_stats_builder.init(vocab_size);
        auto end = clock_type::now();
        std::chrono::duration<double> elapsed = end - start;
        m_CPU_time += elapsed.count();
    }

    typedef typename StreamGenerator::block_type input_block_type;

    void print_stats() const {
        std::cout << "\"CPU\":" << m_CPU_time << ", ";
        std::cout << "\"I\":" << m_I_time << ", ";
        std::cout << "\"O\":" << m_O_time << ", ";
    }

    void run() {
        auto start = clock_type::now();
        std::vector<std::string> filenames;
        {
            essentials::directory tmp_dir(m_config.tmp_dirname);
            for (auto const& filename : tmp_dir) {
                if (filename.extension == constants::file_extension::counts) {
                    filenames.push_back(filename.fullpath);
                }
            }
        }

        size_t num_files_to_merge = filenames.size();
        assert(num_files_to_merge > 0);
        std::cerr << "merging " << num_files_to_merge << " files" << std::endl;

        uint64_t record_size = ngrams_block::record_size(m_config.max_order);
        uint64_t min_load_size = m_config.RAM / (2 * num_files_to_merge + 1) /
                                 record_size * record_size;
        uint64_t default_load_size =
            (64 * essentials::MiB) / record_size * record_size;
        uint64_t load_size = default_load_size;
        if (min_load_size < default_load_size) {
            std::cerr << "\tusing min. load size of " << min_load_size
                      << " because not enough RAM is available" << std::endl;
            load_size = min_load_size;
        }
        assert(load_size % record_size == 0);

        for (auto const& filename : filenames) {
            m_stream_generators.emplace_back(m_config.max_order);
            auto& gen = m_stream_generators.back();
            gen.open(filename);
            assert(gen.size() == 0);
            gen.fetch_next_block(load_size);
        }

        auto get_block = [](StreamGenerator& gen) {
            auto* block = gen.get_block();
            assert(block->template is_sorted(block->begin(), block->end()));
            return block;
        };

        assert(m_cursors.empty());
        for (uint64_t k = 0; k != m_stream_generators.size(); ++k) {
            auto& gen = m_stream_generators[k];
            auto* block = get_block(gen);
            cursor c(block->begin(), block->end(), k);
            m_cursors.push(c);
        }

        uint64_t num_ngrams_per_block = load_size / record_size;
        std::cerr << "num_ngrams_per_block = " << num_ngrams_per_block
                  << " ngrams" << std::endl;

        uint8_t N = m_config.max_order;
        ngrams_block result(N);
        result.resize_memory(num_ngrams_per_block);
        result.reserve_index(num_ngrams_per_block);
        uint64_t limit = num_ngrams_per_block;

        auto compute_left_extensions = [&]() {
            assert(result.template is_sorted(result.begin(), result.end()));
            auto start = clock_type::now();
            m_stats_builder.compute_left_extensions(result.begin(),
                                                    result.size());
            auto end = clock_type::now();
            std::chrono::duration<double> elapsed = end - start;
            m_total_smooth_time += elapsed.count();
        };

        uint64_t num_Ngrams = 0;
        uint64_t prev_offset = 0;

        auto save_offsets = [&]() {
            uint64_t offset = num_Ngrams - prev_offset;
            std::vector<uint64_t> offsets = {offset};
            m_tmp_data.blocks_offsets.push_back(std::move(offsets));
            prev_offset = num_Ngrams;
            limit = num_Ngrams + num_ngrams_per_block;
        };

        m_writer.start();

        while (!m_cursors.empty()) {
            auto& top = m_cursors.top();
            auto min = *(top.range.begin);

            if (!result.size()) {
                result.push_back(min.data, min.data + N, *(min.value(N)));
                ++num_Ngrams;
            } else {
                auto& back = result.back();
                bool equal = equal_to(min.data, back.data, sizeof_ngram(N));

                if (not equal) {
                    if (num_Ngrams >= limit and
                        compare_i(min, back, m_comparator.begin()) >
                            0  // greater
                    ) {
                        save_offsets();
                    }

                    if (result.size() == num_ngrams_per_block) {
                        compute_left_extensions();
                        auto start = clock_type::now();
                        while (m_writer.size() > 0)
                            ;  // wait for flush
                        auto end = clock_type::now();
                        std::chrono::duration<double> elapsed = end - start;
                        m_total_time_waiting_for_disk += elapsed.count();

                        m_writer.push(result);

                        result.init(N);
                        result.resize_memory(num_ngrams_per_block);
                        result.reserve_index(num_ngrams_per_block);
                        assert(result.empty());
                    }

                    result.push_back(min.data, min.data + N, *(min.value(N)));
                    ++num_Ngrams;

                } else {
                    *(back.value(N)) += *(min.value(N));
                }
            }

            ++(top.range.begin);

            if (top.range.begin == top.range.end) {
                auto& gen = m_stream_generators[top.index];
                gen.release_block();
                if (gen.eos()) {
                    assert(gen.empty());
                    gen.close_and_remove();
                    m_cursors.pop();
                } else {
                    gen.fetch_next_block(load_size);
                    auto* block = get_block(gen);
                    top.range.begin = block->begin();
                    top.range.end = block->end();
                }
            }

            m_cursors.heapify();
        }

        std::cerr << "MERGE DONE: " << num_Ngrams << " N-grams" << std::endl;
        std::cerr << "\ttime waiting for disk = "
                  << m_total_time_waiting_for_disk << " [sec]\n";
        std::cerr << "\tsmoothing time: " << m_total_smooth_time << " [sec]"
                  << std::endl;

        save_offsets();
        compute_left_extensions();
        m_stats_builder.finalize();

        auto end = clock_type::now();
        std::chrono::duration<double> elapsed = end - start;
        m_CPU_time += elapsed.count();

        m_writer.push(result);
        m_writer.terminate();

        m_CPU_time -= m_total_time_waiting_for_disk;
        for (auto& sg : m_stream_generators) m_I_time += sg.I_time();

        start = clock_type::now();
        m_stats_builder.build(m_stats);
        end = clock_type::now();
        elapsed = end - start;
        m_CPU_time += elapsed.count();
        m_CPU_time -= m_I_time;
        m_O_time += m_writer.time();
    }

private:
    configuration const& m_config;
    tmp::data& m_tmp_data;
    statistics& m_stats;
    statistics::builder m_stats_builder;
    std::deque<StreamGenerator> m_stream_generators;
    adjusting_writer m_writer;
    context_order_comparator_type m_comparator;

    min_heap<cursor<typename input_block_type::iterator>,
             cursor_comparator_type>
        m_cursors;

    double m_CPU_time;
    double m_I_time;
    double m_O_time;
    double m_total_smooth_time;
    double m_total_time_waiting_for_disk;
};

}  // namespace tongrams
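
The `run()` loop above is a k-way merge with on-the-fly accumulation of equal n-grams. A minimal self-contained sketch of the same idea, under assumed simplifications (in-memory runs of `(key, count)` pairs and a `std::priority_queue` instead of the library's `min_heap` and disk-backed stream generators):

    #include <cstdint>
    #include <iostream>
    #include <queue>
    #include <string>
    #include <utility>
    #include <vector>

    // Merge sorted runs of (key, count) pairs, summing counts of equal keys.
    // This mirrors adjusting::run(): pop the smallest head, then either extend
    // the output or accumulate into its last record.
    int main() {
        typedef std::pair<std::string, uint64_t> record;
        std::vector<std::vector<record>> runs = {
            {{"a b c", 2}, {"b c d", 1}},
            {{"a b c", 3}, {"c d e", 4}}};

        typedef std::pair<record, size_t> head;  // (record, run index)
        auto cmp = [](head const& x, head const& y) { return x.first > y.first; };
        std::priority_queue<head, std::vector<head>, decltype(cmp)> q(cmp);

        std::vector<size_t> pos(runs.size(), 0);
        for (size_t k = 0; k != runs.size(); ++k) q.push({runs[k][0], k});

        std::vector<record> out;
        while (!q.empty()) {
            auto [rec, k] = q.top();
            q.pop();
            if (!out.empty() and out.back().first == rec.first) {
                out.back().second += rec.second;  // duplicate: accumulate count
            } else {
                out.push_back(rec);
            }
            if (++pos[k] != runs[k].size()) q.push({runs[k][pos[k]], k});
        }

        // Prints: "a b c 5", "b c d 1", "c d e 4".
        for (auto const& r : out) std::cout << r.first << "\t" << r.second << "\n";
        return 0;
    }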
--------------------------------------------------------------------------------
/include/adjusting/adjusting_writer.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "configuration.hpp"
#include "tmp.hpp"

namespace tongrams {

struct adjusting_writer {
    adjusting_writer(configuration const& config,
                     std::string const& file_extension)
        : m_num_flushes(0), m_time(0.0) {
        m_buffer.open();
        std::string output_filename =
            filename_generator(config.tmp_dirname, "", file_extension)();
        m_os.open(output_filename.c_str(), std::ofstream::binary |
                                               std::ofstream::ate |
                                               std::ofstream::app);
    }

    ~adjusting_writer() {
        if (!m_buffer.empty()) {
            std::cerr << "Error: some data still needs to be written"
                      << std::endl;
            std::terminate();
        }
    }

    void start() {
        m_thread = std::thread(&adjusting_writer::run, this);
    }

    void terminate() {
        m_buffer.lock();
        m_buffer.close();
        m_buffer.unlock();
        if (m_thread.joinable()) m_thread.join();
        assert(!m_buffer.active());
        while (!m_buffer.empty()) flush();
        m_os.close();
        std::cerr << "\tadjusting_writer thread stats:\n";
        std::cerr << "\tflushed blocks: " << m_num_flushes << "\n";
        std::cerr << "\twrite time: " << m_time << "\n";
    }

    void push(ngrams_block& block) {
        m_buffer.lock();
        m_buffer.push(block);
        m_buffer.unlock();
    }

    size_t size() {
        m_buffer.lock();
        size_t s = m_buffer.size();
        m_buffer.unlock();
        return s;
    }

    double time() const {
        return m_time;
    }

private:
    semi_sync_queue<ngrams_block> m_buffer;
    std::ofstream m_os;
    std::thread m_thread;
    uint64_t m_num_flushes;
    double m_time;

    void run() {
        while (m_buffer.active()) flush();
    }

    void flush() {
        m_buffer.lock();
        if (m_buffer.empty()) {
            m_buffer.unlock();
            return;
        }
        auto& block = m_buffer.pick();
        m_buffer.unlock();

        auto start = clock_type::now();
        block.write_memory(m_os);
        auto end = clock_type::now();
        std::chrono::duration<double> elapsed = end - start;
        m_time += elapsed.count();

        block.release();

        m_buffer.lock();
        m_buffer.pop();
        m_buffer.unlock();
        ++m_num_flushes;
        if (m_num_flushes % 20 == 0) {
            std::cerr << "flushed " << m_num_flushes << " blocks" << std::endl;
        }
    }
};

}  // namespace tongrams
--------------------------------------------------------------------------------
/include/comparators.hpp:
--------------------------------------------------------------------------------
#pragma once

namespace tongrams {

template <typename T>
int compare_i(T const& x, T const& y, int i) {
    if (x[i] != y[i]) {
        return x[i] < y[i] ? -1 : 1;
    }
    return 0;
}

template <typename T>
struct prefix_order_comparator {
    prefix_order_comparator() {}

    void init(uint8_t N) {
        m_N = N;
    }

    prefix_order_comparator(uint8_t N) {
        init(N);
    }

    int order() const {
        return m_N;
    }

    void swap(prefix_order_comparator& other) {
        std::swap(m_N, other.m_N);
    }

    bool operator()(T const& x, T const& y) const {
        return compare(x, y) < 0;
    }

    inline int begin() const {
        return 0;
    }

    inline int end() const {  // last valid index, not one-past the end
        return m_N - 1;
    }

    inline void next(int& i) const {
        ++i;
    }

    inline void advance(int& i, int n) const {
        i += n;
    }

    // returns the length of lcp(x,y)
    int lcp(T const& x, T const& y) const {
        for (int i = begin(); i != end(); next(i)) {
            int cmp = compare_i(x, y, i);
            if (cmp != 0) return i;
        }
        return m_N;
    }

    int compare(T const& x, T const& y) const {
        for (int i = begin(); i < m_N; ++i) {
            int cmp = compare_i(x, y, i);
            if (cmp != 0) return cmp;
        }
        return 0;
    }

    bool equals(T const& x, T const& y) const {
        return compare(x, y) == 0;
    }

private:
    int m_N;
};

template <typename T>
struct context_order_comparator {
    context_order_comparator() {}

    void init(uint8_t N) {
        m_N = N;
    }

    context_order_comparator(uint8_t N) {
        init(N);
    }

    int order() const {
        return m_N;
    }

    void swap(context_order_comparator& other) {
        std::swap(m_N, other.m_N);
    }

    bool operator()(T const& x, T const& y) const {
        return compare(x, y) < 0;
    }

    inline int begin() const {
        return m_N - 2;
    }

    inline int end() const {  // last valid index, not one-past the end
        return m_N - 1;
    }

    inline void next(int& i) const {
        if (i == 0) {
            i = end();
        } else {
            --i;
        }
    }

    inline void advance(int& i, int n) const {
        assert(n <= m_N);
        i -= n;  // i -= n % m_N to fall back
        if (i < 0) {
            i += m_N;
        }
    }

    int lcp(T const& x, T const& y) const {
        int l = 0;  // length of lcp(x,y)
        for (int i = begin(); i != end(); next(i)) {
            int cmp = compare_i(x, y, i);
            if (cmp != 0) return l;
            ++l;
        }
        return l;
    }

    int compare(T const& x, T const& y) const {
        for (int i = int(begin()); i != -1; --i) {
            int cmp = compare_i(x, y, i);
            if (cmp != 0) return cmp;
        }
        return compare_i(x, y, begin() + 1);
    }

    bool equals(T const& x, T const& y) const {
        return compare(x, y) == 0;
    }

private:
    int m_N;
};

}  // namespace tongrams
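
As a standalone illustration of the context order implemented above (hypothetical word-id trigrams, not the library's types): n-grams are compared on the context positions N-2, N-3, ..., 0 first, and on the last position N-1 only to break ties.

    #include <algorithm>
    #include <iostream>
    #include <vector>

    // Context order on trigrams (w1 w2 w3): compare index 1, then index 0,
    // then index 2, mirroring context_order_comparator::compare for N = 3.
    int compare_i(std::vector<int> const& x, std::vector<int> const& y, int i) {
        if (x[i] != y[i]) return x[i] < y[i] ? -1 : 1;
        return 0;
    }

    int context_order_compare(std::vector<int> const& x,
                              std::vector<int> const& y) {
        for (int i = 1; i != -1; --i) {  // the context: indices 1, then 0
            if (int cmp = compare_i(x, y, i)) return cmp;
        }
        return compare_i(x, y, 2);  // last word breaks ties
    }

    int main() {
        std::vector<std::vector<int>> grams = {
            {1, 2, 9}, {2, 1, 0}, {1, 1, 5}, {1, 2, 3}};
        std::sort(grams.begin(), grams.end(), [](auto const& a, auto const& b) {
            return context_order_compare(a, b) < 0;
        });
        // Prints 1 1 5 / 2 1 0 / 1 2 3 / 1 2 9: grouped by (w2, w1), then w3.
        for (auto const& g : grams)
            std::cout << g[0] << " " << g[1] << " " << g[2] << "\n";
        return 0;
    }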
--------------------------------------------------------------------------------
/include/configuration.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <thread>

#include "constants.hpp"

namespace tongrams {

struct configuration {
    configuration()
        : RAM(1 * essentials::GiB)
        , max_order(5)
        , num_threads(std::thread::hardware_concurrency())
        , text_size(0)
        , tmp_dirname(constants::default_tmp_dirname)
        , vocab_tmp_subdirname(tmp_dirname + "/vocab")
        , vocab_filename("/vocabulary")
        , output_filename(constants::default_output_filename)
        , compress_blocks(false)
        , probs_quantization_bits(global::default_probs_quantization_bits)
        , backoffs_quantization_bits(
              global::default_backoffs_quantization_bits) {}

    uint64_t RAM;
    uint64_t max_order;
    uint64_t num_threads;
    uint64_t text_size;
    std::string tmp_dirname;
    std::string vocab_tmp_subdirname;
    std::string vocab_filename;
    std::string text_filename;
    std::string output_filename;
    bool compress_blocks;
    uint8_t probs_quantization_bits;
    uint8_t backoffs_quantization_bits;
};

}  // namespace tongrams
--------------------------------------------------------------------------------
/include/constants.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "util.hpp"
#include "util_types.hpp"
#include "../external/tongrams/include/utils/util_types.hpp"

namespace tongrams {
namespace constants {

static const uint64_t invalid_hash = 0;

namespace file_extension {
static const std::string counts("c");
static const std::string merged("m");
}  // namespace file_extension

static const std::string default_tmp_dirname("./tmp_dir");
static const std::string default_output_filename("out.bin");

static const std::string empty_token("");
static const word_id empty_token_word_id = 0;

static const byte_range empty_token_byte_range{
    reinterpret_cast<uint8_t const*>(empty_token.c_str()),
    reinterpret_cast<uint8_t const*>(empty_token.c_str()) +
        empty_token.size()};

}  // namespace constants
}  // namespace tongrams
--------------------------------------------------------------------------------
/include/counter.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "vocabulary.hpp"
#include "tmp.hpp"
#include "statistics.hpp"
#include "stream.hpp"
#include "counting/counting.hpp"
#include "merging/merging.hpp"

namespace tongrams {

struct counter {
    counter(configuration const& config)
        : m_config(config)
        , m_tmp_data()
        , m_tmp_stats(config.max_order)
        , m_stats(config.max_order) {
        m_timings.reserve(2);
        std::cout << "{";
        std::cout << "\"dataset\":"
                  << boost::filesystem::path(config.text_filename).stem()
                  << ", ";
        std::cout << "\"order\":" << config.max_order << ", ";
        std::cout << "\"RAM\":" << config.RAM << ", ";
        std::cout << "\"threads\":" << config.num_threads;
    }

    ~counter() {
        std::cout << "}" << std::endl;
    }

    void run() {
        if (m_config.compress_blocks) {
            typedef fc::writer block_writer_type;
            run<counting<block_writer_type>>("counting");
        } else {
            run>("counting");
        }

        m_stats.num_ngrams(1) = m_tmp_data.word_ids.size();
        m_tmp_data.word_ids.clear();
        // write_vocab();

        if (m_config.compress_blocks) {
            run>("merging");
        } else {
            run>("merging");
        }
    }

    void print_stats() {
        int step = 1;
        for (auto t : m_timings) {
            std::cerr << "step-" << step << ": " << t << " [sec]\n";
            ++step;
        }
    }

private:
    configuration const& m_config;
    tmp::data m_tmp_data;
    tmp::statistics m_tmp_stats;
    statistics m_stats;
    std::vector<double> m_timings;

    template <typename Step>
    void run(std::string const& name) {
        std::cout << ", ";
        std::cout << "\"" + name + "\": {";
        auto start = clock_type::now();
        Step step(m_config, m_tmp_data, m_tmp_stats, m_stats);
        step.run();
        auto end = clock_type::now();
        std::chrono::duration<double> elapsed = end - start;
        double total_time = elapsed.count();
        m_timings.push_back(total_time);
        std::cout << "\"total\":" << total_time;
        std::cout << "}";
    }

    // std::function<void()> write_vocab = [&]() {
    //     std::ofstream os(m_config.vocab_tmp_subdirname +
    //                      m_config.vocab_filename);
    //     size_t vocab_size = m_stats.num_ngrams(1);
    //     vocabulary vocab;
    //     m_tmp_data.vocab_builder.build(vocab);
    //     for (size_t id = 0; id != vocab_size; ++id) {
    //         util::write(os, vocab[id]);
    //         os << "\n";
    //     }
    //     os.close();
    // };
};
}  // namespace tongrams
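
Both `counter` and `estimation` (further below) emit their timings as a single JSON object on stdout, assembled from the literals printed above. With hypothetical numbers, a run of `count` would print something like the following (`estimation` additionally inlines each step's "CPU", "I", and "O" timings via `print_stats`):

    {"dataset":"1Billion.1M", "order":3, "RAM":268435456, "threads":8, "counting": {"total":63.1}, "merging": {"total":25.7}}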
--------------------------------------------------------------------------------
/include/counting/counting.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "configuration.hpp"
#include "tmp.hpp"
#include "constants.hpp"
#include "statistics.hpp"
#include "util.hpp"
#include "util_types.hpp"

#include "../external/tongrams/include/utils/util.hpp"

#include "counting_common.hpp"
#include "counting_writer.hpp"
#include "counting_reader.hpp"

namespace tongrams {

template <typename BlockWriter>
struct counting {
    counting(configuration const& config, tmp::data& tmp_data,
             tmp::statistics&, statistics&)
        : m_config(config)
        , m_CPU_time(0.0)
        , m_I_time(0.0)
        , m_writer(config, tmp_data, constants::file_extension::counts)
        , m_reader(config, tmp_data, m_writer) {
        tmp_data.vocab_builder.push_empty();
        tmp_data.word_ids[hash_utils::hash_empty_token] =
            constants::empty_token_word_id;
    }

    void run() {
        bool file_begin = true;
        bool file_end = false;
        static constexpr uint64_t mm_region_size = 1 * essentials::GiB;
        uint64_t blocks = util::ceil_div(m_config.text_size, mm_region_size);
        uint64_t page_size = sysconf(_SC_PAGESIZE);
        assert(mm_region_size >= page_size and
               mm_region_size % page_size == 0);

        m_writer.start();

        for (uint64_t block = 0,
                      page_id = 0;  // disk page containing the beginning of
                                    // the current file block
             block != blocks; ++block) {
            uint64_t chunk_size = mm_region_size;
            uint64_t offset = page_id * page_size;
            if (offset + chunk_size > m_config.text_size) {
                file_end = true;
                chunk_size = m_config.text_size - offset;
            }

            m_data = util::open_file_partition(m_file, m_config.text_filename,
                                               chunk_size, offset);
            uint64_t begin = 0;
            uint64_t end = m_file.size();
            assert(end > 0);

            util::optimize_sequential_access(m_data, end);

            if (!file_begin) align_forward(begin);
            std::string boundary = m_boundary;
            m_boundary.clear();
            if (!is_aligned(end - 1)) align_backward(begin, --end);

            uint64_t n = end;
            assert(n != 0 and n <= mm_region_size);
            m_reader.init(m_data, boundary, begin, end, file_begin, file_end);
            m_reader.run();
            file_begin = false;

            uint64_t num_pages = util::ceil_div(n, page_size);
            assert(num_pages > 0);
            page_id += num_pages;
            m_file.close();
        }

        m_CPU_time += m_reader.CPU_time();
        m_I_time += m_reader.I_time();
        m_writer.terminate();
        m_reader.print_stats();
    }

    void print_stats() const {
        std::cout << "\"CPU\":" << m_CPU_time << ", ";
        std::cout << "\"I\":" << m_I_time << ", ";
        std::cout << "\"O\":" << m_writer.O_time() << ", ";
    }

private:
    bool is_aligned(uint64_t pos) const {
        return m_data[pos] == ' ' or m_data[pos] == '\n';
    }

    void align_forward(uint64_t& begin) {
        for (;; ++begin) {
            auto c = m_data[begin];
            if (c == ' ' or c == '\n') {
                ++begin;  // first char after a whitespace
                break;
            }
            m_boundary.push_back(c);
        }
    }

    void align_backward(uint64_t begin, uint64_t& end) {
        for (; begin != end; --end) {
            auto c = m_data[end];
            if (c == ' ' or c == '\n') {
                ++end;  // one-past
                std::reverse(m_boundary.begin(), m_boundary.end());
                break;
            }
            m_boundary.push_back(c);
        }
    }

    configuration const& m_config;
    boost::iostreams::mapped_file_source m_file;
    uint8_t const* m_data;

    std::string m_boundary;

    double m_CPU_time;
    double m_I_time;

    typedef counting_writer<BlockWriter, context_order_comparator_type>
        counting_writer_type;
    typedef counting_reader<counting_writer_type> counting_reader_type;
    counting_writer_type m_writer;
    counting_reader_type m_reader;
};

}  // namespace tongrams
--------------------------------------------------------------------------------
/include/counting/counting_common.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "ngrams_block.hpp"
#include "ngrams_hash_block.hpp"
#include "hash_utils.hpp"

namespace tongrams {
namespace counting_step {

typedef ngrams_hash_block<> block_type;

}  // namespace counting_step
}  // namespace tongrams
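
`align_forward`, `align_backward`, and `m_boundary` in `counting.hpp` cooperate to reassemble a word that a partition boundary cuts in half. A hedged sketch of that dance on a plain `std::string` (the real code walks a memory-mapped region):

    #include <iostream>
    #include <string>

    // A word split by the partition boundary is reassembled from the tail of
    // chunk 1 (align_backward) and the head of chunk 2 (align_forward).
    int main() {
        std::string text = "the cat sat quietly";
        size_t cut = 10;  // falls inside "sat"
        std::string chunk1 = text.substr(0, cut);  // "the cat sa"
        std::string chunk2 = text.substr(cut);     // "t quietly"

        // align_backward: peel the trailing word fragment off chunk1.
        std::string boundary;
        size_t end = chunk1.size();
        while (end > 0 && chunk1[end - 1] != ' ')
            boundary.insert(boundary.begin(), chunk1[--end]);
        chunk1.resize(end);  // "the cat "

        // align_forward: complete the fragment with the head of chunk2.
        size_t begin = 0;
        while (begin < chunk2.size() && chunk2[begin] != ' ')
            boundary.push_back(chunk2[begin++]);
        ++begin;  // first char after the whitespace

        std::cout << "reassembled boundary word: " << boundary << "\n";  // sat
        std::cout << "chunk2 resumes at: " << chunk2.substr(begin) << "\n";
        return 0;
    }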
--------------------------------------------------------------------------------
/include/counting/counting_reader.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "counting_common.hpp"
#include "configuration.hpp"
#include "tmp.hpp"
#include "sliding_window.hpp"

namespace tongrams {

template <typename Writer>
struct counting_reader {
    counting_reader(configuration const& config, tmp::data& tmp_data,
                    Writer& thread)
        : m_tmp_data(tmp_data)
        , m_window(config.max_order)
        , m_max_order(config.max_order)
        , m_writer(thread)
        , m_next_word_id(constants::empty_token_word_id + 1)
        , m_CPU_time(0.0) {
        m_window.fill(constants::empty_token_word_id);
        static constexpr double weight = 0.9;
        size_t bytes_per_ngram = sizeof_ngram(config.max_order) +
                                 sizeof(count_type) +  // payload
                                 sizeof(word_id*) +    // pointer
                                 sizeof(ngram_id);     // hashset
        m_num_ngrams_per_block = ((weight * config.RAM) /
                                  (2 * hash_utils::probing_space_multiplier)) /
                                 bytes_per_ngram;
    }

    void init(uint8_t const* data, std::string const& boundary,
              uint64_t partition_begin, uint64_t partition_end,
              bool file_begin, bool file_end) {
        auto s = clock_type::now();
        m_partition_end = partition_end;
        m_file_begin = file_begin;
        m_file_end = file_end;
        assert(partition_begin <= partition_end);
        m_counts.init(m_max_order, m_num_ngrams_per_block);
        if (file_begin) count();  // count empty window
        m_window.init({data + partition_begin, data + m_partition_end},
                      partition_begin);

        if (!boundary.empty()) {
            m_window.shift();
            stl_string_adaptor adaptor;
            byte_range range = adaptor(boundary);
            uint64_t hash = hash_utils::byte_range_hash64(range);
            auto id = find_or_insert(range, hash);
            m_window.eat(id);
            count();
        }

        auto e = clock_type::now();
        std::chrono::duration<double> diff = e - s;
        m_CPU_time += diff.count();
    }

    void print_stats() const {
        std::cerr << "\treader thread stats:\n";
        std::cerr << "\tCPU time: " << m_CPU_time << " [sec]\n";
        std::cerr << "\tI time: " << m_window.time() << " [sec]" << std::endl;
    }

    void run() {
        auto s = clock_type::now();
        while (advance()) count();

        // NOTE: if we are at the end of the file,
        // add [m_max_order - 1] ngrams padded with empty tokens,
        // i.e., for max_order = 5 and m text words:
        // w_{m-3} w_{m-2} w_{m-1} w_m
        // w_{m-2} w_{m-1} w_m
        // w_{m-1} w_m
        // w_m
        if (m_file_end) {
            assert(m_max_order > 0);
            for (uint8_t i = 0; i != m_max_order - 1; ++i) {
                m_window.shift();
                m_window.eat(constants::empty_token_word_id);
                count();
            }
        }

        push_block();

        auto e = clock_type::now();
        std::chrono::duration<double> diff = e - s;
        m_CPU_time += diff.count();
        m_CPU_time -= I_time();
    }

    double CPU_time() const {
        return m_CPU_time;
    }

    double I_time() const {
        return m_window.time();
    }

private:
    tmp::data& m_tmp_data;
    sliding_window m_window;
    uint8_t m_max_order;
    Writer& m_writer;
    word_id m_next_word_id;
    double m_CPU_time;

    uint64_t m_partition_end;
    uint64_t m_num_ngrams_per_block;
    bool m_file_begin, m_file_end;
    counting_step::block_type m_counts;

    word_id find_or_insert(byte_range range, uint64_t hash) {
        word_id id = m_next_word_id;
        auto it = m_tmp_data.word_ids.find(hash);
        if (it == m_tmp_data.word_ids.end()) {
            m_tmp_data.word_ids[hash] = m_next_word_id;
            m_tmp_data.vocab_builder.push_back(range);
            ++m_next_word_id;
        } else {
            id = (*it).second;
        }
        assert(id < m_next_word_id);
        return id;
    }

    bool advance() {
        if (!m_window.advance()) return false;
        auto const& word = m_window.last();
        assert(word.hash != constants::invalid_hash);
        auto id = find_or_insert(word.range, word.hash);
        assert(id < m_next_word_id);
        m_window.eat(id);
        return true;
    }

    void count() {
        uint64_t hash =
            hash_utils::hash64(m_window.data(), sizeof_ngram(m_max_order));
        auto [found, at] = m_counts.find_or_insert(m_window.get(), hash);
        if (found) {
            auto count = ++m_counts[at];
            auto& max_count = m_counts.statistics().max_count;
            if (count > max_count) max_count = count;
        }
        if (m_counts.size() == m_num_ngrams_per_block) push_block();
    }

    void push_block() {
        while (m_writer.size() > 0)
            ;  // wait for flush
        counting_step::block_type tmp;
        tmp.init(m_max_order, m_num_ngrams_per_block);
        tmp.swap(m_counts);
        tmp.release_hash_index();
        m_writer.push(tmp);
    }
};

}  // namespace tongrams
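
The block sizing in `counting_reader`'s constructor is plain arithmetic over the hash-table footprint. A worked instance, assuming a 4-byte `word_id` and 8-byte `count_type`/`ngram_id` (the actual typedefs live in `util_types.hpp` and may differ):

    #include <cstdint>
    #include <iostream>

    // Worked instance of counting_reader's block sizing.
    int main() {
        const double weight = 0.9;                    // fraction of RAM to use
        const double probing_space_multiplier = 1.5;  // hash table slack
        const uint64_t RAM = uint64_t(1) << 30;       // 1 GiB
        const uint64_t order = 5;

        uint64_t bytes_per_ngram = order * 4  // the n-gram itself
                                   + 8        // payload (count)
                                   + 8        // pointer
                                   + 8;       // hashset entry (ngram_id)

        uint64_t ngrams_per_block =
            uint64_t((weight * RAM) / (2 * probing_space_multiplier)) /
            bytes_per_ngram;

        std::cout << ngrams_per_block << " n-grams per block\n";  // ~7.3M
        return 0;
    }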
--------------------------------------------------------------------------------
/include/counting/counting_writer.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "counting_common.hpp"
#include "configuration.hpp"
#include "tmp.hpp"
#include "comparators.hpp"

namespace tongrams {

template <typename BlockWriter, typename Comparator>
struct counting_writer {
    counting_writer(configuration const& config, tmp::data& tmp_data,
                    std::string const& file_extension)
        : m_tmp_data(tmp_data)
        , m_filename_gen(config.tmp_dirname, "", file_extension)
        , m_O_time(0.0)
        , m_CPU_time(0.0)
        , m_num_flushes(0)
        , m_writer(config.max_order)
        , m_comparator(config.max_order) {
        m_buffer.open();
    }

    ~counting_writer() {
        if (!m_buffer.empty()) {
            std::cerr << "Error: some data still needs to be written"
                      << std::endl;
            std::terminate();
        }
    }

    void start() {
        m_thread = std::thread(&counting_writer::run, this);
    }

    void terminate() {
        m_buffer.lock();
        m_buffer.close();
        m_buffer.unlock();
        if (m_thread.joinable()) m_thread.join();
        assert(!m_buffer.active());
        while (!m_buffer.empty()) flush();
        std::cerr << "\tcounting_writer thread stats:\n";
        std::cerr << "\tflushed blocks: " << m_num_flushes << "\n";
        std::cerr << "\tO time: " << m_O_time << "\n";
        std::cerr << "\tCPU time: " << m_CPU_time << "\n";
    }

    void push(counting_step::block_type& block) {
        m_buffer.lock();
        m_buffer.push(block);
        m_buffer.unlock();
    }

    size_t size() {
        m_buffer.lock();
        size_t s = m_buffer.size();
        m_buffer.unlock();
        return s;
    }

    double CPU_time() const {
        return m_CPU_time;
    }

    double O_time() const {
        return m_O_time;
    }

private:
    tmp::data& m_tmp_data;
    semi_sync_queue<counting_step::block_type> m_buffer;
    std::thread m_thread;
    filename_generator m_filename_gen;
    double m_O_time;
    double m_CPU_time;
    uint64_t m_num_flushes;
    BlockWriter m_writer;
    Comparator m_comparator;

    void run() {
        while (m_buffer.active()) flush();
    }

    void flush() {
        m_buffer.lock();
        if (m_buffer.empty()) {
            m_buffer.unlock();
            return;
        }
        auto& block = m_buffer.pick();
        m_buffer.unlock();

        block.statistics().max_word_id = m_tmp_data.word_ids.size();

        auto start = clock_type::now();
        block.sort(m_comparator);
        auto end = clock_type::now();
        std::chrono::duration<double> elapsed = end - start;
        m_CPU_time += elapsed.count();
        std::cerr << "sorting took " << elapsed.count() << " [sec]"
                  << std::endl;

        start = clock_type::now();
        std::string filename = m_filename_gen();
        std::ofstream os(filename.c_str(), std::ofstream::binary |
                                               std::ofstream::ate |
                                               std::ofstream::app);

        m_writer.write_block(os, block.begin(), block.end(), block.size(),
                             block.statistics());

        os.close();
        end = clock_type::now();
        elapsed = end - start;
        m_O_time += elapsed.count();

        block.release();

        m_buffer.lock();
        m_buffer.pop();
        m_buffer.unlock();
        ++m_num_flushes;
        m_filename_gen.next();
    }
};

}  // namespace tongrams
--------------------------------------------------------------------------------
/include/counting/hash_utils.hpp:
--------------------------------------------------------------------------------
#pragma once

#include "constants.hpp"

namespace tongrams {
namespace hash_utils {

/*
    This code is an adaptation from
    https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
    by Austin Appleby
*/
uint64_t murmur_hash64(const void* key, size_t len, uint64_t seed) {
    const uint64_t m = 0xc6a4a7935bd1e995ULL;
    const int r = 47;

    uint64_t h = seed ^ (len * m);

#if defined(__arm) || defined(__arm__)
    const size_t ksize = sizeof(uint64_t);
    const unsigned char* data = (const unsigned char*)key;
    const unsigned char* end = data + (std::size_t)(len / 8) * ksize;
#else
    const uint64_t* data = (const uint64_t*)key;
    const uint64_t* end = data + (len / 8);
#endif

    while (data != end) {
#if defined(__arm) || defined(__arm__)
        uint64_t k;
        memcpy(&k, data, ksize);
        data += ksize;
#else
        uint64_t k = *data++;
#endif

        k *= m;
        k ^= k >> r;
        k *= m;

        h ^= k;
        h *= m;
    }

    const unsigned char* data2 = (const unsigned char*)data;

    switch (len & 7) {
        // fall through
        case 7:
            h ^= uint64_t(data2[6]) << 48;
        // fall through
        case 6:
            h ^= uint64_t(data2[5]) << 40;
        // fall through
        case 5:
            h ^= uint64_t(data2[4]) << 32;
        // fall through
        case 4:
            h ^= uint64_t(data2[3]) << 24;
        // fall through
        case 3:
            h ^= uint64_t(data2[2]) << 16;
        // fall through
        case 2:
            h ^= uint64_t(data2[1]) << 8;
        // fall through
        case 1:
            h ^= uint64_t(data2[0]);
            h *= m;
    };

    h ^= h >> r;
    h *= m;
    h ^= h >> r;

    return h;
}

static inline uint64_t byte_range_hash64(byte_range const& br) {
    return murmur_hash64(br.first, br.second - br.first, 0);
}

static inline uint64_t hash64(const void* data, size_t bytes) {
    return murmur_hash64(data, bytes, 0);
}

static const uint64_t hash_empty_token =
    byte_range_hash64(constants::empty_token_byte_range);
static constexpr float probing_space_multiplier = 1.5;

struct linear_prober {
    linear_prober(iterator position, uint64_t universe)
        : m_position(position % universe), m_universe(universe) {}

    inline iterator operator*() {
        if (m_position == m_universe) m_position = 0;  // fall back
        return m_position;
    }

    inline void operator++() {
        ++m_position;
    }

private:
    iterator m_position;
    uint64_t m_universe;
};

}  // namespace hash_utils
}  // namespace tongrams
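
A hedged toy version of the probing discipline `linear_prober` encodes (start at `hash % universe`, step by one, wrap at the end), outside of the real `ngrams_hash_block`:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Toy open-addressing insert with linear probing and wraparound.
    int main() {
        const uint64_t universe = 8;               // number of buckets
        std::vector<int64_t> table(universe, -1);  // -1 marks an empty slot

        auto insert = [&](uint64_t hash, int64_t value) {
            uint64_t position = hash % universe;
            for (;;) {
                if (position == universe) position = 0;  // fall back (wrap)
                if (table[position] == -1) {
                    table[position] = value;
                    return position;
                }
                ++position;
            }
        };

        std::cout << insert(7, 10) << "\n";   // 7
        std::cout << insert(15, 20) << "\n";  // bucket 7 is taken: wraps to 0
        return 0;
    }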
--------------------------------------------------------------------------------
/include/counting/ngrams_hash_block.hpp:
--------------------------------------------------------------------------------
#pragma once

#ifndef __APPLE__
#include <parallel/algorithm>
#endif

#include "util.hpp"
#include "hash_utils.hpp"
#include "ngrams_block.hpp"
#include "parallel_radix_sort.hpp"

namespace tongrams {

template <typename Prober = hash_utils::linear_prober>
struct ngrams_hash_block {
    static constexpr ngram_id invalid_ngram_id = ngram_id(-1);

    ngrams_hash_block() : m_size(0), m_num_bytes(0) {
        resize(0);
    }

    void init(uint8_t ngram_order, uint64_t size) {
        m_num_bytes = ngram_order * sizeof(word_id);
        m_block.init(ngram_order);
        resize(size);
    }

    void resize(uint64_t size) {
        uint64_t buckets = size * hash_utils::probing_space_multiplier;
        m_data.resize(buckets, invalid_ngram_id);
        m_block.resize_memory(size);
        m_block.resize_index(size);
    }

    std::pair<bool, ngram_id> find_or_insert(ngram_type const& key,
                                             iterator hint) {
        assert(buckets());
        Prober prober(hint, buckets());
        iterator start = *prober;
        iterator it = start;
        ngram_id at = invalid_ngram_id;

        while (m_data[it] != invalid_ngram_id) {
            assert(it < buckets());
            if (equal_to(m_block[m_data[it]].data, key.data(), m_num_bytes)) {
                at = m_data[it];
                return {true, at};
            }
            ++prober;
            it = *prober;
            if (it == start) {  // back to starting point:
                                // thus all positions have been checked
                std::cerr << "ERROR: all positions have been checked"
                          << std::endl;
                at = invalid_ngram_id;
                return {false, at};
            }
        }

        // insert
        m_data[it] = m_size++;
        at = m_data[it];
        m_block.set(at, key.begin(), key.end(), 1);
        return {false, at};
    }

    template <typename Comparator>
    void sort(Comparator const& comparator) {
        std::cerr << "block size = " << m_size << std::endl;
        auto begin = m_block.begin();
        auto end = begin + size();

#ifdef LSD_RADIX_SORT
        (void)comparator;
        uint32_t max_digit = statistics().max_word_id;
        uint32_t num_digits = m_block.order();
        // std::cerr << "max_digit = " << max_digit
        //           << "; num_digits = " << num_digits << std::endl;
        parallel_lsd_radix_sorter<decltype(begin)> sorter(max_digit,
                                                          num_digits);
        sorter.sort(begin, end);
#else

#ifdef __APPLE__
        std::sort
#else
        __gnu_parallel::sort
#endif
            (begin, end, [&](auto l, auto r) { return comparator(l, r); });
#endif

        assert(m_block.template is_sorted(begin, end));
    }

    inline count_type& operator[](ngram_id at) {
        assert(at < size());
        return m_block.value(at);
    }

    inline uint64_t size() const {
        return m_size;
    }

    inline bool empty() const {
        return size() == 0;
    }

    inline uint64_t buckets() const {
        return m_data.size();
    }

    double load_factor() const {
        return static_cast<double>(size()) / buckets();
    }

    auto begin() {
        return enumerator(m_block);
    }

    auto end() {
        return enumerator(m_block, size());
    }

    struct enumerator {
        enumerator(ngrams_block& block, size_t pos = 0)
            : m_pos(pos), m_block(block) {}

        bool operator==(enumerator const& rhs) {
            return m_pos == rhs.m_pos;
        }

        bool operator!=(enumerator const& rhs) {
            return not(*this == rhs);
        }

        void operator++() {
            ++m_pos;
        }

        auto operator*() {
            return m_block[m_pos];
        }

    private:
        size_t m_pos;
        ngrams_block& m_block;
    };

    void swap(ngrams_hash_block& other) {
        std::swap(m_size, other.m_size);
        std::swap(m_num_bytes, other.m_num_bytes);
        m_data.swap(other.m_data);
        m_block.swap(other.m_block);
    }

    void release_hash_index() {
        std::vector<ngram_id>().swap(m_data);
    }

    void release() {
        ngrams_hash_block().swap(*this);
    }

    auto& statistics() {
        return m_block.stats;
    }

private:
    uint64_t m_size;
    size_t m_num_bytes;
    std::vector<ngram_id> m_data;
    ngrams_block m_block;
};

}  // namespace tongrams
--------------------------------------------------------------------------------
/include/counting/parallel_radix_sort.hpp:
--------------------------------------------------------------------------------
#pragma once

namespace tongrams {

template <typename ForwardIterator>
struct parallel_lsd_radix_sorter {
    parallel_lsd_radix_sorter(
        uint32_t max_digit, uint32_t num_digits,
        uint32_t num_threads = std::thread::hardware_concurrency())
        : m_max_digit(max_digit)
        , m_num_digits(num_digits)
        , m_num_threads(num_threads) {}

    void sort(ForwardIterator begin, ForwardIterator end) const {
        uint32_t first_column_index = m_num_digits;
        for (uint32_t column_index = first_column_index;
             column_index - first_column_index < m_num_digits;
             ++column_index) {
            uint32_t k = column_index - 1;
            if (column_index > m_num_digits) {
                k -= first_column_index;
            }
            parallel_counting_sort(begin, end, k);
        }
    }

private:
    uint32_t m_max_digit;
    uint32_t m_num_digits;
    uint32_t m_num_threads;

    void parallel_counting_sort(ForwardIterator begin, ForwardIterator end,
                                uint32_t column_index) const {
        std::vector<std::vector<uint32_t>> counts(
            m_num_threads + 1, std::vector<uint32_t>(m_max_digit, 0));
        size_t n = end - begin;
        uint64_t batch_size = n / m_num_threads;
        if (!batch_size) throw std::runtime_error("too many threads");

        parallel_executor p(m_num_threads);
        task_region(*(p.executor), [&](task_region_handle& trh) {
            for (uint64_t i = 0; i < m_num_threads; ++i) {
                trh.run([&, i] {
                    auto b = begin + i * batch_size;
                    auto e = b + batch_size;
                    if (i == m_num_threads - 1) e = end;
                    std::for_each(b, e, [&](auto const& x) {
                        uint32_t id = x[column_index];
                        assert(id < m_max_digit);
                        ++counts[i + 1][id];
                    });
                });
            }
        });

        // prefix sum
        for (uint32_t j = 0, sum = 0; j < m_max_digit; ++j) {
            for (uint32_t i = 0; i < m_num_threads + 1; ++i) {
                uint32_t occ = counts[i][j];
                counts[i][j] = sum;
                sum += occ;
            }
        }

        // for (auto const& positions : counts) {
        //     for (auto pos : positions) {
        //         std::cerr << pos << " ";
        //     }
        //     std::cerr << std::endl;
        // }

        std::vector<typename std::iterator_traits<ForwardIterator>::value_type>
            tmp_index(n);
        task_region(*(p.executor), [&](task_region_handle& trh) {
            for (uint64_t i = 0; i < m_num_threads; ++i) {
                trh.run([&, i] {
                    auto b = begin + i * batch_size;
                    auto e = b + batch_size;
                    if (i == m_num_threads - 1) e = end;
                    auto& partition_counts = counts[i + 1];
                    std::for_each(b, e, [&](auto const& x) {
                        uint32_t id = x[column_index];
                        assert(id < m_max_digit);
                        tmp_index[partition_counts[id]++] = x;
                    });
                });
            }
        });

        task_region(*(p.executor), [&](task_region_handle& trh) {
            for (uint64_t i = 0; i < m_num_threads; ++i) {
                trh.run([&, i] {
                    auto b = tmp_index.begin() + i * batch_size;
                    auto output = begin + i * batch_size;
                    auto e = b + batch_size;
                    if (i == m_num_threads - 1) e = tmp_index.end();
                    std::for_each(b, e, [&](auto const& x) {
                        *output = x;
                        ++output;
                    });
                });
            }
        });
    }
};
}  // namespace tongrams
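
The index arithmetic in `sort()` above enumerates columns from least to most significant for the context order. This snippet reproduces just that arithmetic for `num_digits = 5` and prints the resulting column sequence:

    #include <cstdint>
    #include <iostream>

    // LSD pass order for num_digits = 5: the last word first (least
    // significant in context order), then the context words left to right.
    int main() {
        uint32_t num_digits = 5;
        uint32_t first_column_index = num_digits;
        for (uint32_t column_index = first_column_index;
             column_index - first_column_index < num_digits; ++column_index) {
            uint32_t k = column_index - 1;
            if (column_index > num_digits) k -= first_column_index;
            std::cout << k << " ";  // prints: 4 0 1 2 3
        }
        std::cout << "\n";
        return 0;
    }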
--------------------------------------------------------------------------------
/include/counting/sliding_window.hpp:
--------------------------------------------------------------------------------
#pragma once

#include <cstring>  // for std::memmove

#include "../external/tongrams/include/utils/iterators.hpp"

#include "util.hpp"
#include "hash_utils.hpp"
#include "constants.hpp"

namespace tongrams {

struct sliding_window {
    sliding_window(uint8_t capacity)
        : m_end(2), m_buff(capacity), m_time(0.0) {}

    void init(byte_range text, uint64_t pos = 2) {
        m_end = pos;
        m_iterator.init(text);
        m_time = 0.0;
    }

    void fill(word_id id) {
        m_buff.assign(m_buff.size(), id);
    }

    void print() const {
        for (auto x : m_buff) {
            std::cout << x << " ";
        }
        std::cout << std::endl;
    }

    struct word {
        void init(uint64_t h, byte_range br) {
            hash = h;
            range = br;
        }

        uint64_t hash;
        byte_range range;
    };

    inline void shift() {
        std::memmove(&m_buff[0], &m_buff[1],
                     sizeof_ngram(m_buff.size() - 1));  // shift left by one
    }

    bool advance() {
        if (!m_iterator.has_next()) return false;

        shift();
        uint64_t hash = hash_utils::hash_empty_token;
        byte_range range = constants::empty_token_byte_range;
        size_t range_len = 0;

        while (range_len == 0) {  // skip blank lines
            if (m_iterator.has_next()) {
                auto start = clock_type::now();
                range = m_iterator.next();
                auto end = clock_type::now();
                std::chrono::duration<double> elapsed = end - start;
                m_time += elapsed.count();
                range_len = range.second - range.first;
            } else {
                m_end += 2;
                m_last.init(hash, range);
                return false;
            }
        }

        ++range_len;
        hash = hash_utils::byte_range_hash64(range);
        m_end += range_len;
        m_last.init(hash, range);

        return true;
    }

    void eat(word_id id) {
        m_buff.back() = id;
    }

    ngram_type const& get() {
        return m_buff;
    }

    word_id const* data() {
        return m_buff.data();
    }

    inline auto const& last() const {
        return m_last;
    }

    inline word_id front() const {
        return m_buff.back();
    }

    inline word_id back() const {
        return m_buff.front();
    }

    double time() const {
        return m_time;
    }

private:
    uint64_t m_end;  // beginning of next word
    word m_last;
    forward_byte_range_iterator m_iterator;
    ngram_type m_buff;
    double m_time;
};

}  // namespace tongrams
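
To make the end-of-file padding performed by `counting_reader::run()` concrete, here is a hedged simulation of the window mechanics on a three-word id stream with order 5 (plain word ids; 0 plays the role of the empty token):

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    // Simulates sliding_window on the id stream {1, 2, 3} with order N = 5:
    // shift left by one, append the next id, and emit the window. At end of
    // file, N-1 extra windows padded with the empty token (id 0) are emitted.
    int main() {
        const size_t N = 5;
        std::vector<uint32_t> window(N, 0);  // filled with empty tokens

        auto eat = [&](uint32_t id) {
            std::memmove(&window[0], &window[1], (N - 1) * sizeof(uint32_t));
            window.back() = id;
            for (auto w : window) std::cout << w << " ";
            std::cout << "\n";
        };

        // Emits 0 0 0 0 1 / 0 0 0 1 2 / 0 0 1 2 3, then the padded windows
        // 0 1 2 3 0 / 1 2 3 0 0 / 2 3 0 0 0 / 3 0 0 0 0.
        for (uint32_t id : {1u, 2u, 3u}) eat(id);
        for (size_t i = 0; i != N - 1; ++i) eat(0);
        return 0;
    }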
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "vocabulary.hpp" 4 | #include "tmp.hpp" 5 | #include "statistics.hpp" 6 | #include "stream.hpp" 7 | #include "counting/counting.hpp" 8 | #include "adjusting/adjusting.hpp" 9 | #include "last/last.hpp" 10 | #include "last/write.hpp" 11 | 12 | namespace tongrams { 13 | 14 | struct estimation { 15 | estimation(configuration const& config) 16 | : m_config(config) 17 | , m_tmp_data() 18 | , m_tmp_stats(config.max_order) 19 | , m_stats(config.max_order) { 20 | m_timings.reserve(3); 21 | std::cout << "{"; 22 | std::cout << "\"dataset\":" 23 | << boost::filesystem::path(config.text_filename).stem() 24 | << ", "; 25 | std::cout << "\"order\":" << config.max_order << ", "; 26 | std::cout << "\"RAM\":" << config.RAM << ", "; 27 | std::cout << "\"threads\":" << config.num_threads; 28 | } 29 | 30 | ~estimation() { 31 | std::cout << "}" << std::endl; 32 | } 33 | 34 | void run() { 35 | if (m_config.compress_blocks) { 36 | typedef fc::writer block_writer_type; 37 | run>( 38 | "counting"); 39 | } else { 40 | run>( 41 | "counting"); 42 | } 43 | 44 | m_stats.num_ngrams(1) = m_tmp_data.word_ids.size(); 45 | m_tmp_data.word_ids.clear(); 46 | auto handle = util::async_call(write_vocab); 47 | 48 | if (m_config.compress_blocks) { 49 | run>("adjusting"); 50 | } else { 51 | run>("adjusting"); 52 | } 53 | 54 | util::wait(handle); 55 | 56 | run<last>("last"); 57 | 58 | // util::clean_temporaries(m_config.tmp_dirname); 59 | } 60 | 61 | void print_stats() { 62 | std::cerr 63 | << "==== STATISTICS =======================================\n"; 64 | std::cerr << "total num. of words = " << m_stats.total_words() << "\n"; 65 | std::cerr << "total num. of grams = " << m_stats.total_grams() << "\n"; 66 | std::cerr << "probability of <unk> word = " << m_stats.unk_prob() 67 | << "\n"; 68 | m_stats.print(); 69 | int step = 1; 70 | for (auto t : m_timings) { 71 | std::cerr << "step-" << step << ": " << t << " [sec]\n"; 72 | ++step; 73 | } 74 | std::cerr << "=======================================================" 75 | << std::endl; 76 | } 77 | 78 | private: 79 | configuration const& m_config; 80 | tmp::data m_tmp_data; 81 | tmp::statistics m_tmp_stats; 82 | statistics m_stats; 83 | std::vector<double> m_timings; 84 | 85 | template <typename Step> 86 | void run(std::string const& name) { 87 | std::cout << ", "; 88 | std::cout << "\"" + name + "\": {"; 89 | auto start = clock_type::now(); 90 | Step step(m_config, m_tmp_data, m_tmp_stats, m_stats); 91 | step.run(); 92 | auto end = clock_type::now(); 93 | std::chrono::duration<double> elapsed = end - start; 94 | double total_time = elapsed.count(); 95 | m_timings.push_back(total_time); 96 | step.print_stats(); 97 | std::cout << "\"total\":" << total_time; 98 | std::cout << "}"; 99 | } 100 | 101 | std::function<void(void)> write_vocab = [&]() { 102 | std::ofstream os(m_config.vocab_tmp_subdirname + 103 | m_config.vocab_filename); 104 | size_t vocab_size = m_stats.num_ngrams(1); 105 | vocabulary vocab; 106 | m_tmp_data.vocab_builder.build(vocab); 107 | for (size_t id = 0; id != vocab_size; ++id) { 108 | util::write(os, vocab[id]); 109 | os << "\n"; 110 | } 111 | os.close(); 112 | }; 113 | }; 114 | } // namespace tongrams 115 | -------------------------------------------------------------------------------- /include/front_coding.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util_types.hpp" 4 | #include "../external/tongrams/include/vectors/bit_vector.hpp" 5
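
// NOTE (sketch): estimation::run above drives each pass (counting,
// adjusting, last) through the same run<Step> wrapper, which times the
// step and prints one JSON field. The pattern in isolation (timed_step
// is a hypothetical name; the real wrapper also forwards the shared
// temporary data and statistics):

#include <chrono>
#include <iostream>
#include <string>

template <typename Step, typename... Args>
double timed_step(std::string const& name, Args&... args) {
    auto start = std::chrono::steady_clock::now();
    Step step(args...);  // construct the pass over the shared state
    step.run();
    std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - start;
    std::cout << "\"" << name << "\": {\"total\":" << elapsed.count() << "}";
    return elapsed.count();
}
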
| 6 | #include 7 | 8 | namespace tongrams { 9 | namespace fc { 10 | 11 | const static std::streamsize BLOCK_BYTES = 64 * essentials::MiB; 12 | const static std::streamsize BLOCK_BITS = BLOCK_BYTES * 8; 13 | 14 | template <typename Comparator> 15 | struct writer { 16 | writer(uint8_t N) : m_comparator(N) {} 17 | 18 | template <typename Iterator> 19 | void write_block(std::ofstream& os, Iterator begin, Iterator end, size_t n, 20 | ngrams_block_statistics const& stats) { 21 | // in bytes 22 | uint8_t l = 1; 23 | uint8_t w = (util::ceil_log2(stats.max_word_id + 1) + 7) / 8; 24 | uint8_t v = (util::ceil_log2(stats.max_count + 1) + 7) / 8; 25 | essentials::save_pod(os, w); 26 | essentials::save_pod(os, v); 27 | // in bits 28 | l *= 8; 29 | w *= 8; 30 | v *= 8; 31 | uint8_t N = m_comparator.order(); 32 | size_t max_record_size = l + N * w + v; // in bits 33 | 34 | m_buffer.init(); 35 | m_buffer.reserve(BLOCK_BITS); 36 | 37 | auto explicit_write = [&](ngram_pointer ptr) { 38 | for (int i = 0; i < N; ++i) { 39 | m_buffer.append_bits(ptr[i], w); 40 | } 41 | m_buffer.append_bits(*(ptr.value(N)), v); 42 | }; 43 | 44 | auto prev_ptr = *begin; 45 | explicit_write(prev_ptr); 46 | ++begin; 47 | 48 | uint64_t written = 0; 49 | uint64_t num_ngrams_in_block = 1; // first is written explicitly 50 | for (uint64_t encoded = 0; begin != end; 51 | ++begin, ++encoded, ++num_ngrams_in_block) { 52 | int lcp = 0; 53 | auto ptr = *begin; 54 | 55 | if (BLOCK_BITS - m_buffer.size() < max_record_size) { 56 | // flush current buffer, inserting padding 57 | // always flush exactly BLOCK_BYTES bytes 58 | flush_buffer(os, BLOCK_BYTES, num_ngrams_in_block); 59 | m_buffer.init(); 60 | m_buffer.reserve(BLOCK_BITS); 61 | written = encoded; 62 | num_ngrams_in_block = 0; 63 | explicit_write(ptr); 64 | } else { 65 | lcp = m_comparator.lcp(ptr, prev_ptr); 66 | assert(lcp < N); 67 | m_buffer.append_bits(lcp, l); 68 | if (lcp == 0) { 69 | explicit_write(ptr); 70 | } else { 71 | int i = m_comparator.begin(); 72 | m_comparator.advance(i, lcp); 73 | for (;; m_comparator.next(i)) { 74 | m_buffer.append_bits(ptr[i], w); 75 | if (i == m_comparator.end()) break; 76 | } 77 | m_buffer.append_bits(*(ptr.value(N)), v); 78 | } 79 | } 80 | 81 | prev_ptr = ptr; 82 | } 83 | 84 | // save last block if needed 85 | if (written != n) { 86 | size_t bytes = (m_buffer.size() + 7) / 8; 87 | flush_buffer(os, bytes, num_ngrams_in_block); 88 | } 89 | } 90 | 91 | private: 92 | Comparator m_comparator; 93 | bit_vector_builder m_buffer; // NOTE: need a buffer because we do not know 94 | // how many ngrams we can compress in a block 95 | 96 | void flush_buffer(std::ofstream& os, size_t bytes, 97 | uint64_t num_ngrams_in_block) { 98 | assert(num_ngrams_in_block > 0); 99 | essentials::save_pod(os, num_ngrams_in_block); 100 | os.write(reinterpret_cast<char const*>(m_buffer.data().data()), bytes); 101 | } 102 | }; 103 | 104 | struct cache { 105 | cache() : pos(nullptr), m_begin(nullptr), m_data(0, 0) {} 106 | 107 | cache(uint8_t N) : m_data(ngrams_block::record_size(N), 0) { 108 | init(); 109 | } 110 | 111 | inline void init() { 112 | m_begin = m_data.data(); 113 | pos = m_begin; 114 | } 115 | 116 | inline uint8_t* begin() const { 117 | return m_begin; 118 | } 119 | 120 | void store(uint8_t const* src, size_t n) { 121 | std::memcpy(pos, src, n); 122 | } 123 | 124 | void swap(cache& other) { 125 | std::swap(pos, other.pos); 126 | std::swap(m_begin, other.m_begin); 127 | m_data.swap(other.m_data); 128 | } 129 | 130 | uint8_t* pos; 131 | 132 | private: 133 | uint8_t* m_begin; 134 | std::vector<uint8_t> m_data; 135 | }; 136 | 137 |
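
// NOTE (sketch): write_block above front-codes a sorted run of n-grams:
// the first record of a block is written explicitly, every following
// record as a 1-byte LCP with its predecessor plus only the non-shared
// word ids and the count. The core LCP computation, assuming plain
// prefix order (the real comparator can also visit positions in
// context order); lcp_words is a hypothetical helper:

#include <cstdint>

int lcp_words(uint32_t const* a, uint32_t const* b, int N) {
    // number of leading word ids the two N-grams share
    int i = 0;
    while (i != N && a[i] == b[i]) ++i;
    return i;
}
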
template 138 | struct ngrams_block { 139 | ngrams_block() {} 140 | 141 | struct fc_iterator { 142 | const static size_t W = sizeof(word_id); 143 | 144 | fc_iterator(uint8_t N, size_t pos, size_t size, 145 | ngrams_block& m_block) 146 | : m_it(m_block.m_memory.data()) 147 | , m_comparator(N) 148 | , m_back(N) 149 | , m_pos(pos) 150 | , m_size(size) 151 | , m_w(m_block.m_w) 152 | , m_v(m_block.m_v) { 153 | if (pos != size) decode_explicit(); 154 | } 155 | 156 | void swap(fc_iterator& other) { 157 | std::swap(m_it, other.m_it); 158 | m_comparator.swap(other.m_comparator); 159 | m_back.swap(other.m_back); 160 | m_back.init(); 161 | std::swap(m_pos, other.m_pos); 162 | std::swap(m_size, other.m_size); 163 | std::swap(m_w, other.m_w); 164 | std::swap(m_v, other.m_v); 165 | } 166 | 167 | fc_iterator(fc_iterator&& rhs) { 168 | *this = std::move(rhs); 169 | } 170 | 171 | inline fc_iterator& operator=(fc_iterator&& rhs) { 172 | if (this != &rhs) swap(rhs); 173 | return *this; 174 | }; 175 | 176 | fc_iterator(fc_iterator const& rhs) { 177 | *this = rhs; 178 | } 179 | 180 | fc_iterator& operator=(fc_iterator const& rhs) { 181 | if (this != &rhs) { 182 | m_it = rhs.m_it; 183 | m_comparator = rhs.m_comparator; 184 | m_back = rhs.m_back; 185 | m_back.init(); 186 | m_pos = rhs.m_pos; 187 | m_size = rhs.m_size; 188 | m_w = rhs.m_w; 189 | m_v = rhs.m_v; 190 | } 191 | return *this; 192 | }; 193 | 194 | bool operator==(fc_iterator const& rhs) { 195 | return m_pos == rhs.m_pos; 196 | } 197 | 198 | bool operator!=(fc_iterator const& rhs) { 199 | return not(*this == rhs); 200 | } 201 | 202 | inline ngram_pointer operator*() const { 203 | ngram_pointer ptr; 204 | ptr.data = reinterpret_cast(m_back.begin()); 205 | return ptr; 206 | } 207 | 208 | void operator++() { 209 | if (m_pos == m_size - 1) { 210 | ++m_pos; // one-past the end 211 | return; 212 | } 213 | decode(); 214 | ++m_pos; 215 | } 216 | 217 | private: 218 | uint8_t const* m_it; 219 | Comparator m_comparator; 220 | cache m_back; 221 | size_t m_pos, m_size; 222 | uint8_t m_w, m_v; 223 | 224 | void decode_value() { 225 | m_back.store(m_it, m_v); 226 | m_back.pos += sizeof(count_type); 227 | m_it += m_v; 228 | } 229 | 230 | void decode_explicit() { 231 | uint8_t N = m_comparator.order(); 232 | assert(m_back.pos == m_back.begin()); 233 | for (uint8_t i = 0; i < N; ++i) { 234 | m_back.store(m_it, m_w); 235 | m_back.pos += W; 236 | m_it += m_w; 237 | } 238 | decode_value(); 239 | } 240 | 241 | void decode() { 242 | m_back.init(); 243 | 244 | uint8_t lcp = *m_it++; 245 | if (lcp == 0) { 246 | decode_explicit(); 247 | return; 248 | } 249 | 250 | int i = m_comparator.begin(); 251 | m_comparator.advance(i, lcp); 252 | m_back.pos = m_back.begin() + i * W; 253 | uint8_t N = m_comparator.order(); 254 | assert(lcp < N); 255 | 256 | // store into [m_back] the other [N] - [lcp] word_ids 257 | for (int j = 0; j < N - lcp; ++j) { 258 | m_back.store(m_it, m_w); 259 | m_comparator.next(i); 260 | m_back.pos = m_back.begin() + i * W; 261 | m_it += m_w; 262 | } 263 | 264 | m_back.pos = m_back.begin() + N * W; 265 | decode_value(); 266 | } 267 | }; 268 | 269 | typedef fc_iterator iterator; 270 | 271 | ngrams_block(uint8_t N, size_t size, uint8_t w, uint8_t v) 272 | : m_size(size), m_N(N), m_w(w), m_v(v) {} 273 | 274 | void read(std::ifstream& is, size_t bytes) { 275 | m_memory.resize(bytes); 276 | is.read(reinterpret_cast(m_memory.data()), bytes); 277 | } 278 | 279 | template 280 | bool is_sorted(iterator begin, iterator end) { 281 | C comparator(m_N); 282 | auto it = begin; 
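
// NOTE (sketch): fc_iterator::decode above is the inverse of the writer:
// read the LCP byte, keep ids [0, lcp) from the previously decoded
// n-gram (held in the cache), then read the remaining N - lcp ids in
// w bytes each and the count in v bytes. The step on raw buffers,
// assuming prefix order (decode_step is a hypothetical helper):

#include <cstdint>
#include <cstring>

uint8_t const* decode_step(uint8_t const* in, uint32_t* back, int N, int w) {
    uint8_t lcp = *in++;          // lcp == 0 means a fully explicit record
    for (int i = lcp; i != N; ++i) {
        uint32_t id = 0;
        std::memcpy(&id, in, w);  // ids are stored in w <= 4 bytes
        back[i] = id;
        in += w;
    }
    return in;                    // the count follows, in v bytes
}
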
283 | 284 | size_t record_bytes = tongrams::ngrams_block::record_size(m_N); 285 | cache prev(m_N); 286 | prev.init(); 287 | prev.store(reinterpret_cast((*it).data), record_bytes); 288 | ngram_pointer prev_ptr; 289 | prev_ptr.data = reinterpret_cast(prev.begin()); 290 | 291 | ++it; 292 | bool ret = true; 293 | for (size_t i = 1; it != end; ++i, ++it) { 294 | auto curr_ptr = *it; 295 | int cmp = comparator.compare(prev_ptr, curr_ptr); 296 | if (cmp == 0) { 297 | std::cerr << "Error at " << i << "/" << size() << ":\n"; 298 | prev_ptr.print(m_N); 299 | curr_ptr.print(m_N); 300 | std::cerr << "Repeated ngrams" << std::endl; 301 | } 302 | 303 | if (cmp > 0) { 304 | std::cerr << "Error at " << i << "/" << size() << ":\n"; 305 | prev_ptr.print(m_N); 306 | curr_ptr.print(m_N); 307 | std::cerr << std::endl; 308 | ret = false; 309 | } 310 | prev.init(); 311 | prev.store(reinterpret_cast(curr_ptr.data), 312 | record_bytes); 313 | prev_ptr.data = reinterpret_cast(prev.begin()); 314 | } 315 | return ret; 316 | } 317 | 318 | void materialize_index() {} 319 | 320 | void swap(ngrams_block& other) { 321 | m_memory.swap(other.m_memory); 322 | std::swap(m_size, other.m_size); 323 | std::swap(m_N, other.m_N); 324 | std::swap(m_w, other.m_w); 325 | std::swap(m_v, other.m_v); 326 | } 327 | 328 | void release() { 329 | fc::ngrams_block().swap(*this); 330 | } 331 | 332 | friend struct fc_iterator; 333 | 334 | inline auto begin() { 335 | return fc_iterator(m_N, 0, m_size, *this); 336 | } 337 | 338 | inline auto end() { 339 | return fc_iterator(m_N, m_size, m_size, *this); 340 | } 341 | 342 | size_t size() const { 343 | return m_size; 344 | } 345 | 346 | private: 347 | std::vector m_memory; 348 | size_t m_size; 349 | uint8_t m_N; 350 | uint8_t m_w, m_v; 351 | }; 352 | 353 | } // namespace fc 354 | } // namespace tongrams 355 | -------------------------------------------------------------------------------- /include/last/estimation_builder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../external/tongrams/include/trie_prob_lm.hpp" 4 | 5 | namespace tongrams { 6 | 7 | template 9 | struct trie_prob_lm::estimation_builder { 11 | estimation_builder() {} 12 | 13 | estimation_builder(uint64_t order, configuration const& config, 14 | statistics& stats) 15 | : m_order(order) 16 | , m_unk_prob(stats.unk_prob()) 17 | , m_arrays(order) 18 | , m_next_positions(order, 0) { 19 | building_util::check_order(m_order); 20 | 21 | uint64_t vocab_size = stats.num_ngrams(1); 22 | size_t log_vocab_size = util::ceil_log2(vocab_size + 1); 23 | m_vocab_values.resize(vocab_size, 24 | 64); // values are not quantized 25 | 26 | m_arrays.front().pointers.resize( 27 | vocab_size + 1, util::ceil_log2(stats.num_ngrams(2) + 1)); 28 | m_probs.resize(order - 1); 29 | m_backoffs.resize(order - 2); 30 | 31 | std::vector probs; 32 | std::vector backoffs; 33 | size_t probs_levels = uint64_t(1) << config.probs_quantization_bits; 34 | size_t backoffs_levels = uint64_t(1) 35 | << config.backoffs_quantization_bits; 36 | double prob_quantum = 1.0 / probs_levels; 37 | double backoff_quantum = 1.0 / backoffs_levels; 38 | 39 | for (uint64_t ord = 2; ord <= m_order; ++ord) { 40 | uint64_t n = stats.num_ngrams(ord); 41 | auto& level = m_arrays[ord - 1]; 42 | level.word_ids.resize(n, log_vocab_size); 43 | level.probs_backoffs_ranks.resize( 44 | n, 45 | config.probs_quantization_bits + 46 | ((ord != m_order) ? 
config.backoffs_quantization_bits : 0)); 47 | probs.resize(probs_levels, 0.0); 48 | for (uint64_t i = 1; i != probs_levels + 1; ++i) { 49 | probs[i - 1] = std::log10(i * prob_quantum); 50 | } 51 | m_probs.add_sequence(ord - 1, config.probs_quantization_bits, 52 | probs); 53 | 54 | if (ord != m_order) { 55 | backoffs.resize(backoffs_levels + 1, 0.0); 56 | for (uint64_t i = 1; i != backoffs_levels + 1; ++i) { 57 | backoffs[i] = std::log10(i * backoff_quantum); 58 | } 59 | m_backoffs.add_sequence( 60 | ord - 1, config.backoffs_quantization_bits, backoffs); 61 | uint64_t pointer_bits = 62 | util::ceil_log2(stats.num_ngrams(ord + 1) + 1); 63 | level.pointers.resize(n + 1, pointer_bits); 64 | } 65 | } 66 | } 67 | 68 | void set_next_word(uint64_t n, word_id id) { 69 | assert(n >= 2 and n <= m_order); 70 | m_arrays[n - 1].word_ids.push_back(id); 71 | } 72 | 73 | void set_next_pointer(uint64_t n, uint64_t pointer) { 74 | assert(n >= 1 and n < m_order); 75 | m_arrays[n - 1].pointers.push_back(pointer); 76 | } 77 | 78 | void set_next_backoff(uint64_t n, float backoff) { 79 | assert(n >= 2 and n < m_order); 80 | uint64_t backoff_rank = 81 | m_backoffs.rank(n - 2, std::log10(backoff), 1 // reserved 82 | ); 83 | uint64_t& next_pos = m_next_positions[n - 1]; 84 | uint64_t prob_backoff_rank = 85 | m_arrays[n - 1].probs_backoffs_ranks[next_pos]; 86 | uint64_t probs_quantization_bits = m_probs.quantization_bits(n - 2); 87 | assert(probs_quantization_bits); 88 | prob_backoff_rank |= (backoff_rank << probs_quantization_bits); 89 | m_arrays[n - 1].probs_backoffs_ranks.push_back(prob_backoff_rank); 90 | ++next_pos; 91 | } 92 | 93 | void set_backoff(uint64_t n, uint64_t pos, float backoff) { 94 | assert(n >= 2 and n < m_order); 95 | uint64_t backoff_rank = 96 | m_backoffs.rank(n - 2, std::log10(backoff), 1 // reserved 97 | ); 98 | uint64_t prob_backoff_rank = m_arrays[n - 1].probs_backoffs_ranks[pos]; 99 | uint64_t probs_quantization_bits = m_probs.quantization_bits(n - 2); 100 | assert(probs_quantization_bits); 101 | prob_backoff_rank |= (backoff_rank << probs_quantization_bits); 102 | m_arrays[n - 1].probs_backoffs_ranks.set(pos, prob_backoff_rank); 103 | } 104 | 105 | void set_next_unigram_values(float prob, float backoff) { 106 | uint64_t packed = 0; 107 | bits::pack(packed, std::log10(prob), std::log10(backoff)); 108 | m_vocab_values.push_back(packed); 109 | } 110 | 111 | void set_unigram_values(uint64_t pos, float prob, float backoff) { 112 | uint64_t packed = 0; 113 | bits::pack(packed, std::log10(prob), std::log10(backoff)); 114 | m_vocab_values.set(pos, packed); 115 | } 116 | 117 | void set_word(uint64_t n, uint64_t pos, word_id id) { 118 | assert(n >= 2 and n <= m_order); 119 | m_arrays[n - 1].word_ids.set(pos, id); 120 | } 121 | 122 | void set_pointer(uint64_t n, uint64_t pos, uint64_t pointer) { 123 | assert(n >= 1 and n < m_order); 124 | m_arrays[n - 1].pointers.set(pos, pointer); 125 | } 126 | 127 | void set_prob(uint64_t n, uint64_t pos, float prob) { 128 | assert(n >= 2 and n <= m_order); 129 | uint64_t prob_backoff_rank = m_arrays[n - 1].probs_backoffs_ranks[pos]; 130 | uint64_t prob_rank = m_probs.rank(n - 2, std::log10(prob), 0); 131 | prob_backoff_rank |= prob_rank; 132 | m_arrays[n - 1].probs_backoffs_ranks.set(pos, prob_backoff_rank); 133 | } 134 | 135 | void build(trie_prob_lm& trie, configuration const& config) { 136 | trie.m_order = m_order; 137 | trie.m_unk_prob = std::log10(m_unk_prob); 138 | 139 | parallel_executor p(2); 140 | task_region(*(p.executor), [&](task_region_handle& trh) { 
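
// NOTE (sketch): set_prob and set_next_backoff above share one code word
// per n-gram: the low probs_quantization_bits hold the probability rank
// and the bits above hold the backoff rank. The arithmetic in isolation
// (hypothetical helpers):

#include <cassert>
#include <cstdint>

uint64_t pack_ranks(uint64_t prob_rank, uint64_t backoff_rank,
                    unsigned prob_bits) {
    assert(prob_rank < (uint64_t(1) << prob_bits));
    return prob_rank | (backoff_rank << prob_bits);
}

uint64_t unpack_prob_rank(uint64_t code, unsigned prob_bits) {
    return code & ((uint64_t(1) << prob_bits) - 1);
}

uint64_t unpack_backoff_rank(uint64_t code, unsigned prob_bits) {
    return code >> prob_bits;
}
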
141 | trh.run([&] { 142 | essentials::logger("building vocabulary"); 143 | uint64_t vocab_size = m_vocab_values.size(); 144 | vocabulary vocab; 145 | { 146 | size_t num_bytes = 147 | sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES); 148 | vocabulary::builder vocab_builder(vocab_size, num_bytes); 149 | vocab_builder.load(config.vocab_tmp_subdirname + 150 | config.vocab_filename); 151 | vocab_builder.build(vocab); 152 | } 153 | 154 | std::vector<byte_range> bytes; 155 | bytes.reserve(vocab_size); 156 | compact_vector::builder vocab_ids( 157 | vocab_size, util::ceil_log2(vocab_size + 1)); 158 | for (uint64_t id = 0; id < vocab_size; ++id) { 159 | bytes.emplace_back(vocab[id]); 160 | vocab_ids.push_back(id); 161 | } 162 | 163 | trie.m_vocab.build(bytes, 164 | compact_vector(), // use default hash-keys 165 | compact_vector(vocab_ids), 166 | compact_vector(m_vocab_values), 167 | identity_adaptor()); 168 | }); 169 | 170 | trh.run([&] { 171 | m_probs.build(trie.m_probs_averages); 172 | m_backoffs.build(trie.m_backoffs_averages); 173 | 174 | trie.m_arrays.resize(m_order); 175 | 176 | // #pragma omp parallel for 177 | for (uint64_t n = 2; n <= m_order; ++n) { 178 | if (n == m_order) { 179 | // prefix sums pointers for N-grams 180 | // std::cerr << "prefix summing pointers for " 181 | // << int(m_order) << "-grams" << std::endl; 182 | auto& pointers = m_arrays[n - 2].pointers; 183 | uint64_t prev = 0; 184 | for (uint64_t pos = 1; pos < pointers.size(); ++pos) { 185 | prev += pointers[pos]; 186 | pointers.set(pos, prev); 187 | } 188 | } 189 | 190 | // std::cerr << "building " << int(n) << "-level word_ids" 191 | // << std::endl; 192 | // std::cerr << "m_arrays[" << int(n) - 2 193 | // << "].pointers.back() = " 194 | // << m_arrays[n - 2].pointers.back() << "; "; 195 | // std::cerr << "m_arrays[" << int(n) - 1 196 | // << "].word_ids.size() = " 197 | // << m_arrays[n - 1].word_ids.size() << 198 | // std::endl; 199 | assert(m_arrays[n - 2].pointers.back() == 200 | m_arrays[n - 1].word_ids.size()); 201 | m_arrays[n - 1].build_word_ids(n, trie.m_arrays[n - 1], 202 | m_arrays[n - 2].pointers); 203 | m_arrays[n - 1].build_probs_backoffs_ranks( 204 | trie.m_arrays[n - 1]); 205 | // std::cerr << "DONE" << std::endl; 206 | } 207 | 208 | // #pragma omp parallel for 209 | for (uint64_t n = 1; n < m_order; ++n) { 210 | // std::cerr << "building " << int(n) << "-level pointers" 211 | // << std::endl; 212 | m_arrays[n - 1].build_pointers(trie.m_arrays[n - 1]); 213 | // std::cerr << "DONE" << std::endl; 214 | } 215 | }); 216 | }); 217 | 218 | estimation_builder().swap(*this); 219 | } 220 | 221 | // for debug 222 | // void print_stats() const { 223 | // int n = 1; 224 | // for (auto& l : m_arrays) { 225 | // std::cerr << "===========\n"; 226 | // std::cerr << "level-" << n << " statistics:\n"; 227 | // for (auto x : l.word_ids) { 228 | // std::cerr << x << " "; 229 | // } 230 | // std::cerr << std::endl; 231 | // for (auto x : l.probs_backoffs_ranks) { 232 | // std::cerr << x << " "; 233 | // } 234 | // std::cerr << std::endl; 235 | // for (auto x : l.pointers) { 236 | // std::cerr << x << " "; 237 | // } 238 | // std::cerr << std::endl; 239 | // ++n; 240 | // } 241 | // std::cerr << std::endl; 242 | // } 243 | 244 | void swap(estimation_builder& other) { 245 | std::swap(m_order, other.m_order); 246 | std::swap(m_unk_prob, other.m_unk_prob); 247 | m_vocab_values.swap(other.m_vocab_values); 248 | m_probs.swap(other.m_probs); 249 | m_backoffs.swap(other.m_backoffs); 250 | m_arrays.swap(other.m_arrays); 251 | } 252 | 253 | private:
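
// NOTE (sketch): the pointer arrays handled in build() above give the
// trie its CSR-like layout: after prefix-summing, pointers[i] and
// pointers[i + 1] delimit the children of node i in the next level's
// word_ids. The in-place scan on a plain vector (counts_to_offsets is
// a hypothetical helper mirroring the N-gram-level pass above):

#include <cstdint>
#include <vector>

void counts_to_offsets(std::vector<uint64_t>& pointers) {
    // pointers[0] stays 0; pointers[pos] holds a per-node count on entry
    uint64_t prev = 0;
    for (size_t pos = 1; pos != pointers.size(); ++pos) {
        prev += pointers[pos];
        pointers[pos] = prev;  // one-past the children of node pos - 1
    }
}
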
254 | uint64_t m_order; 255 | float m_unk_prob; 256 | compact_vector::builder m_vocab_values; 257 | typename Values::builder m_probs; 258 | typename Values::builder m_backoffs; 259 | std::vector m_arrays; 260 | std::vector m_next_positions; 261 | }; 262 | 263 | } // namespace tongrams -------------------------------------------------------------------------------- /include/last/index_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../external/tongrams/include/lm_types.hpp" 4 | 5 | namespace tongrams { 6 | 7 | typedef trie_prob_lm 14 | reversed_trie_index; 15 | 16 | } // namespace tongrams 17 | -------------------------------------------------------------------------------- /include/last/last.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "../external/tongrams/include/utils/util.hpp" 4 | 5 | #include "constants.hpp" 6 | #include "util.hpp" 7 | #include "stream.hpp" 8 | #include "estimation_builder.hpp" 9 | #include "index_types.hpp" 10 | 11 | namespace tongrams { 12 | 13 | struct last { 14 | typedef stream::floats_vec<> float_vector_type; 15 | 16 | last(configuration const& config, tmp::data& tmp_data, 17 | tmp::statistics& tmp_stats, statistics& stats) 18 | : m_config(config) 19 | , m_stream_generator(config.max_order) 20 | , m_tmp_data(tmp_data) 21 | , m_stats(stats) 22 | , m_tmp_stats(tmp_stats) 23 | , m_record_size(ngrams_block::record_size(config.max_order)) 24 | , m_pointers(config.max_order - 1, 0) 25 | , m_probs(config.max_order, float_vector_type(0)) 26 | , m_index_builder(config.max_order, config, stats) 27 | , m_current_block_id(0) 28 | , m_fetched_block_id(0) 29 | , m_num_blocks(tmp_data.blocks_offsets.size()) 30 | , m_CPU_time(0.0) 31 | , m_I_time(0.0) 32 | , m_O_time(0.0) { 33 | assert(m_num_blocks); 34 | std::cout << "processing " << m_num_blocks << " blocks" << std::endl; 35 | uint8_t N = m_config.max_order; 36 | { 37 | essentials::directory tmp_dir(m_config.tmp_dirname); 38 | for (auto const& filename : tmp_dir) { 39 | if (filename.extension == constants::file_extension::merged) { 40 | m_stream_generator.open(filename.fullpath); 41 | async_fetch_next_block(); 42 | break; 43 | } 44 | } 45 | } 46 | 47 | auto start = clock_type::now(); 48 | size_t vocab_size = m_stats.num_ngrams(1); 49 | for (uint8_t n = 2; n < N; ++n) { 50 | m_tmp_stats.resize(n, vocab_size); 51 | m_index_builder.set_next_pointer(n - 1, 0); 52 | } 53 | m_index_builder.set_next_pointer(N - 1, 0); 54 | auto end = clock_type::now(); 55 | std::chrono::duration elapsed = end - start; 56 | m_CPU_time += elapsed.count(); 57 | } 58 | 59 | void print_stats() const { 60 | std::cout << "\"CPU\":" << m_CPU_time << ", "; 61 | std::cout << "\"I\":" << m_I_time << ", "; 62 | std::cout << "\"O\":" << m_O_time << ", "; 63 | } 64 | 65 | void async_fetch_next_block() { 66 | if (m_fetched_block_id != m_num_blocks) { 67 | auto const& offsets = m_tmp_data.blocks_offsets[m_fetched_block_id]; 68 | // std::cout << "offsets:\n"; 69 | // for (auto off: offsets) { 70 | // std::cout << off << std::endl; 71 | // } 72 | size_t n = offsets.back(); 73 | assert(n > 0); 74 | m_stream_generator.async_fetch_next_block(n * m_record_size); 75 | ++m_fetched_block_id; 76 | } 77 | } 78 | 79 | void run() { 80 | auto start = clock_type::now(); 81 | 82 | for (; m_current_block_id < m_num_blocks;) { 83 | auto* block = m_stream_generator.get_block(); 84 | async_fetch_next_block(); 85 | uint8_t N = 
block->order(); 86 | 87 | for (uint8_t n = 1; n < N; ++n) { 88 | m_probs[n - 1].reserve(block->size()); 89 | } 90 | 91 | /* 92 | - n = 1: (empty context) the denominator is equal to the number 93 | of bi-grams; 94 | - n = N: the denominator is equal to the sum of the raw 95 | counts of N-grams having the same context; 96 | - 1 < n < N (otherwise): 97 | the denominator is equal to the sum of the modified counts of 98 | all n-grams having the same context. 99 | */ 100 | auto begin = block->begin(); 101 | auto end = block->end(); 102 | state s(N, begin, end); 103 | 104 | m_tmp_stats.clear(); 105 | // m_tmp_stats.print_stats(); 106 | 107 | while (s.iterators.back() != end) { 108 | for (uint8_t n = 2; n <= N; ++n) { 109 | auto& it = s.iterators[n - 1]; 110 | if (it == end) continue; 111 | auto prev_ptr = *it; 112 | for (; it != end; ++it) { 113 | auto ptr = *it; 114 | // std::cout << "scanning " << int(n) << ": "; 115 | // ptr.print(N); 116 | 117 | bool context_changes = 118 | !ptr.equal_to(prev_ptr, N - n, N - 1); 119 | if (context_changes) break; 120 | 121 | ++s.range_lengths[n - 1]; 122 | auto right = ptr[N - 1]; 123 | 124 | if (n == N) { 125 | uint64_t count = *(ptr.value(N)); 126 | s.N_gram_denominator += count; 127 | if (count < 5) { 128 | ++m_tmp_stats.r[N - 1][count - 1]; 129 | } else { 130 | ++m_tmp_stats.r[N - 1].back(); 131 | } 132 | } else { 133 | if (n == 2) { 134 | float u = unigram_prob(right); 135 | m_probs[0].push_back(u); 136 | } 137 | 138 | auto left = ptr[N - n - 1]; 139 | m_tmp_stats.update(n, left, right); 140 | auto prev_left = prev_ptr[N - n - 1]; 141 | if (left != prev_left) ++m_pointers[n - 2]; 142 | } 143 | 144 | prev_ptr = ptr; 145 | } 146 | write(n, s); 147 | } 148 | } 149 | 150 | // write last entries since [begin, end) is aligned 151 | // according to unigrams' boundaries 152 | for (uint8_t n = 2; n <= N; ++n) write(n, s); 153 | for (auto& p : m_probs) p.clear(); 154 | 155 | ++m_current_block_id; 156 | if (m_current_block_id % 20 == 0) { 157 | std::cerr << "processed " << m_current_block_id << "/" 158 | << m_num_blocks << " blocks" << std::endl; 159 | } 160 | 161 | m_stream_generator.release_block(); 162 | } 163 | 164 | auto end = clock_type::now(); 165 | std::chrono::duration elapsed = end - start; 166 | m_CPU_time += elapsed.count(); 167 | 168 | std::cerr << "processed " << m_current_block_id << "/" << m_num_blocks 169 | << " blocks" << std::endl; 170 | 171 | std::vector().swap(m_probs); 172 | 173 | // Close but do not destroy: deleting large file from disk is expensive 174 | // and we can do this after construction is over. 
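
// NOTE (sketch): the scan above collects, per context, the denominator
// and the count-of-count statistics that interpolated modified
// Kneser-Ney needs; write() further below then computes each probability as
//   p(w | ctx) = (c - D_n(c)) / denom(ctx) + b(ctx) * p(w | shorter ctx).
// The recurrence as a plain function (hypothetical helper):

#include <cstdint>

float kn_interpolated(uint64_t count, float discount, uint64_t denominator,
                      float backoff, float lower_order_prob) {
    float p = (static_cast<float>(count) - discount) /
              static_cast<float>(denominator);
    return p + backoff * lower_order_prob;  // interpolate with lower order
}
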
175 | m_stream_generator.close(); 176 | // m_index_builder.print_stats(); 177 | 178 | essentials::logger("compressing index"); 179 | start = clock_type::now(); 180 | reversed_trie_index index; 181 | m_index_builder.build(index, m_config); 182 | end = clock_type::now(); 183 | elapsed = end - start; 184 | std::cerr << "compressing index took: " << elapsed.count() << " [sec]" 185 | << std::endl; 186 | m_CPU_time += elapsed.count(); 187 | 188 | essentials::logger("writing index"); 189 | start = clock_type::now(); 190 | binary_header bin_header; 191 | bin_header.remapping_order = 0; 192 | bin_header.data_structure_t = data_structure_type::pef_trie; 193 | bin_header.value_t = value_type::prob_backoff; 194 | util::save(bin_header.get(), index, m_config.output_filename.c_str()); 195 | end = clock_type::now(); 196 | elapsed = end - start; 197 | std::cerr << "flushing index took: " << elapsed.count() << " [sec]" 198 | << std::endl; 199 | m_O_time = elapsed.count(); 200 | m_I_time = m_stream_generator.I_time(); 201 | } 202 | 203 | private: 204 | configuration const& m_config; 205 | stream::uncompressed_stream_generator m_stream_generator; 206 | tmp::data& m_tmp_data; 207 | statistics& m_stats; 208 | 209 | tmp::statistics m_tmp_stats; 210 | size_t m_record_size; 211 | 212 | std::vector m_pointers; 213 | std::vector m_probs; // buffer of uncompressed probs 214 | 215 | reversed_trie_index::estimation_builder m_index_builder; 216 | 217 | uint64_t m_current_block_id; 218 | uint64_t m_fetched_block_id; 219 | uint64_t m_num_blocks; 220 | double m_CPU_time; 221 | double m_I_time; 222 | double m_O_time; 223 | 224 | struct state { 225 | state(uint8_t N, ngrams_block::iterator begin, 226 | ngrams_block::iterator end) 227 | : range_lengths(N, 0) 228 | , probs_offsets(N, 0) 229 | , iterators(N, begin) 230 | , end(end) 231 | , N_gram_denominator(0) {} 232 | std::vector range_lengths; 233 | std::vector probs_offsets; 234 | std::vector iterators; 235 | const ngrams_block::iterator end; 236 | uint64_t N_gram_denominator; 237 | }; 238 | 239 | float unigram_prob(word_id w); 240 | void write(uint8_t n, state& s); 241 | }; 242 | 243 | } // namespace tongrams -------------------------------------------------------------------------------- /include/last/write.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "last.hpp" 4 | 5 | namespace tongrams { 6 | 7 | float last::unigram_prob(word_id w) { 8 | uint64_t uni_gram_count = m_tmp_stats.occs[0][w]; 9 | uint64_t uni_gram_denominator = m_stats.num_ngrams(2); 10 | float u = 11 | (static_cast(uni_gram_count) - m_stats.D(1, uni_gram_count)) / 12 | uni_gram_denominator; 13 | u += m_stats.unk_prob(); // interpolate 14 | assert(u <= 1.0); 15 | return u; 16 | } 17 | 18 | void last::write(uint8_t n, last::state& s) { // write ngram 19 | uint8_t N = m_config.max_order; 20 | assert(n >= 2 and n <= N); 21 | 22 | auto& l = s.range_lengths[n - 1]; 23 | if (l == 0) return; 24 | auto it = s.iterators[n - 1]; 25 | auto prev_ptr = *(it - 1); // always one-past the end 26 | 27 | if (n != 2) { 28 | auto left = prev_ptr[N - n]; 29 | m_index_builder.set_next_word(n - 1, left); 30 | } 31 | 32 | if (n != N) { 33 | ++m_pointers[n - 2]; 34 | auto pointer = m_pointers[n - 2]; 35 | m_index_builder.set_next_pointer(n - 1, pointer); 36 | } 37 | 38 | float backoff = 0.0; // backoff numerator 39 | // D_n(1) * N_n(1) + D_n(2) * N_n(2) + D_n(3) * N_n(>= 3), 40 | // where: N_n(c) = # n-grams with modified count equal to c 41 | // N_n(>= 3) = 
# n-grams with modified count >= 3 42 | for (uint64_t k = 1; k <= 5; ++k) { 43 | auto& c = m_tmp_stats.r[n - 1][k - 1]; 44 | backoff += c * m_stats.D(n, k); // = D(n, 3) for k >= 3 45 | c = 0; // reset current range counts 46 | } 47 | 48 | auto& offset = s.probs_offsets[n - 1]; 49 | assert(offset < m_probs[n - 2].size()); 50 | 51 | if (n != N) { 52 | ++m_tmp_stats.current_range_id[n - 1]; 53 | 54 | uint64_t denominator = 0; 55 | std::for_each(it - l, it, [&](auto ptr) { 56 | auto right = ptr[N - 1]; 57 | if (m_tmp_stats.was_not_seen(n, right)) { 58 | uint64_t count = m_tmp_stats.occs[n - 1][right]; 59 | denominator += count; 60 | } 61 | }); 62 | assert(denominator > 0); 63 | assert(backoff <= denominator); 64 | backoff /= denominator; 65 | 66 | ++m_tmp_stats.current_range_id[n - 1]; 67 | 68 | std::for_each(it - l, it, [&](auto ptr) { 69 | auto right = ptr[N - 1]; 70 | uint64_t count = m_tmp_stats.occs[n - 1][right]; 71 | assert(count > 0); 72 | float prob = 73 | (static_cast(count) - m_stats.D(n, count)) / denominator; 74 | prob += backoff * m_probs[n - 2][offset]; 75 | 76 | if (m_tmp_stats.was_not_seen(n, right)) { 77 | auto& pos = m_tmp_data.probs_offsets[n - 1][right]; 78 | m_index_builder.set_prob(n, pos, prob); 79 | if (n == N - 1) { 80 | m_index_builder.set_pointer(n, pos + 1, count); 81 | } 82 | ++pos; 83 | } 84 | 85 | assert(prob <= 1.0); 86 | m_probs[n - 1].push_back(prob); 87 | ++offset; 88 | }); 89 | 90 | ++m_tmp_stats.current_range_id[n - 1]; 91 | 92 | } else { // N-gram case 93 | 94 | assert(s.N_gram_denominator > 0); 95 | assert(backoff <= s.N_gram_denominator); 96 | backoff /= s.N_gram_denominator; 97 | 98 | std::for_each(it - l, it, [&](auto ptr) { 99 | uint64_t count = *(ptr.value(N)); 100 | assert(count > 0); 101 | float prob = (static_cast(count) - m_stats.D(N, count)) / 102 | s.N_gram_denominator; 103 | prob += backoff * m_probs[N - 2][offset]; // interpolate 104 | assert(prob <= 1.0); 105 | 106 | auto right = ptr[N - 1]; 107 | auto& pos = m_tmp_data.probs_offsets[N - 1][right]; 108 | 109 | m_index_builder.set_prob(N, pos, prob); 110 | m_index_builder.set_word(N, pos, 111 | ptr[0]); // for suffix order 112 | ++pos; 113 | }); 114 | 115 | if (it != s.end) s.N_gram_denominator = *(it->value(N)); 116 | } 117 | 118 | if (n == 2) { 119 | auto context = prev_ptr[N - 2]; 120 | float u = unigram_prob(context); 121 | m_index_builder.set_next_unigram_values(u, backoff); 122 | } else { 123 | m_index_builder.set_next_backoff(n - 1, backoff); 124 | } 125 | 126 | if (n != N) s.probs_offsets[n] = 0; // reset next order's offset 127 | 128 | l = 0; 129 | }; 130 | 131 | } // namespace tongrams -------------------------------------------------------------------------------- /include/merge_utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util_types.hpp" 4 | 5 | #include 6 | #include 7 | 8 | namespace tongrams { 9 | 10 | template 11 | struct cursor { 12 | cursor(Iterator const& begin, Iterator const& end, uint64_t i) 13 | : range(begin, end), index(i) {} 14 | 15 | iterator_range range; 16 | uint64_t index; 17 | }; 18 | 19 | template 20 | struct cursor_comparator { 21 | cursor_comparator() {} 22 | cursor_comparator(uint8_t ngram_order) : m_comparator(ngram_order) {} 23 | 24 | template 25 | bool operator()(cursor& l, cursor& r) { 26 | return m_comparator.compare(l.range.begin.operator*(), 27 | r.range.begin.operator*()) >= 0; 28 | } 29 | 30 | private: 31 | Comparator m_comparator; 32 | }; 33 | 34 | template 35 | 
struct min_heap { 36 | min_heap(Comparator comparator) : m_comparator(comparator) {} 37 | 38 | void push(T const& t) { 39 | m_q.push_back(t); 40 | std::push_heap(m_q.begin(), m_q.end(), m_comparator); 41 | } 42 | 43 | T& top() { 44 | return m_q.front(); 45 | } 46 | 47 | void pop() { 48 | std::pop_heap(m_q.begin(), m_q.end(), m_comparator); 49 | m_q.pop_back(); 50 | } 51 | 52 | void heapify() { 53 | sink(0); 54 | } 55 | 56 | void clear() { 57 | m_q.clear(); 58 | } 59 | 60 | bool empty() const { 61 | return m_q.empty(); 62 | } 63 | 64 | inline uint64_t size() const { 65 | return m_q.size(); 66 | } 67 | 68 | private: 69 | std::vector m_q; 70 | Comparator m_comparator; 71 | 72 | void sink(uint64_t pos) { 73 | assert(pos <= size()); 74 | while (2 * pos + 1 < size()) { 75 | uint64_t i = 2 * pos + 1; 76 | if (i + 1 < size() and m_comparator(m_q[i], m_q[i + 1])) ++i; 77 | if (!m_comparator(m_q[pos], m_q[i])) break; 78 | std::swap(m_q[pos], m_q[i]); 79 | pos = i; 80 | } 81 | } 82 | }; 83 | 84 | } // namespace tongrams 85 | -------------------------------------------------------------------------------- /include/merging/merging.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.hpp" 4 | #include "constants.hpp" 5 | #include "stream.hpp" 6 | #include "merge_utils.hpp" 7 | #include "merging_writer.hpp" 8 | 9 | namespace tongrams { 10 | 11 | template 12 | struct merging { 13 | typedef cursor_comparator 14 | cursor_comparator_type; 15 | 16 | merging(configuration const& config, tmp::data& tmp_data, 17 | tmp::statistics& /*tmp_stats*/, statistics& /*stats*/) 18 | : m_config(config) 19 | , m_writer(config, tmp_data) 20 | , m_comparator(config.max_order) 21 | , m_cursors(cursor_comparator_type(config.max_order)) {} 22 | 23 | typedef typename StreamGenerator::block_type input_block_type; 24 | 25 | void run() { 26 | std::vector filenames; 27 | { 28 | essentials::directory tmp_dir(m_config.tmp_dirname); 29 | for (auto const& filename : tmp_dir) { 30 | if (filename.extension == constants::file_extension::counts) { 31 | filenames.push_back(filename.fullpath); 32 | } 33 | } 34 | } 35 | 36 | uint8_t N = m_config.max_order; 37 | size_t num_files_to_merge = filenames.size(); 38 | assert(num_files_to_merge > 0); 39 | std::cerr << "merging " << num_files_to_merge << " files" << std::endl; 40 | 41 | uint64_t record_size = ngrams_block::record_size(N); 42 | uint64_t min_load_size = m_config.RAM / (2 * num_files_to_merge + 1) / 43 | record_size * record_size; 44 | uint64_t default_load_size = 45 | (64 * essentials::MiB) / record_size * record_size; 46 | uint64_t load_size = default_load_size; 47 | if (min_load_size < default_load_size) { 48 | std::cerr << "using min. 
load size of " << min_load_size 49 | << " because not enough RAM is available" << std::endl; 50 | load_size = min_load_size; 51 | } 52 | assert(load_size % record_size == 0); 53 | 54 | for (auto const& filename : filenames) { 55 | m_stream_generators.emplace_back(N); 56 | auto& gen = m_stream_generators.back(); 57 | gen.open(filename); 58 | assert(gen.size() == 0); 59 | gen.fetch_next_block(load_size); 60 | } 61 | 62 | auto get_block = [](StreamGenerator& gen) { 63 | auto* block = gen.get_block(); 64 | assert(block->template is_sorted( 65 | block->begin(), block->end())); 66 | return block; 67 | }; 68 | 69 | assert(m_cursors.empty()); 70 | for (uint64_t k = 0; k != m_stream_generators.size(); ++k) { 71 | auto& gen = m_stream_generators[k]; 72 | auto* block = get_block(gen); 73 | cursor c(block->begin(), 74 | block->end(), k); 75 | m_cursors.push(c); 76 | } 77 | 78 | uint64_t num_ngrams_per_block = load_size / record_size; 79 | std::cerr << "num_ngrams_per_block = " << num_ngrams_per_block 80 | << " ngrams" << std::endl; 81 | 82 | ngrams_block result(N); 83 | result.resize_memory(num_ngrams_per_block); 84 | result.reserve_index(num_ngrams_per_block); 85 | 86 | m_writer.start(); 87 | 88 | while (!m_cursors.empty()) { 89 | auto& top = m_cursors.top(); 90 | auto min = *(top.range.begin); 91 | 92 | if (!result.size()) { 93 | result.push_back(min.data, min.data + N, *(min.value(N))); 94 | } else { 95 | auto& back = result.back(); 96 | bool equal = equal_to(min.data, back.data, sizeof_ngram(N)); 97 | if (!equal) { 98 | if (result.size() == num_ngrams_per_block) { 99 | while (m_writer.size() > 0) 100 | ; // wait for flush 101 | m_writer.push(result); 102 | 103 | result.init(N); 104 | result.resize_memory(num_ngrams_per_block); 105 | result.reserve_index(num_ngrams_per_block); 106 | assert(result.empty()); 107 | } 108 | result.push_back(min.data, min.data + N, *(min.value(N))); 109 | } else { 110 | *(back.value(N)) += *(min.value(N)); 111 | } 112 | } 113 | 114 | ++(top.range.begin); 115 | 116 | if (top.range.begin == top.range.end) { 117 | auto& gen = m_stream_generators[top.index]; 118 | gen.release_block(); 119 | if (gen.eos()) { 120 | assert(gen.empty()); 121 | gen.close_and_remove(); 122 | m_cursors.pop(); 123 | } else { 124 | gen.fetch_next_block(load_size); 125 | auto* block = get_block(gen); 126 | top.range.begin = block->begin(); 127 | top.range.end = block->end(); 128 | } 129 | } 130 | 131 | m_cursors.heapify(); 132 | } 133 | 134 | m_writer.push(result); 135 | m_writer.terminate(); 136 | } 137 | 138 | private: 139 | configuration const& m_config; 140 | std::deque m_stream_generators; 141 | merging_writer m_writer; 142 | prefix_order_comparator_type m_comparator; 143 | 144 | min_heap, 145 | cursor_comparator_type> 146 | m_cursors; 147 | }; 148 | 149 | } // namespace tongrams 150 | -------------------------------------------------------------------------------- /include/merging/merging_writer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "configuration.hpp" 4 | #include "tmp.hpp" 5 | 6 | namespace tongrams { 7 | 8 | struct merging_writer { 9 | merging_writer(configuration const& config, tmp::data& tmp_data) 10 | : m_num_flushes(0), m_order(config.max_order), m_ngrams(0) { 11 | m_buffer.open(); 12 | m_os.open(config.output_filename.c_str(), 13 | std::ofstream::ate | std::ofstream::app); 14 | 15 | tmp_data.vocab_builder.build(m_vocab); 16 | 17 | // 18446744073709551615\n 18 | static std::string empty_line = " \n"; 19 | 
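
// NOTE (sketch): the loop in merging::run above is a classic k-way merge:
// one cursor per sorted temporary file sits in a min-heap; the smallest
// head is popped, equal n-grams have their counts summed, and the heap is
// re-ordered after each advance. The same pattern on plain sorted vectors
// (kway_merge is a hypothetical standalone example):

#include <cstdint>
#include <queue>
#include <utility>
#include <vector>

std::vector<uint64_t> kway_merge(
    std::vector<std::vector<uint64_t>> const& runs) {
    typedef std::pair<uint64_t, size_t> cursor;  // (head value, run index)
    auto cmp = [](cursor const& a, cursor const& b) {
        return a.first > b.first;  // min-heap on the head value
    };
    std::priority_queue<cursor, std::vector<cursor>, decltype(cmp)> heap(cmp);
    std::vector<size_t> pos(runs.size(), 0);
    for (size_t i = 0; i != runs.size(); ++i)
        if (!runs[i].empty()) heap.push(cursor(runs[i][0], i));
    std::vector<uint64_t> out;
    while (!heap.empty()) {
        cursor c = heap.top();
        heap.pop();
        out.push_back(c.first);
        size_t i = c.second;
        if (++pos[i] != runs[i].size()) heap.push(cursor(runs[i][pos[i]], i));
    }
    return out;
}
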
m_os.write(empty_line.data(), empty_line.size()); 20 | 21 | m_params.path = config.output_filename; 22 | m_params.offset = 0; 23 | m_params.length = sysconf(_SC_PAGESIZE); 24 | } 25 | 26 | ~merging_writer() { 27 | if (!m_buffer.empty()) { 28 | std::cerr << "Error: some data still need to be written" 29 | << std::endl; 30 | std::terminate(); 31 | } 32 | } 33 | 34 | void start() { 35 | m_thread = std::thread(&merging_writer::run, this); 36 | } 37 | 38 | void terminate() { 39 | m_buffer.lock(); 40 | m_buffer.close(); 41 | m_buffer.unlock(); 42 | if (m_thread.joinable()) m_thread.join(); 43 | assert(!m_buffer.active()); 44 | while (!m_buffer.empty()) flush(); 45 | m_os.close(); 46 | 47 | // write number of ngrams at the beginning of file 48 | boost::iostreams::mapped_file_sink tmp(m_params); 49 | char* data = tmp.data(); 50 | std::string str = std::to_string(m_ngrams); 51 | memcpy(data, str.data(), str.size()); 52 | tmp.close(); 53 | 54 | std::cerr << "\tmerging_writer thread stats:\n"; 55 | std::cerr << "\tflushed blocks: " << m_num_flushes << "\n"; 56 | std::cerr << "\tflushed ngrams: " << m_ngrams << "\n"; 57 | } 58 | 59 | void push(ngrams_block& block) { 60 | m_buffer.lock(); 61 | m_buffer.push(block); 62 | m_buffer.unlock(); 63 | } 64 | 65 | size_t size() { 66 | m_buffer.lock(); 67 | size_t s = m_buffer.size(); 68 | m_buffer.unlock(); 69 | return s; 70 | } 71 | 72 | private: 73 | semi_sync_queue m_buffer; 74 | std::ofstream m_os; 75 | std::thread m_thread; 76 | uint64_t m_num_flushes; 77 | uint64_t m_order; 78 | uint64_t m_ngrams; 79 | vocabulary m_vocab; 80 | boost::iostreams::mapped_file_params m_params; 81 | 82 | void run() { 83 | while (m_buffer.active()) flush(); 84 | } 85 | 86 | void flush() { 87 | m_buffer.lock(); 88 | if (m_buffer.empty()) { 89 | m_buffer.unlock(); 90 | return; 91 | } 92 | auto& block = m_buffer.pick(); 93 | m_buffer.unlock(); 94 | 95 | for (auto const ngram : block) { 96 | for (uint64_t i = 0; i != m_order; ++i) { 97 | auto br = m_vocab[ngram[i]]; 98 | util::write(m_os, br); 99 | if (i != m_order - 1) m_os << " "; 100 | } 101 | m_os << "\t" << *ngram.value(m_order) << "\n"; 102 | } 103 | 104 | m_ngrams += block.size(); 105 | block.release(); 106 | 107 | m_buffer.lock(); 108 | m_buffer.pop(); 109 | m_buffer.unlock(); 110 | ++m_num_flushes; 111 | if (m_num_flushes % 20 == 0) { 112 | std::cerr << "flushed " << m_num_flushes << " blocks" << std::endl; 113 | } 114 | } 115 | }; 116 | 117 | } // namespace tongrams -------------------------------------------------------------------------------- /include/ngrams_block.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util_types.hpp" 4 | #include "comparators.hpp" 5 | #include "../external/tongrams/include/utils/util.hpp" 6 | 7 | namespace tongrams { 8 | 9 | struct ngram_pointer { 10 | inline word_id operator[](size_t i) const { 11 | return data[i]; 12 | } 13 | 14 | inline count_type* value(uint8_t order) const { 15 | return reinterpret_cast(data + order); 16 | } 17 | 18 | inline bool equal_to(ngram_pointer const& other, size_t begin, 19 | size_t end) const { 20 | return memcmp(other.data + begin, this->data + begin, 21 | (end - begin) * sizeof(word_id)) == 0; 22 | } 23 | 24 | void print(uint8_t order) const { 25 | for (uint8_t i = 0; i < order; ++i) { 26 | std::cerr << data[i] << " "; 27 | } 28 | std::cerr << "[" << value(order) << "]\n"; 29 | } 30 | 31 | word_id* data; 32 | }; 33 | 34 | typedef context_order_comparator context_order_comparator_type; 35 
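
// NOTE (sketch): every record handled by ngram_pointer above is a flat
// byte run of N word ids immediately followed by one count, which is why
// value(order) simply reinterprets data + order. Reading the count of
// record i from a packed buffer, assuming 32-bit word ids and a 64-bit
// count (read_count is a hypothetical helper):

#include <cstdint>
#include <cstring>

uint64_t read_count(uint8_t const* buffer, uint64_t i, uint8_t N) {
    size_t record = N * sizeof(uint32_t) + sizeof(uint64_t);
    uint64_t count = 0;
    std::memcpy(&count, buffer + i * record + N * sizeof(uint32_t),
                sizeof count);
    return count;
}
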
| typedef prefix_order_comparator prefix_order_comparator_type; 36 | 37 | struct ngrams_block_statistics { 38 | word_id max_word_id; 39 | uint64_t max_count; 40 | }; 41 | 42 | struct ngrams_allocator { 43 | ngrams_allocator() : m_offset(0), m_alignment(0) {} 44 | 45 | ngrams_allocator(uint8_t order) { 46 | init(order); 47 | } 48 | 49 | void init(uint8_t order) { 50 | m_offset = 0; 51 | m_alignment = sizeof_ngram(order); 52 | } 53 | 54 | void resize(std::vector& memory, uint64_t num_ngrams) { 55 | memory.resize((m_alignment + sizeof(count_type)) * num_ngrams); 56 | } 57 | 58 | template 59 | void construct(ngram_pointer& ptr, Iterator begin, Iterator end, 60 | count_type count) { 61 | uint64_t n = 0; 62 | for (; begin != end; ++n, ++begin) ptr.data[n] = *begin; 63 | *(ptr.value(n)) = count; 64 | } 65 | 66 | auto allocate(std::vector& memory) { 67 | assert(m_offset < memory.size()); 68 | ngram_pointer ptr; 69 | ptr.data = reinterpret_cast(&memory[m_offset]); 70 | m_offset += m_alignment + sizeof(count_type); 71 | return ptr; 72 | } 73 | 74 | auto allocate(std::vector& memory, uint64_t i) { 75 | uint64_t offset = i * (m_alignment + sizeof(count_type)); 76 | assert(offset < memory.size()); 77 | ngram_pointer ptr; 78 | ptr.data = reinterpret_cast(&memory[offset]); 79 | return ptr; 80 | } 81 | 82 | uint8_t order() const { 83 | return m_alignment / sizeof(word_id); 84 | } 85 | 86 | void swap(ngrams_allocator& other) { 87 | std::swap(m_offset, other.m_offset); 88 | std::swap(m_alignment, other.m_alignment); 89 | } 90 | 91 | private: 92 | uint64_t m_offset; 93 | uint64_t m_alignment; 94 | }; 95 | 96 | struct ngrams_block { 97 | typedef typename std::vector::iterator iterator; 98 | 99 | ngrams_block() {} 100 | 101 | ngrams_block(uint8_t order) { 102 | init(order); 103 | } 104 | 105 | ngrams_block(ngrams_block&& rhs) { 106 | *this = std::move(rhs); 107 | } 108 | 109 | void init(uint8_t order) { 110 | stats = {0, 0}; 111 | m_memory.resize(0); 112 | m_allocator.init(order); 113 | m_index.resize(0); 114 | } 115 | 116 | inline ngrams_block& operator=(ngrams_block&& rhs) { 117 | if (this != &rhs) swap(rhs); 118 | return *this; 119 | }; 120 | 121 | ngrams_block(ngrams_block const&) { 122 | assert(false); 123 | } 124 | 125 | ngrams_block& operator=(ngrams_block const&) { 126 | assert(false); 127 | return *this; 128 | }; 129 | 130 | inline static size_t record_size(uint8_t order) { 131 | return sizeof_ngram(order) + sizeof(count_type); 132 | } 133 | 134 | inline uint64_t record_size() const { 135 | return record_size(order()); 136 | } 137 | 138 | void resize_memory(uint64_t num_ngrams) { 139 | m_allocator.resize(m_memory, num_ngrams); 140 | } 141 | 142 | void reserve_index(uint64_t num_ngrams) { 143 | m_index.reserve(num_ngrams); 144 | } 145 | 146 | void resize_index(uint64_t num_ngrams) { 147 | m_index.resize(num_ngrams); 148 | } 149 | 150 | void release() { 151 | ngrams_block().swap(*this); 152 | } 153 | 154 | void push_back(ngram_pointer ptr) { 155 | m_index.push_back(ptr); 156 | } 157 | 158 | template 159 | void push_back(Iterator begin, Iterator end, count_type count) { 160 | auto ptr = m_allocator.allocate(m_memory); 161 | m_allocator.construct(ptr, begin, end, count); 162 | push_back(ptr); 163 | } 164 | 165 | template 166 | void set(uint64_t i, Iterator begin, Iterator end, count_type count) { 167 | assert(i < size()); 168 | auto ptr = m_allocator.allocate(m_memory, i); 169 | m_allocator.construct(ptr, begin, end, count); 170 | m_index[i] = ptr; 171 | } 172 | 173 | inline size_t size() const { 174 | 
return m_index.size(); 175 | } 176 | 177 | inline bool empty() const { 178 | return m_index.empty(); 179 | } 180 | 181 | inline uint8_t order() const { 182 | return m_allocator.order(); 183 | } 184 | 185 | void write_memory(std::ofstream& os) { 186 | assert(m_memory.size() > 0); 187 | std::streamsize num_bytes = size() * record_size(); 188 | os.write(reinterpret_cast(m_memory.data()), num_bytes); 189 | } 190 | 191 | char* initialize_memory(size_t num_bytes) { 192 | m_memory.resize(num_bytes); 193 | return reinterpret_cast(m_memory.data()); 194 | } 195 | 196 | char* read_bytes(std::ifstream& is, char* dest, size_t num_bytes) { 197 | is.read(dest, static_cast(num_bytes)); 198 | dest += num_bytes; 199 | return dest; 200 | } 201 | 202 | void materialize_index(uint64_t num_ngrams) { 203 | m_index.clear(); 204 | m_index.reserve(num_ngrams); 205 | assert(m_memory.size() > 0); 206 | for (uint64_t i = 0; i != num_ngrams; ++i) { 207 | auto ptr = m_allocator.allocate(m_memory); 208 | push_back(ptr); 209 | } 210 | assert(size() == num_ngrams); 211 | } 212 | 213 | inline ngram_pointer operator[](size_t i) { 214 | assert(i < size()); 215 | return m_index[i]; 216 | } 217 | 218 | inline count_type& value(size_t i) { 219 | assert(i < size()); 220 | return *(m_index[i].value(order())); 221 | } 222 | 223 | inline iterator begin() { 224 | return m_index.begin(); 225 | } 226 | 227 | inline iterator end() { 228 | return m_index.end(); 229 | } 230 | 231 | inline ngram_pointer& front() { 232 | return m_index.front(); 233 | } 234 | 235 | inline ngram_pointer& back() { 236 | return m_index.back(); 237 | } 238 | 239 | size_t num_bytes() const { 240 | return m_memory.size(); 241 | } 242 | 243 | template 244 | bool is_sorted(Iterator begin, Iterator end) { 245 | std::cerr << "checking if block is sorted..."; 246 | uint8_t N = order(); 247 | Comparator comparator(N); 248 | auto it = begin; 249 | auto prev = *it; 250 | ++it; 251 | bool ret = true; 252 | for (size_t i = 1; it != end; ++i, ++it) { 253 | auto curr = *it; 254 | int cmp = comparator.compare(prev, curr); 255 | if (cmp == 0) { 256 | std::cerr << "Error at " << i << "/" << size() << ":\n"; 257 | prev.print(N); 258 | curr.print(N); 259 | std::cerr << "Repeated ngrams" << std::endl; 260 | } 261 | if (cmp > 0) { 262 | std::cerr << "Error at " << i << "/" << size() << ":\n"; 263 | prev.print(N); 264 | curr.print(N); 265 | std::cerr << std::endl; 266 | ret = false; 267 | } 268 | prev = curr; 269 | } 270 | if (ret) std::cerr << "OK!" 
<< std::endl; 271 | return ret; 272 | } 273 | 274 | void swap(ngrams_block& other) { 275 | std::swap(stats.max_word_id, other.stats.max_word_id); 276 | std::swap(stats.max_count, other.stats.max_count); 277 | m_memory.swap(other.m_memory); 278 | m_allocator.swap(other.m_allocator); 279 | m_index.swap(other.m_index); 280 | } 281 | 282 | ngrams_block_statistics stats; 283 | 284 | protected: 285 | std::vector m_memory; 286 | ngrams_allocator m_allocator; 287 | std::vector m_index; 288 | }; 289 | 290 | struct ngram_cache { 291 | ngram_cache() : m_empty(true) {} 292 | 293 | typedef ngram_pointer pointer; 294 | 295 | ngram_cache(uint8_t order) { 296 | init(order); 297 | } 298 | 299 | void init(uint8_t order) { 300 | m_data.resize(ngrams_block::record_size(order)); 301 | m_empty = true; 302 | } 303 | 304 | pointer get() { 305 | pointer ptr; 306 | ptr.data = reinterpret_cast(m_data.data()); 307 | return ptr; 308 | } 309 | 310 | void store(pointer const& ptr) { 311 | std::memcpy(m_data.data(), ptr.data, m_data.size()); 312 | m_empty = false; 313 | } 314 | 315 | bool empty() const { 316 | return m_empty; 317 | } 318 | 319 | void swap(ngram_cache& other) { 320 | m_data.swap(other.m_data); 321 | std::swap(m_empty, other.m_empty); 322 | } 323 | 324 | private: 325 | std::vector m_data; 326 | bool m_empty; 327 | }; 328 | 329 | } // namespace tongrams 330 | -------------------------------------------------------------------------------- /include/statistics.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "tmp.hpp" 6 | #include "configuration.hpp" 7 | #include "util_types.hpp" 8 | 9 | namespace tongrams { 10 | 11 | struct statistics { 12 | struct builder { 13 | builder(configuration const& config, tmp::data& tmp_data, 14 | tmp::statistics& tmp_stats) 15 | : m_config(config) 16 | , m_tmp_stats(tmp_stats) 17 | , m_tmp_data(tmp_data) 18 | , m_ngram_cache(config.max_order) 19 | , m_t(config.max_order, std::vector(4, 0)) 20 | , m_D(config.max_order, std::vector(4, 0)) 21 | , m_num_ngrams(config.max_order, 0) 22 | , m_total_num_words(0) 23 | , m_unk_prob(0.0) {} 24 | 25 | void init(size_t vocab_size) { 26 | assert(vocab_size); 27 | m_vocab_size = vocab_size; 28 | uint64_t N = m_config.max_order; 29 | for (uint64_t n = 1; n < N; ++n) { 30 | m_tmp_stats.resize(n, m_vocab_size); 31 | } 32 | m_tmp_data.probs_offsets.resize(N, std::vector(0, 0)); 33 | for (uint64_t n = 2; n <= N; ++n) { 34 | m_tmp_data.probs_offsets[n - 1].resize(m_vocab_size, 0); 35 | } 36 | } 37 | 38 | template 39 | void compute_left_extensions(Iterator begin, size_t len) { 40 | uint64_t N = m_config.max_order; 41 | m_num_ngrams[N - 1] += len; 42 | 43 | auto prev_ptr = *begin; 44 | if (not m_ngram_cache.empty()) { 45 | prev_ptr = m_ngram_cache.get(); 46 | } 47 | 48 | for (size_t i = 0; i < len; ++i, ++begin) { 49 | auto ptr = *begin; 50 | word_id right = ptr[N - 1]; 51 | 52 | for (uint64_t n = 1; n < N; ++n) { 53 | bool context_changes = 54 | !ptr.equal_to(prev_ptr, N - n, N - 1); 55 | if (n != 1 and context_changes) { 56 | m_tmp_stats.combine(n); 57 | ++m_num_ngrams[n - 2]; // previous order 58 | } 59 | 60 | word_id left = ptr[N - n - 1]; 61 | if (m_tmp_stats.update(n, left, right)) { 62 | ++m_tmp_data.probs_offsets[n][right]; 63 | } 64 | } 65 | 66 | if (!ptr.equal_to(prev_ptr, 0, N - 1)) ++m_num_ngrams[N - 2]; 67 | 68 | // N-gram case: they do not have modified counts, 69 | // rather their counts are equal to the occurrence in corpus 70 | uint64_t count = 
*(ptr.value(N)); 71 | assert(count > 0); 72 | m_total_num_words += count; 73 | if (count <= 4) ++m_tmp_stats.t[N - 1][count - 1]; 74 | prev_ptr = ptr; 75 | } 76 | 77 | m_ngram_cache.store(prev_ptr); 78 | } 79 | 80 | void finalize() { 81 | ++m_num_ngrams[m_config.max_order - 2]; 82 | for (uint64_t n = 2; n < m_config.max_order; ++n) { 83 | ++m_num_ngrams[n - 2]; 84 | m_tmp_stats.combine(n); 85 | m_tmp_stats.release(n); 86 | } 87 | for (uint64_t n = 2; n <= m_config.max_order; ++n) { 88 | for (uint64_t k = 1; k <= 4; ++k) { 89 | m_t[n - 1][k - 1] = m_tmp_stats.t[n - 1][k - 1]; 90 | } 91 | } 92 | } 93 | 94 | void build(statistics& stats) { 95 | uint64_t N = m_config.max_order; 96 | stats.num_ngrams(1) = m_vocab_size; 97 | for (uint64_t n = 2; n <= N; ++n) { 98 | stats.num_ngrams(n) = m_num_ngrams[n - 1]; 99 | } 100 | 101 | std::cerr << "number of ngrams:\n"; 102 | size_t sum = 0; 103 | for (uint64_t n = 1; n <= N; ++n) { 104 | std::cerr << int(n) << "-grams: " << stats.num_ngrams(n) 105 | << "\n"; 106 | sum += stats.num_ngrams(n); 107 | } 108 | std::cerr << "total num. grams: " << sum << std::endl; 109 | 110 | // NOTE: smoothing statistics for unigrams must be computed globally 111 | for (auto k : m_tmp_stats.occs[0]) { 112 | assert(k); 113 | if (k <= 4) ++m_t[0][k - 1]; 114 | } 115 | 116 | for (uint64_t n = 2; n <= N; ++n) { 117 | auto& positions = m_tmp_data.probs_offsets[n - 1]; 118 | // compute prefix sums 119 | for (uint64_t id = 0, sum = 0; id < m_vocab_size; ++id) { 120 | uint64_t occ = positions[id]; 121 | positions[id] = sum; 122 | sum += occ; 123 | } 124 | // for (auto x: positions) { 125 | // std::cerr << x << " "; 126 | // } 127 | // std::cerr << std::endl; 128 | } 129 | 130 | // NOTE: do not compute for small synthetic datasets 131 | for (uint64_t n = 1; n <= N; ++n) { 132 | for (uint64_t k = 1; k <= 4; ++k) { 133 | try { 134 | D(n, k) = compute_discount(n, k); 135 | } catch (std::runtime_error const& e) { 136 | e.what(); 137 | complain(n, k); 138 | util::clean_temporaries(m_config.tmp_dirname); 139 | std::abort(); 140 | } 141 | } 142 | } 143 | 144 | for (uint64_t k = 1; k <= 3; ++k) { 145 | m_unk_prob += t(1, k) * D(1, k); 146 | } 147 | m_unk_prob /= stats.num_ngrams(2); // uni-grams' denominator 148 | m_unk_prob /= stats.num_ngrams( 149 | 1); // interpolate with uniform distribution: 1/|vocabulary| 150 | 151 | stats.m_t.swap(m_t); 152 | stats.m_D.swap(m_D); 153 | stats.m_total_num_words = m_total_num_words - N + 1; 154 | stats.m_unk_prob = m_unk_prob; 155 | std::cerr << "total num. 
tokens: " << stats.m_total_num_words 156 | << std::endl; 157 | } 158 | 159 | private: 160 | configuration const& m_config; 161 | tmp::statistics& m_tmp_stats; 162 | tmp::data& m_tmp_data; 163 | ngram_cache m_ngram_cache; 164 | std::vector> m_t; 165 | std::vector> m_D; 166 | std::vector m_num_ngrams; 167 | uint64_t m_total_num_words; // total numer of words in the text corpus 168 | size_t m_vocab_size; 169 | float m_unk_prob; // prob of word, which is backoff(empty) / 170 | // vocabulary_size 171 | 172 | float& D(uint64_t n, uint64_t k) { 173 | assert(k > 0); 174 | assert(n >= 1 and n <= m_config.max_order); 175 | if (k >= 3) return m_D[n - 1].back(); 176 | return m_D[n - 1][k - 1]; 177 | } 178 | 179 | inline uint64_t t(uint64_t n, uint64_t k) const { 180 | assert(n >= 1 and n <= m_config.max_order); 181 | assert(k > 0 and k <= 4); 182 | return m_t[n - 1][k - 1]; 183 | } 184 | 185 | float compute_discount(uint64_t n, uint64_t k) { 186 | assert(k > 0 and k <= 4); 187 | assert(n >= 1 and n <= m_config.max_order); 188 | if (k <= 3) { 189 | float d = (t(n, 1) + 2 * t(n, 2)) * t(n, k); 190 | if (d == 0.0) throw std::runtime_error("bad discount"); 191 | return static_cast(k) - 192 | static_cast((k + 1) * t(n, 1) * t(n, k + 1)) / d; 193 | } 194 | return compute_discount(n, 3); 195 | } 196 | 197 | void complain(uint64_t n, uint64_t k) { 198 | auto check = [&](uint64_t n, uint64_t k) { 199 | if (!t(n, k)) std::cerr << k << "\n"; 200 | }; 201 | std::cerr << "Error: could not calculate Kneser-Ney discounts for " 202 | << int(n) << "-grams with adjusted count " << k << "\n" 203 | << "because it was not observed any " << int(n) 204 | << "-grams with adjusted count:\n"; 205 | check(n, 1); 206 | check(n, 2); 207 | check(n, 3); 208 | std::cerr << "Is this small or artificial data?" 
208 |         std::cerr << "Is this small or artificial data?" << std::endl;
209 |     }
210 | };
211 | 
212 | statistics(uint64_t order)
213 |     : m_num_ngrams(order, 0)
214 |     , m_t(order, std::vector<uint64_t>(4, 0))
215 |     , m_D(order, std::vector<float>(4, 0.0))
216 |     , m_total_num_words(0)
217 |     , m_unk_prob(0.0) {}
218 | 
219 | inline float D(uint64_t n, uint64_t k) const {
220 |     assert(k > 0);
221 |     assert(n >= 1 and n <= order());
222 |     if (k >= 3) return m_D[n - 1].back();
223 |     return m_D[n - 1][k - 1];
224 | }
225 | 
226 | inline uint64_t t(uint64_t n, uint64_t k) const {
227 |     assert(n >= 1 and n <= order());
228 |     assert(k > 0 and k <= 4);
229 |     return m_t[n - 1][k - 1];
230 | }
231 | 
232 | inline uint64_t& num_ngrams(uint64_t n) {
233 |     assert(n >= 1 and n <= order());
234 |     return m_num_ngrams[n - 1];
235 | }
236 | 
237 | inline uint64_t total_words() const {
238 |     return m_total_num_words;
239 | }
240 | 
241 | uint64_t total_grams() const {
242 |     return std::accumulate(m_num_ngrams.begin(), m_num_ngrams.end(),
243 |                            uint64_t(0));
244 | }
245 | 
246 | inline float unk_prob() const {
247 |     return m_unk_prob;
248 | }
249 | 
250 | uint64_t order() const {
251 |     return m_num_ngrams.size();
252 | }
253 | 
254 | void print() {
255 |     std::cerr << "number of ngrams:\n";
256 |     for (uint64_t n = 1; n <= order(); ++n) {
257 |         std::cerr << n << "-grams: " << num_ngrams(n) << "\n";
258 |     }
259 | 
260 |     std::cerr << "smoothing statistics:\n";
261 |     for (uint64_t n = 1; n <= order(); ++n) {
262 |         uint64_t sum = 0;
263 |         for (uint64_t k = 1; k <= 4; ++k) {
264 |             std::cerr << "t_" << n << "(" << k << ") = " << t(n, k) << "\n";
265 |             sum += t(n, k);
266 |         }
267 |         std::cerr << "sum: " << sum << "\n" << std::endl;
268 |     }
269 | 
270 |     std::cerr << "discounts:\n";
271 |     for (uint64_t n = 1; n <= order(); ++n) {
272 |         for (uint64_t k = 1; k <= 3; ++k) {
273 |             std::cerr << "D_" << n << "(" << k << ") = " << D(n, k) << " ";
274 |         }
275 |         std::cerr << std::endl;
276 |     }
277 | }
278 | 
279 | private:
280 |     std::vector<uint64_t> m_num_ngrams;
281 |     std::vector<std::vector<uint64_t>> m_t;
282 |     std::vector<std::vector<float>> m_D;
283 |     uint64_t m_total_num_words;
284 |     float m_unk_prob;
285 | };
286 | 
287 | }  // namespace tongrams
288 | 
--------------------------------------------------------------------------------
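The discounts computed by compute_discount above are the closed-form modified Kneser-Ney estimates of Chen and Goodman, restated here for reference. Writing $t_n(k)$ for the number of distinct $n$-grams whose adjusted count equals $k$, the code evaluates, for $1 \le k \le 3$,

\[
D_n(k) \;=\; k \;-\; (k + 1)\,\frac{t_n(1)}{t_n(1) + 2\,t_n(2)} \cdot \frac{t_n(k + 1)}{t_n(k)},
\]

and reuses $D_n(3)$ for $k \ge 4$ (both are stored in the last slot of the corresponding m_D row). The "bad discount" exception fires exactly when $t_n(k) = 0$ or $t_n(1) + 2\,t_n(2) = 0$, which is why estimation aborts on corpora too small to exhibit every adjusted count. The unknown-word probability is then interpolated with the uniform distribution, matching the two divisions at the end of build(): $\big(\sum_{k=1}^{3} t_1(k)\,D_1(k)\big) \big/ \big(\text{num. bigrams} \times |\text{vocabulary}|\big)$.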
/include/stream.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <fstream>
4 | #include <deque>
5 | 
6 | #include "util.hpp"
7 | #include "ngrams_block.hpp"
8 | #include "front_coding.hpp"
9 | 
10 | #include "../external/tongrams/include/utils/util_types.hpp"
11 | 
12 | namespace tongrams::stream {
13 | 
14 | typedef ngrams_block uncompressed_block_type;
15 | typedef fc::ngrams_block compressed_block_type;
16 | 
17 | template <typename Block>
18 | struct async_ngrams_file_source {
19 |     async_ngrams_file_source() : m_file_size(0), m_handle_ptr(nullptr) {}
20 | 
21 |     async_ngrams_file_source(std::string const& filename)
22 |         : m_file_size(0), m_handle_ptr(nullptr) {
23 |         open(filename);
24 |     }
25 | 
26 |     void open(std::string const& filename) {
27 |         m_filename = filename;
28 |         m_is.open(filename.c_str(), std::ifstream::binary);
29 |         if (not m_is.good()) {
30 |             throw std::runtime_error(
31 |                 "Error in opening binary file, it may not exist or be "
32 |                 "malformed.");
33 |         }
34 |         m_is.seekg(0, m_is.end);
35 |         m_file_size = static_cast<size_t>(m_is.tellg());
36 |         m_is.seekg(0, m_is.beg);
37 |     }
38 | 
39 |     void close() {
40 |         util::wait(m_handle_ptr);
41 |         if (m_is.is_open()) m_is.close();
42 |     }
43 | 
44 |     void close_and_remove() {
45 |         close();
46 |         std::remove(m_filename.c_str());
47 |     }
48 | 
49 |     size_t size() const {
50 |         return m_buffer.size();
51 |     }
52 | 
53 |     bool empty() const {
54 |         return m_buffer.empty();
55 |     }
56 | 
57 |     Block* get_block() {
58 |         if (empty()) util::wait(m_handle_ptr);
59 |         assert(size());
60 |         return &m_buffer.front();
61 |     }
62 | 
63 |     void release_block() {
64 |         m_buffer.front().release();
65 |         m_buffer.pop_front();
66 |     }
67 | 
68 | protected:
69 |     std::string m_filename;
70 |     std::ifstream m_is;
71 |     size_t m_file_size;
72 |     std::deque<Block> m_buffer;
73 |     std::unique_ptr<std::thread> m_handle_ptr;
74 | };
75 | 
76 | struct uncompressed_stream_generator
77 |     : async_ngrams_file_source<uncompressed_block_type> {
78 |     typedef uncompressed_block_type block_type;
79 | 
80 |     uncompressed_stream_generator() {}
81 | 
82 |     uncompressed_stream_generator(uint8_t ngram_order)
83 |         : m_read_bytes(0), m_N(ngram_order), m_eos(false), m_I_time(0.0) {}
84 | 
85 |     void open(std::string const& filename) {
86 |         async_ngrams_file_source::open(filename);
87 |     }
88 | 
89 |     void async_fetch_next_block(size_t num_bytes) {
90 |         util::wait(m_handle_ptr);
91 |         m_handle_ptr =
92 |             util::async_call(uncompressed_stream_generator::fetch, num_bytes);
93 |     }
94 | 
95 |     void fetch_next_block(size_t num_bytes) {
96 |         fetch(num_bytes);
97 |     }
98 | 
99 |     double I_time() const {
100 |         return m_I_time;
101 |     }
102 | 
103 |     bool eos() const {
104 |         return m_eos;
105 |     }
106 | 
107 | private:
108 |     size_t m_read_bytes;
109 |     uint8_t m_N;
110 |     bool m_eos;
111 |     double m_I_time;
112 | 
113 |     std::function<void(size_t)> fetch = [&](size_t bytes) {
114 |         if (eos()) return;
115 |         auto s = clock_type::now();
116 |         block_type block(m_N);
117 |         if (m_read_bytes + bytes >= m_file_size) {
118 |             bytes = m_file_size - m_read_bytes;
119 |             m_eos = true;
120 |         }
121 |         m_read_bytes += bytes;
122 |         assert(bytes % block.record_size() == 0);
123 |         uint64_t num_ngrams = bytes / block.record_size();
124 |         char* begin = block.initialize_memory(bytes);
125 |         block.read_bytes(m_is, begin, bytes);
126 |         block.materialize_index(num_ngrams);
127 |         m_buffer.push_back(std::move(block));
128 |         auto e = clock_type::now();
129 |         std::chrono::duration<double> elapsed = e - s;
130 |         m_I_time += elapsed.count();
131 |     };
132 | };
133 | 
134 | struct compressed_stream_generator
135 |     : async_ngrams_file_source<compressed_block_type> {
136 |     typedef compressed_block_type block_type;
137 | 
138 |     compressed_stream_generator() {}
139 | 
140 |     compressed_stream_generator(uint8_t ngram_order)
141 |         : m_read_bytes(0)
142 |         , m_N(ngram_order)
143 |         , m_w(0)
144 |         , m_v(0)
145 |         , m_eos(false)
146 |         , m_I_time(0.0) {}
147 | 
148 |     void open(std::string const& filename) {
149 |         async_ngrams_file_source::open(filename);
150 |         essentials::load_pod(m_is, m_w);
151 |         essentials::load_pod(m_is, m_v);
152 |         m_read_bytes = sizeof(m_w) + sizeof(m_v);
153 |     }
154 | 
155 |     void async_fetch_next_block(size_t /*num_bytes*/) {
156 |         util::wait(m_handle_ptr);
157 |         m_handle_ptr = util::async_call(compressed_stream_generator::fetch);
158 |     }
159 | 
160 |     void fetch_next_block(size_t /*num_bytes*/) {
161 |         fetch();
162 |     }
163 | 
164 |     double I_time() const {
165 |         return m_I_time;
166 |     }
167 | 
168 |     bool eos() const {
169 |         return m_eos;
170 |     }
171 | 
172 | private:
173 |     size_t m_read_bytes;
174 |     uint8_t m_N;
175 |     uint8_t m_w;
176 |     uint8_t m_v;
177 |     bool m_eos;
178 |     double m_I_time;
179 | 
180 |     std::function<void(void)> fetch = [&]() {
181 |         if (eos()) return;
182 |         auto s = clock_type::now();
183 |         size_t size = 0;
184 |         essentials::load_pod(m_is, size);
185 |         m_read_bytes += sizeof(size);
186 |         assert(size > 0);
187 |         block_type block(m_N, size, m_w, m_v);
188 |         size_t bytes = fc::BLOCK_BYTES;
189 |         if (m_read_bytes + bytes >= m_file_size) {
190 |             bytes = m_file_size - m_read_bytes;
191 |             m_eos = true;
192 |         }
193 |         m_read_bytes += bytes;
194 |         block.read(m_is, bytes);
195 |         m_buffer.push_back(std::move(block));
196 |         auto e = clock_type::now();
197 |         std::chrono::duration<double> elapsed = e - s;
198 |         m_I_time += elapsed.count();
199 |     };
200 | };
201 | 
202 | struct writer {
203 |     writer(uint8_t order) : m_order(order) {}
204 | 
205 |     template <typename Iterator>
206 |     void write_block(std::ofstream& os, Iterator begin, Iterator end, size_t,
207 |                      ngrams_block_statistics const&) {
208 |         std::streamsize record_size = ngrams_block::record_size(m_order);
209 |         for (auto it = begin; it != end; ++it) {
210 |             auto ptr = *it;
211 |             os.write(reinterpret_cast<char const*>(ptr.data), record_size);
212 |         }
213 |     }
214 | 
215 | private:
216 |     uint8_t m_order;
217 | };
218 | 
219 | template <typename T>
220 | struct floats_vec {
221 |     typedef T value_type;
222 |     typedef typename std::vector<T>::iterator iterator;
223 | 
224 |     floats_vec(size_t n) : m_floats(n) {
225 |         m_reint.uint_value = 0;
226 |     }
227 | 
228 |     void clear() {
229 |         m_floats.clear();
230 |     }
231 | 
232 |     void reserve(size_t n) {
233 |         m_floats.reserve(n);
234 |     }
235 | 
236 |     void resize(size_t n) {
237 |         m_floats.resize(n);
238 |     }
239 | 
240 |     void push_back(float x) {
241 |         m_reint.uint_value = 0;
242 |         m_reint.float_value = x;
243 |         m_floats.push_back(m_reint.uint_value);
244 |     }
245 | 
246 |     inline float operator[](size_t i) {
247 |         assert(i < m_floats.size());
248 |         m_reint.uint_value = m_floats[i];
249 |         return m_reint.float_value;
250 |     }
251 | 
252 |     size_t size() const {
253 |         return m_floats.size();
254 |     }
255 | 
256 |     auto* data() {
257 |         return m_floats.data();
258 |     }
259 | 
260 |     void swap(floats_vec& other) {
261 |         m_floats.swap(other.m_floats);
262 |         std::swap(m_reint, other.m_reint);
263 |     }
264 | 
265 |     iterator begin() {
266 |         return m_floats.begin();
267 |     }
268 | 
269 |     iterator end() {
270 |         return m_floats.end();
271 |     }
272 | 
273 | private:
274 |     bits::reinterpret m_reint;
275 |     std::vector<T> m_floats;
276 | };
277 | 
278 | }  // namespace tongrams::stream
279 | 
--------------------------------------------------------------------------------
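Both stream generators above implement the same double-buffering discipline: async_fetch_next_block first joins any in-flight read, then launches fetch on a fresh background thread, while get_block (inherited from async_ngrams_file_source) blocks only when the buffer is empty. The following self-contained sketch condenses the pattern; the prefetcher type and its members are hypothetical, not part of this codebase:

#include <deque>
#include <fstream>
#include <memory>
#include <string>
#include <thread>
#include <vector>

// Minimal sketch of the single-reader prefetch pattern: the caller
// alternates async_fetch() / get() / pop(), so at most one background
// read is ever in flight.
struct prefetcher {
    std::ifstream is;
    std::deque<std::vector<char>> buffer;
    std::unique_ptr<std::thread> handle;

    explicit prefetcher(std::string const& filename)
        : is(filename, std::ifstream::binary) {}

    void wait() {  // join the in-flight read, if any
        if (handle and handle->joinable()) handle->join();
    }

    void async_fetch(size_t bytes) {
        wait();  // never more than one outstanding read
        handle = std::make_unique<std::thread>([this, bytes]() {
            std::vector<char> chunk(bytes);
            is.read(chunk.data(), static_cast<std::streamsize>(bytes));
            chunk.resize(static_cast<size_t>(is.gcount()));  // short last read
            buffer.push_back(std::move(chunk));
        });
    }

    std::vector<char>& get() {  // block only if nothing is ready yet
        if (buffer.empty()) wait();
        return buffer.front();
    }

    void pop() {
        buffer.pop_front();
    }
};

As in async_ngrams_file_source, the queue itself is unsynchronized: correctness rests on the calling discipline (fetch, then consume, then fetch again), not on locks.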
/include/tmp.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "ngrams_block.hpp"
4 | #include "vocabulary.hpp"
5 | 
6 | #include <vector>
7 | 
8 | namespace tongrams {
9 | namespace tmp {
10 | 
11 | struct statistics {
12 |     static const word_id invalid_word_id = word_id(-1);
13 |     static const range_id invalid_range_id = range_id(-1);
14 | 
15 |     struct word_statistic {
16 |         range_id id;   // current range id to which the word belongs
17 |         word_id left;  // last seen word to the left of the word
18 |     };
19 | 
20 |     statistics(uint64_t order)
21 |         : t(order, std::vector<uint64_t>(5, 0))
22 |         , r(order, std::vector<uint64_t>(5, 0))
23 |         , current_range_id(order, 0)
24 | 
25 |         // order - 1 because modified counts for N-grams are the raw occurrence
26 |         // counts in text
27 |         , occs(order - 1,
28 |                std::vector<occurrence>(
29 |                    0, 0))  // num. of distinct words appearing
30 |                            // to the left of the word (modified count)
31 |         , stats(order - 1, std::vector<word_statistic>(
32 |                                0, {invalid_range_id, invalid_word_id})) {}
33 | 
34 |     void release(uint64_t n) {
35 |         assert(n > 0);
36 |         stats[n - 1].resize(0, {invalid_range_id, invalid_word_id});
37 |     }
38 | 
39 |     void resize(uint64_t n, size_t vocab_size) {
40 |         assert(n > 0);
41 |         occs[n - 1].resize(vocab_size, 0);
42 |         stats[n - 1].resize(vocab_size, {invalid_range_id, invalid_word_id});
43 |     }
44 | 
45 |     void clear() {
46 |         for (uint64_t n = 0; n < t.size(); ++n) {
47 |             for (uint64_t k = 0; k < 5; ++k) {
48 |                 r[n][k] = 0;
49 |                 t[n][k] = 0;
50 |             }
51 |         }
52 |     }
53 | 
54 |     bool was_not_seen(uint64_t n, word_id right) {
55 |         auto& stat = stats[n - 1][right];
56 |         if (stat.id != current_range_id[n - 1]) {  // range changes
57 |             stat.id = current_range_id[n - 1];
58 |             return true;
59 |         }
60 |         return false;
61 |     }
62 | 
63 |     bool update(uint64_t n, word_id left, word_id right) {
64 |         assert(n > 0 and n < t.size());
65 |         auto& stat = stats[n - 1][right];
66 |         auto& occ = occs[n - 1][right];
67 | 
68 |         if (n != 1) {  // do not reset occurrence for uni-grams
69 |             if (stat.id != current_range_id[n - 1]) {  // range changes
70 |                 // update range id if different from the current one
71 |                 // and reset number of occurrences
72 |                 stat.id = current_range_id[n - 1];
73 |                 occ = 0;
74 |                 stat.left = invalid_word_id;
75 |             }
76 |         }
77 | 
78 |         if (stat.left != left) {
79 |             stat.left = left;
80 |             ++occ;
81 |             assert(occ > 0);
82 |             if (occ == 1) {
83 |                 ++r[n - 1][0];
84 |             } else if (occ > 1 and occ <= 5) {
85 |                 ++r[n - 1][occ - 1];
86 |                 --r[n - 1][occ - 2];
87 |             }
88 |             return true;
89 |         }
90 | 
91 |         return false;
92 |     }
93 | 
94 |     void combine(uint64_t n) {
95 |         assert(n > 0);
96 |         ++current_range_id[n - 1];
97 |         for (uint64_t k = 0; k < 5; ++k) {
98 |             uint64_t& c = r[n - 1][k];
99 |             t[n - 1][k] += c;
100 |             c = 0;
101 |         }
102 |     }
103 | 
104 |     // void print_stats() {
105 |     //     std::cerr << "modified counts for unigrams" << std::endl;
106 |     //     for (auto x : occs[0]) {
107 |     //         std::cerr << x << std::endl;
108 |     //     }
109 |     //     for (uint64_t n = 1; n <= t.size(); ++n) {
110 |     //         for (uint64_t k = 1; k <= 5; ++k) {
111 |     //             std::cerr << "r_" << int(n) << "(" << k << ") = "
112 |     //                       << r[n - 1][k - 1] << std::endl;
113 |     //             std::cerr << "t_" << int(n) << "(" << k
114 |     //                       << ") = " << t[n - 1][k - 1] << std::endl;
115 |     //         }
116 |     //     }
117 |     // }
118 | 
119 |     std::vector<std::vector<uint64_t>>
120 |         t;  // number of n-grams, for n = 1,...,4, having modified count equal
121 |             // to 1, 2, 3, 4 and 4+ globally (i.e., all ranges)
122 |     std::vector<std::vector<uint64_t>>
123 |         r;  // number of n-grams, for n = 1,...,4, having modified count equal
124 |             // to 1, 2, 3, 4 and 4+ in a range
125 |     std::vector<range_id>
126 |         current_range_id;  // keep track of the current range id to know when
127 |                            // we switch to the next range
128 |     std::vector<std::vector<occurrence>> occs;
129 |     std::vector<std::vector<word_statistic>> stats;
130 | };
131 | 
132 | struct data {
133 |     data() : vocab_builder(0) {
134 |         word_ids.set_empty_key(constants::invalid_hash);
135 |         assert(vocab_builder.size() == 0);
136 |     }
137 | 
138 |     words_map word_ids;  // map from unigrams' hashes to word ids
139 | 
140 |     vocabulary::builder vocab_builder;
141 | 
142 |     /*
143 |         Offsets at which we will write the computed probabilities
144 |         in the trie levels: these are equivalent to the counts that
145 |         counting sort would compute: we need them for 1 < n <= N.
146 |     */
147 |     std::vector<std::vector<uint64_t>> probs_offsets;
148 | 
149 |     /*
150 |         Block partitions' offsets.
151 |         Each block corresponds to a partition of the total N-grams file.
152 |     */
153 |     std::vector<std::vector<uint64_t>> blocks_offsets;
154 | };
155 | 
156 | }  // namespace tmp
157 | }  // namespace tongrams
158 | 
--------------------------------------------------------------------------------
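tmp::statistics::update realizes the bookkeeping behind these arrays in streaming fashion, range by range: for n < N, the modified count of an n-gram is the number of distinct words observed immediately to its left, and r/t bucket how many n-grams reach each count. A naive reference implementation of the definition (a hypothetical helper, for intuition only, not used by the estimation pipeline):

#include <cstdint>
#include <map>
#include <set>
#include <vector>

using word_id = uint32_t;
using ngram = std::vector<word_id>;

// Modified count of an n-gram w = number of distinct words appearing
// immediately to the left of an occurrence of w in the text.
std::map<ngram, size_t> modified_counts(std::vector<word_id> const& text,
                                        size_t n) {
    std::map<ngram, std::set<word_id>> left_extensions;
    for (size_t i = 1; i + n <= text.size(); ++i) {
        ngram g(text.begin() + i, text.begin() + i + n);
        left_extensions[g].insert(text[i - 1]);  // record the left word
    }
    std::map<ngram, size_t> counts;
    for (auto const& [g, lefts] : left_extensions) counts[g] = lefts.size();
    return counts;
}

The streaming version avoids materializing the sets: because input arrives grouped, a change of left word (stat.left != left) is enough to detect a new distinct extension, and combine() folds the per-range buckets r into the global buckets t.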
/include/util.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "../external/tongrams/include/utils/iterators.hpp"
4 | #include "../external/tongrams/include/utils/util_types.hpp"
5 | 
6 | #include <iostream>
7 | #include <fstream>
8 | 
9 | #include <sys/mman.h>  // for POSIX_MADV_SEQUENTIAL and POSIX_MADV_RANDOM
10 | #include <thread>
11 | #include <boost/filesystem.hpp>
12 | 
13 | namespace tongrams::util {
14 | 
15 | void write(std::ofstream& os, byte_range br) {
16 |     os.write(reinterpret_cast<char const*>(br.first),
17 |              (br.second - br.first) * sizeof(char));
18 | }
19 | 
20 | size_t file_size(const char* filename) {
21 |     boost::filesystem::path filepath(filename);
22 |     return boost::filesystem::file_size(filepath);
23 | }
24 | 
25 | bool exists(const char* filename) {
26 |     boost::filesystem::path filepath(filename);
27 |     return boost::filesystem::exists(filepath);
28 | }
29 | 
30 | template <typename File>
31 | void check_file(File const& file) {
32 |     if (not file.is_open()) {
33 |         throw std::runtime_error(
34 |             "Error in opening file: it may not exist or be malformed.");
35 |     }
36 | }
37 | 
38 | template <typename Address>
39 | void optimize_access(Address addr, size_t len, int MODE) {
40 |     auto ret = posix_madvise((void*)addr, len, MODE);
41 |     if (ret) {
42 |         std::cerr << "Error in calling posix_madvise: " << errno << std::endl;
43 |     }
44 | }
45 | 
46 | #define optimize_sequential_access(addr, len) \
47 |     optimize_access(addr, len, POSIX_MADV_SEQUENTIAL)
48 | #define optimize_random_access(addr, len) \
49 |     optimize_access(addr, len, POSIX_MADV_RANDOM)
50 | 
51 | template <typename File>
52 | uint8_t const* open_file_partition(File& file, std::string const& filename,
53 |                                    size_t partition_size,
54 |                                    size_t offset  // in bytes
55 | ) {
56 |     file.open(filename.c_str(), partition_size, offset);
57 |     util::check_file(file);
58 |     assert(file.size() == partition_size);
59 |     return reinterpret_cast<uint8_t const*>(file.data());
60 | }
61 | 
62 | void clean_temporaries(std::string const& tmp_dirname) {
63 |     boost::filesystem::remove_all(boost::filesystem::path(tmp_dirname.c_str()));
64 | }
65 | 
66 | template <typename Funct, typename... Args>
67 | auto async_call(Funct& f, Args&&... args) {
68 |     return std::make_unique<std::thread>(f, args...);
69 | }
70 | 
71 | void wait(std::unique_ptr<std::thread>& handle_ptr) {
72 |     if (handle_ptr and handle_ptr->joinable()) handle_ptr->join();
73 | }
74 | 
75 | }  // namespace tongrams::util
76 | 
--------------------------------------------------------------------------------
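async_call and wait form the tiny threading layer used throughout: the former spawns a std::thread owned by a unique_ptr, the latter joins it if it is still running. A minimal usage sketch (the task and its argument are invented for illustration):

#include <functional>
#include <iostream>

#include "util.hpp"

int main() {
    // async_call takes Funct& (an lvalue reference), so the callable must
    // be a named object, e.g. a std::function; a temporary lambda would
    // not bind.
    std::function<void(int)> task = [](int id) {
        std::cout << "background work for block " << id << "\n";
    };
    auto handle = tongrams::util::async_call(task, 42);
    // ... overlap other work here ...
    tongrams::util::wait(handle);  // join before the handle is destroyed
    return 0;
}

Joining before destruction is not optional: destroying a joinable std::thread terminates the program, which is why every consumer above calls util::wait before releasing its handle.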
/include/util_types.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <chrono>
4 | #include <cstring>
5 | #include <deque>
6 | #include <mutex>
7 | #include <thread>
8 | #include <vector>
9 | 
10 | #define BOOST_THREAD_VERSION 4
11 | #define BOOST_THREAD_PROVIDES_EXECUTORS
12 | 
13 | #include <sparsehash/dense_hash_map>
14 | #include <boost/thread/executors/basic_thread_pool.hpp>
15 | #include <boost/thread/experimental/parallel/v2/task_region.hpp>
16 | 
17 | #include "../external/tongrams/include/utils/util_types.hpp"
18 | 
19 | namespace tongrams {
20 | 
21 | typedef uint32_t ngram_id;
22 | typedef uint32_t word_id;
23 | typedef uint32_t range_id;
24 | typedef uint32_t occurrence;
25 | typedef uint64_t count_type;
26 | typedef uint64_t iterator;
27 | typedef std::vector<word_id> ngram_type;
28 | typedef google::dense_hash_map<uint64_t, word_id> words_map;
29 | typedef std::chrono::high_resolution_clock clock_type;
30 | 
31 | uint64_t sizeof_ngram(uint8_t order) {
32 |     return sizeof(word_id) * order;
33 | }
34 | 
35 | bool equal_to(word_id const* x, word_id const* y, size_t n) {
36 |     return memcmp(x, y, n) == 0;
37 | }
38 | 
39 | typedef boost::executors::basic_thread_pool executor_type;
40 | typedef boost::experimental::parallel::v2::task_region_handle_gen<executor_type>
41 |     task_region_handle;
42 | using boost::experimental::parallel::v2::task_region;
43 | 
44 | struct parallel_executor {
45 |     parallel_executor(
46 |         size_t num_threads = std::thread::hardware_concurrency()) {
47 |         executor.reset(new executor_type(num_threads));
48 |     }
49 |     std::unique_ptr<executor_type> executor;
50 | };
51 | 
52 | template <typename Iterator>
53 | struct iterator_range {
54 |     iterator_range() {}
55 |     iterator_range(Iterator const& begin, Iterator const& end)
56 |         : begin(begin), end(end) {}
57 | 
58 |     Iterator begin;
59 |     Iterator end;
60 | };
61 | 
62 | template <typename T>
63 | struct adaptor {
64 |     byte_range operator()(T const& x) const {
65 |         const uint8_t* buf = reinterpret_cast<uint8_t const*>(&x);
66 |         return {buf, buf + sizeof(T)};
67 |     }
68 | };
69 | 
70 | struct filename_generator {
71 |     filename_generator(std::string const& dir_name, std::string const& prefix,
72 |                        std::string const& extension, int seed = -1)
73 |         : m_seed(seed)
74 |         , m_prefix(dir_name + "/.tmp." + prefix)
75 |         , m_extension(extension) {
76 |         next();
77 |     }
78 | 
79 |     auto const& operator()() {
80 |         return m_cur_filename;
81 |     }
82 | 
83 |     auto const& prx() {
84 |         return m_prefix;
85 |     }
86 | 
87 |     auto const& ext() {
88 |         return m_extension;
89 |     }
90 | 
91 |     auto seed() const {
92 |         return m_seed;
93 |     }
94 | 
95 |     void next() {
96 |         ++m_seed;
97 |         m_cur_filename = prx() + std::to_string(m_seed) + "." + ext();
98 |     }
99 | 
100 | private:
101 |     int m_seed;
102 |     std::string m_prefix;
103 |     std::string m_extension;
104 |     std::string m_cur_filename;
105 | };
106 | 
107 | template <typename T>
108 | struct semi_sync_queue {
109 |     semi_sync_queue() {
110 |         open();
111 |     }
112 | 
113 |     void close() {
114 |         m_open = false;
115 |     }
116 | 
117 |     void open() {
118 |         m_open = true;
119 |     }
120 | 
121 |     void lock() {
122 |         m_mutex.lock();
123 |     }
124 | 
125 |     void unlock() {
126 |         m_mutex.unlock();
127 |     }
128 | 
129 |     void push(T& val) {
130 |         m_buffer.push_back(std::move(val));
131 |     }
132 | 
133 |     T& pick() {
134 |         return m_buffer.front();
135 |     }
136 | 
137 |     void pop() {
138 |         m_buffer.pop_front();
139 |     }
140 | 
141 |     bool active() const {
142 |         return m_open;
143 |     }
144 | 
145 |     bool empty() const {
146 |         return m_buffer.empty();
147 |     }
148 | 
149 |     size_t size() const {
150 |         return m_buffer.size();
151 |     }
152 | 
153 |     auto begin() {
154 |         return m_buffer.begin();
155 |     }
156 | 
157 |     auto end() {
158 |         return m_buffer.end();
159 |     }
160 | 
161 |     void release() {
162 |         std::deque<T>().swap(m_buffer);
163 |     }
164 | 
165 | private:
166 |     std::mutex m_mutex;
167 |     std::deque<T> m_buffer;
168 |     bool m_open;
169 | };
170 | 
171 | }  // namespace tongrams
172 | 
--------------------------------------------------------------------------------
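semi_sync_queue is deliberately only "semi" synchronized: it bundles a deque with a mutex and an open/closed flag but leaves all locking to its callers. A sketch of the intended producer/consumer exchange (the element type and polling loop are illustrative):

#include <iostream>
#include <thread>

#include "util_types.hpp"

int main() {
    tongrams::semi_sync_queue<int> queue;

    std::thread producer([&]() {
        for (int item = 0; item < 3; ++item) {
            queue.lock();
            queue.push(item);  // moves the value into the internal deque
            queue.unlock();
        }
        queue.close();  // consumers observe active() == false
    });

    std::thread consumer([&]() {
        while (true) {
            queue.lock();
            bool done = not queue.active() and queue.empty();
            if (not queue.empty()) {
                int x = queue.pick();  // front element
                queue.pop();
                std::cout << "got " << x << "\n";
            }
            queue.unlock();
            if (done) break;
        }
    });

    producer.join();
    consumer.join();
    return 0;
}

Note that close() itself takes no lock, mirroring the class above: the design assumes a single producer and consumers that tolerate reading a slightly stale flag.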
/include/vocabulary.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "constants.hpp"
4 | #include "util_types.hpp"
5 | 
6 | #include "../external/tongrams/include/utils/iterators.hpp"
7 | #include "../external/tongrams/include/utils/pools.hpp"
8 | 
9 | namespace tongrams {
10 | 
11 | struct vocabulary {
12 |     struct builder {
13 |         builder() {}
14 | 
15 |         builder(size_t vocab_size, size_t bytes = 0)
16 |             : m_vocab_size(vocab_size) {
17 |             m_unigram_strings.reserve(bytes);
18 |             m_offsets.reserve(vocab_size + 1);
19 |             m_offsets.push_back(0);
20 |         }
21 | 
22 |         void reserve(size_t bytes) {
23 |             m_unigram_strings.reserve(bytes);
24 |         }
25 | 
26 |         void push_empty() {
27 |             m_offsets.push_back(m_unigram_strings.bytes());
28 |         }
29 | 
30 |         void push_back(byte_range br) {
31 |             m_unigram_strings.append(br);
32 |             m_offsets.push_back(m_unigram_strings.bytes());
33 |         }
34 | 
35 |         void load(std::string const& vocab_filename) {
36 |             text_lines it(vocab_filename.c_str());
37 |             for (uint64_t i = 0; i != m_vocab_size; ++i) {
38 |                 auto unigram = it.next_word();
39 |                 if (bytes::equal_bytes(unigram,
40 |                                        constants::empty_token_byte_range)) {
41 |                     push_empty();
42 |                 } else {
43 |                     push_back(unigram);
44 |                 }
45 |             }
46 |         }
47 | 
48 |         void swap(builder& other) {
49 |             std::swap(m_vocab_size, other.m_vocab_size);
50 |             m_unigram_strings.swap(other.m_unigram_strings);
51 |             m_offsets.swap(other.m_offsets);
52 |         }
53 | 
54 |         void build(vocabulary& vocab) {
55 |             vocab.m_unigram_strings.swap(m_unigram_strings);
56 |             vocab.m_unigram_strings.shrink_to_fit();
57 |             vocab.m_base_addr = vocab.m_unigram_strings.base_addr();
58 |             vocab.m_offsets.swap(m_offsets);
59 |             builder().swap(*this);
60 |         }
61 | 
62 |         size_t size() const {
63 |             return m_offsets.size() - 1;
64 |         }
65 | 
66 |     private:
67 |         size_t m_vocab_size;
68 |         strings_pool m_unigram_strings;
69 |         std::vector<uint64_t> m_offsets;
70 |     };
71 | 
72 |     vocabulary() {}
73 | 
74 |     byte_range operator[](word_id id) const {
75 |         assert(id < m_offsets.size() - 1);
76 |         uint64_t begin = m_offsets[id];
77 |         uint64_t end = m_offsets[id + 1];
78 |         assert(end >= begin);
79 |         if (LIKELY(begin != end)) {
80 |             return m_unigram_strings.get_bytes(m_base_addr, begin, end);
81 |         }
82 |         return constants::empty_token_byte_range;
83 |     }
84 | 
85 | private:
86 |     uint8_t const* m_base_addr;
87 |     strings_pool m_unigram_strings;
88 |     std::vector<uint64_t> m_offsets;
89 | };
90 | 
91 | }  // namespace tongrams
92 | 
--------------------------------------------------------------------------------
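The builder accumulates unigram strings into a pool plus a prefix-offset array; build() then hands both over to the vocabulary and resets the builder. A sketch of the intended flow (the size and filename are illustrative):

#include <iostream>

#include "vocabulary.hpp"

int main() {
    using namespace tongrams;
    size_t vocab_size = 1000000;  // illustrative
    vocabulary::builder builder(vocab_size);
    builder.load("vocab.txt");  // one unigram per line; empty tokens
                                // are stored as empty ranges
    vocabulary vocab;
    builder.build(vocab);  // steals the pool and the offsets,
                           // then resets the builder
    byte_range word = vocab[42];  // bytes of the unigram with word id 42
    std::cout.write(reinterpret_cast<char const*>(word.first),
                    word.second - word.first);
    std::cout << std::endl;
    return 0;
}

Storing offsets instead of per-string pointers keeps the vocabulary position-independent and cache-friendly: operator[] is two array reads plus a byte-range construction.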
/src/count.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <unistd.h>
3 | 
4 | #include "../external/tongrams/external/cmd_line_parser/include/parser.hpp"
5 | 
6 | #include "configuration.hpp"
7 | #include "counter.hpp"
8 | 
9 | int main(int argc, char** argv) {
10 |     using namespace tongrams;
11 | 
12 |     configuration config;
13 |     cmd_line_parser::parser parser(argc, argv);
14 |     parser.add("text_filename", "Input text filename.");
15 |     parser.add("order", "Language model order. It must be > 2 and <= " +
16 |                             std::to_string(global::max_order) + ".");
17 |     parser.add("ram",
18 |                "Amount of RAM in GiB. Default is " +
19 |                    std::to_string(static_cast<uint64_t>(
20 |                        static_cast<double>(config.RAM) / essentials::GiB)) +
21 |                    " GiB.",
22 |                "--ram", false);
23 |     parser.add("tmp_dir",
24 |                "Temporary directory used for counting. Default is directory '" +
25 |                    constants::default_tmp_dirname + "'.",
26 |                "--tmp", false);
27 |     parser.add("num_threads",
28 |                "Number of threads. Default is " +
29 |                    std::to_string(config.num_threads) + " on this machine.",
30 |                "--thr", false);
31 |     parser.add("compress_blocks",
32 |                "Compress temporary files during counting. Default is " +
33 |                    (config.compress_blocks ? std::string("true")
34 |                                            : std::string("false")) +
35 |                    ".",
36 |                "--compress_blocks", true);
37 |     parser.add("out",
38 |                "Output filename. Default is '" +
39 |                    constants::default_output_filename + "'.",
40 |                "--out", false);
41 |     if (!parser.parse()) return 1;
42 | 
43 |     config.text_filename = parser.get<std::string>("text_filename");
44 |     if (!util::exists(config.text_filename.c_str())) {
45 |         std::cerr << "Error: corpus file does not exist" << std::endl;
46 |         return 1;
47 |     }
48 | 
49 |     config.text_size = util::file_size(config.text_filename.c_str());
50 |     std::cerr << "reading from '" << config.text_filename << "' ("
51 |               << config.text_size << " bytes)" << std::endl;
52 |     config.max_order = parser.get<uint64_t>("order");
53 |     if (config.max_order <= 2 or config.max_order > global::max_order) {
54 |         std::cerr << "invalid language model order" << std::endl;
55 |         return 1;
56 |     }
57 | 
58 |     size_t available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
59 | 
60 |     if (parser.parsed("ram")) {
61 |         uint64_t ram =
62 |             static_cast<uint64_t>(parser.get<double>("ram") * essentials::GiB);
63 |         if (ram > available_ram) {
64 |             std::cerr << "Warning: this machine has "
65 |                       << available_ram / essentials::GiB << " GiB of RAM."
66 |                       << std::endl;
67 |             std::cerr << "Thus, using default amount of "
68 |                       << config.RAM / essentials::GiB << " GiB" << std::endl;
69 |         } else {
70 |             config.RAM = ram;
71 |         }
72 |     }
73 |     if (parser.parsed("tmp_dir")) {
74 |         config.tmp_dirname = parser.get<std::string>("tmp_dir");
75 |     }
76 |     if (parser.parsed("num_threads")) {
77 |         config.num_threads = parser.get<size_t>("num_threads");
78 |         if (config.num_threads == 0) {
79 |             std::cerr << "number of threads must be > 0" << std::endl;
80 |             return 1;
81 |         }
82 |     }
83 |     if (parser.parsed("compress_blocks")) {
84 |         config.compress_blocks = parser.get<bool>("compress_blocks");
85 |     }
86 |     if (parser.parsed("out")) {
87 |         config.output_filename = parser.get<std::string>("out");
88 |     }
89 | 
90 |     config.vocab_tmp_subdirname = config.tmp_dirname + "/vocab";
91 |     bool ok = essentials::create_directory(config.tmp_dirname) and
92 |               essentials::create_directory(config.vocab_tmp_subdirname);
93 |     if (not ok) return 1;
94 | 
95 |     std::cerr << "counting with " << config.RAM << "/" << available_ram
96 |               << " bytes of RAM"
97 |               << " (" << config.RAM * 100.0 / available_ram << "%)\n";
98 | 
99 |     std::ios_base::sync_with_stdio(false);
100 |     std::cin.tie(NULL);
101 | 
102 |     counter c(config);
103 |     c.run();
104 |     c.print_stats();
105 | 
106 |     return 0;
107 | }
108 | 
--------------------------------------------------------------------------------
/src/estimate.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <unistd.h>
3 | 
4 | #include "../external/tongrams/external/cmd_line_parser/include/parser.hpp"
5 | 
6 | #include "configuration.hpp"
7 | #include "estimation.hpp"
8 | 
9 | int main(int argc, char** argv) {
10 |     using namespace tongrams;
11 | 
12 |     configuration config;
13 |     cmd_line_parser::parser parser(argc, argv);
14 |     parser.add("text_filename", "Input text filename.");
15 |     parser.add("order", "Language model order. It must be > 2 and <= " +
16 |                             std::to_string(global::max_order) + ".");
17 |     parser.add("ram",
18 |                "Amount of RAM dedicated to estimation in GiB. Default is " +
19 |                    std::to_string(static_cast<uint64_t>(
20 |                        static_cast<double>(config.RAM) / essentials::GiB)) +
21 |                    " GiB.",
22 |                "--ram", false);
23 |     parser.add(
24 |         "tmp_dir",
25 |         "Temporary directory used for estimation. Default is directory '" +
26 |             constants::default_tmp_dirname + "'.",
27 |         "--tmp", false);
28 |     parser.add("num_threads",
29 |                "Number of threads. Default is " +
30 |                    std::to_string(config.num_threads) + " on this machine.",
31 |                "--thr", false);
32 |     parser.add("compress_blocks",
33 |                "Compress temporary files during estimation. Default is " +
34 |                    (config.compress_blocks ? std::string("true")
35 |                                            : std::string("false")) +
36 |                    ".",
37 |                "--compress_blocks", true);
38 |     // parser.add("p",
39 |     //            "Probability quantization bits.",
40 |     //            "--p", false);
41 |     // parser.add("b",
42 |     //            "Backoff quantization bits.",
43 |     //            "--b", false);
44 |     parser.add("out",
45 |                "Output filename. Default is '" +
46 |                    constants::default_output_filename + "'.",
47 |                "--out", false);
48 |     if (!parser.parse()) return 1;
49 | 
50 |     config.text_filename = parser.get<std::string>("text_filename");
51 |     if (!util::exists(config.text_filename.c_str())) {
52 |         std::cerr << "Error: corpus file does not exist" << std::endl;
53 |         return 1;
54 |     }
55 | 
56 |     config.text_size = util::file_size(config.text_filename.c_str());
57 |     std::cerr << "reading from '" << config.text_filename << "' ("
58 |               << config.text_size << " bytes)" << std::endl;
59 |     config.max_order = parser.get<uint64_t>("order");
60 |     if (config.max_order <= 2 or config.max_order > global::max_order) {
61 |         std::cerr << "invalid language model order" << std::endl;
62 |         return 1;
63 |     }
64 | 
65 |     size_t available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
66 | 
67 |     if (parser.parsed("ram")) {
68 |         uint64_t ram =
69 |             static_cast<uint64_t>(parser.get<double>("ram") * essentials::GiB);
70 |         if (ram > available_ram) {
71 |             std::cerr << "Warning: this machine has "
72 |                       << available_ram / essentials::GiB << " GiB of RAM."
73 |                       << std::endl;
74 |             std::cerr << "Thus, using default amount of "
75 |                       << config.RAM / essentials::GiB << " GiB" << std::endl;
76 |         } else {
77 |             config.RAM = ram;
78 |         }
79 |     }
80 |     if (parser.parsed("tmp_dir")) {
81 |         config.tmp_dirname = parser.get<std::string>("tmp_dir");
82 |     }
83 |     if (parser.parsed("num_threads")) {
84 |         config.num_threads = parser.get<size_t>("num_threads");
85 |         if (config.num_threads == 0) {
86 |             std::cerr << "number of threads must be > 0" << std::endl;
87 |             return 1;
88 |         }
89 |     }
90 |     if (parser.parsed("compress_blocks")) {
91 |         config.compress_blocks = parser.get<bool>("compress_blocks");
92 |     }
93 |     if (parser.parsed("out")) {
94 |         config.output_filename = parser.get<std::string>("out");
95 |     }
96 | 
97 |     config.vocab_tmp_subdirname = config.tmp_dirname + "/vocab";
98 |     bool ok = essentials::create_directory(config.tmp_dirname) and
99 |               essentials::create_directory(config.vocab_tmp_subdirname);
100 |     if (not ok) return 1;
101 | 
102 |     std::cerr << "estimating with " << config.RAM << "/" << available_ram
103 |               << " bytes of RAM"
104 |               << " (" << config.RAM * 100.0 / available_ram << "%)\n";
105 | 
106 |     std::ios_base::sync_with_stdio(false);
107 |     std::cin.tie(NULL);
108 | 
109 |     estimation e(config);
110 |     e.run();
111 |     e.print_stats();
112 | 
113 |     return 0;
114 | }
115 | 
--------------------------------------------------------------------------------
/test_data/1Billion.1M.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jermp/tongrams_estimation/d63e781983d6774f68f21f7ccb6396d2761b2131/test_data/1Billion.1M.gz
--------------------------------------------------------------------------------
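Both drivers expose the same command line surface: two positional arguments (text filename and order) plus the optional --ram, --tmp, --thr, --compress_blocks and --out switches parsed above. An illustrative session on the bundled sample (the binary names and values are assumptions; they depend on the CMake setup):

    gunzip -k test_data/1Billion.1M.gz
    ./count test_data/1Billion.1M 5 --ram 4 --thr 8 --tmp tmp_dir --out 5grams.counts
    ./estimate test_data/1Billion.1M 5 --ram 4 --compress_blocks --out estimated.out

Both tools reject orders outside 2 < order <= global::max_order, and a --ram request exceeding physical memory falls back to the default budget rather than failing.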