├── .clang-format
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── include
    ├── adjusting
    │   ├── adjusting.hpp
    │   └── adjusting_writer.hpp
    ├── comparators.hpp
    ├── configuration.hpp
    ├── constants.hpp
    ├── counter.hpp
    ├── counting
    │   ├── counting.hpp
    │   ├── counting_common.hpp
    │   ├── counting_reader.hpp
    │   ├── counting_writer.hpp
    │   ├── hash_utils.hpp
    │   ├── ngrams_hash_block.hpp
    │   ├── parallel_radix_sort.hpp
    │   └── sliding_window.hpp
    ├── estimation.hpp
    ├── front_coding.hpp
    ├── last
    │   ├── estimation_builder.hpp
    │   ├── index_types.hpp
    │   ├── last.hpp
    │   └── write.hpp
    ├── merge_utils.hpp
    ├── merging
    │   ├── merging.hpp
    │   └── merging_writer.hpp
    ├── ngrams_block.hpp
    ├── statistics.hpp
    ├── stream.hpp
    ├── tmp.hpp
    ├── util.hpp
    ├── util_types.hpp
    └── vocabulary.hpp
├── src
    ├── count.cpp
    └── estimate.cpp
└── test_data
    └── 1Billion.1M.gz


/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | Language:        Cpp
  3 | # BasedOnStyle:  Google
  4 | AccessModifierOffset: -4
  5 | AlignAfterOpenBracket: Align
  6 | AlignConsecutiveAssignments: false
  7 | AlignConsecutiveDeclarations: false
  8 | AlignEscapedNewlines: Left
  9 | AlignOperands:   true
 10 | AlignTrailingComments: true
 11 | AllowAllParametersOfDeclarationOnNextLine: true
 12 | AllowShortBlocksOnASingleLine: false
 13 | AllowShortCaseLabelsOnASingleLine: false
 14 | AllowShortFunctionsOnASingleLine: Empty
 15 | AllowShortIfStatementsOnASingleLine: true
 16 | AllowShortLoopsOnASingleLine: true
 17 | AlwaysBreakAfterDefinitionReturnType: None
 18 | AlwaysBreakAfterReturnType: None
 19 | AlwaysBreakBeforeMultilineStrings: true
 20 | AlwaysBreakTemplateDeclarations: Yes
 21 | BinPackArguments: true
 22 | BinPackParameters: true
 23 | BraceWrapping:
 24 |   AfterClass:      false
 25 |   AfterControlStatement: false
 26 |   AfterEnum:       false
 27 |   AfterFunction:   false
 28 |   AfterNamespace:  false
 29 |   AfterObjCDeclaration: false
 30 |   AfterStruct:     false
 31 |   AfterUnion:      false
 32 |   AfterExternBlock: false
 33 |   BeforeCatch:     false
 34 |   BeforeElse:      false
 35 |   IndentBraces:    false
 36 |   SplitEmptyFunction: true
 37 |   SplitEmptyRecord: true
 38 |   SplitEmptyNamespace: true
 39 | BreakBeforeBinaryOperators: None
 40 | BreakBeforeBraces: Attach
 41 | BreakBeforeInheritanceComma: false
 42 | BreakInheritanceList: BeforeComma
 43 | BreakBeforeTernaryOperators: true
 44 | BreakConstructorInitializersBeforeComma: true
 45 | BreakConstructorInitializers: BeforeComma
 46 | BreakAfterJavaFieldAnnotations: false
 47 | BreakStringLiterals: true
 48 | ColumnLimit:     80
 49 | CommentPragmas:  '^ IWYU pragma:'
 50 | CompactNamespaces: false
 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
 52 | ConstructorInitializerIndentWidth: 4
 53 | ContinuationIndentWidth: 4
 54 | Cpp11BracedListStyle: true
 55 | DerivePointerAlignment: false
 56 | DisableFormat:   false
 57 | ExperimentalAutoDetectBinPacking: false
 58 | FixNamespaceComments: true
 59 | ForEachMacros:
 60 |   - foreach
 61 |   - Q_FOREACH
 62 |   - BOOST_FOREACH
 63 | IncludeBlocks:   Preserve
 64 | IncludeCategories:
 65 |   - Regex:           '^<ext/.*\.h>'
 66 |     Priority:        2
 67 |   - Regex:           '^<.*\.h>'
 68 |     Priority:        1
 69 |   - Regex:           '^<.*'
 70 |     Priority:        2
 71 |   - Regex:           '.*'
 72 |     Priority:        3
 73 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 74 | IndentCaseLabels: true
 75 | IndentPPDirectives: None
 76 | IndentWidth:     4
 77 | IndentWrappedFunctionNames: false
 78 | JavaScriptQuotes: Leave
 79 | JavaScriptWrapImports: true
 80 | KeepEmptyLinesAtTheStartOfBlocks: false
 81 | MacroBlockBegin: ''
 82 | MacroBlockEnd:   ''
 83 | MaxEmptyLinesToKeep: 1
 84 | NamespaceIndentation: None
 85 | ObjCBinPackProtocolList: Never
 86 | ObjCBlockIndentWidth: 2
 87 | ObjCSpaceAfterProperty: false
 88 | ObjCSpaceBeforeProtocolList: true
 89 | PenaltyBreakAssignment: 2
 90 | PenaltyBreakBeforeFirstCallParameter: 1
 91 | PenaltyBreakComment: 300
 92 | PenaltyBreakFirstLessLess: 120
 93 | PenaltyBreakString: 1000
 94 | PenaltyBreakTemplateDeclaration: 10
 95 | PenaltyExcessCharacter: 1000000
 96 | PenaltyReturnTypeOnItsOwnLine: 200
 97 | PointerAlignment: Left
 98 | RawStringFormats:
 99 |   - Language:        Cpp
100 |     Delimiters:
101 |       - cc
102 |       - CC
103 |       - cpp
104 |       - Cpp
105 |       - CPP
106 |       - 'c++'
107 |       - 'C++'
108 |     CanonicalDelimiter: ''
109 |     BasedOnStyle:    google
110 |   - Language:        TextProto
111 |     Delimiters:
112 |       - pb
113 |       - PB
114 |       - proto
115 |       - PROTO
116 |     EnclosingFunctions:
117 |       - EqualsProto
118 |       - EquivToProto
119 |       - PARSE_PARTIAL_TEXT_PROTO
120 |       - PARSE_TEST_PROTO
121 |       - PARSE_TEXT_PROTO
122 |       - ParseTextOrDie
123 |       - ParseTextProtoOrDie
124 |     CanonicalDelimiter: ''
125 |     BasedOnStyle:    google
126 | ReflowComments:  true
127 | SortIncludes:    false
128 | SortUsingDeclarations: true
129 | SpaceAfterCStyleCast: false
130 | SpaceAfterTemplateKeyword: true
131 | SpaceBeforeAssignmentOperators: true
132 | SpaceBeforeCpp11BracedList: false
133 | SpaceBeforeCtorInitializerColon: true
134 | SpaceBeforeInheritanceColon: true
135 | SpaceBeforeParens: ControlStatements
136 | SpaceBeforeRangeBasedForLoopColon: true
137 | SpaceInEmptyParentheses: false
138 | SpacesBeforeTrailingComments: 2
139 | SpacesInAngles:  false
140 | SpacesInContainerLiterals: true
141 | SpacesInCStyleCastParentheses: false
142 | SpacesInParentheses: false
143 | SpacesInSquareBrackets: false
144 | Standard:        Auto
145 | StatementMacros:
146 |   - Q_UNUSED
147 |   - QT_REQUIRE_VERSION
148 | TabWidth:        8
149 | UseTab:          Never
150 | ...
151 | 
152 | 
153 | 
154 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .DS_Store
3 | python/files.txt
4 | python/build


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/tongrams"]
2 | 	path = external/tongrams
3 | 	url = https://github.com/jermp/tongrams.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project(TONGRAMS_ESTIMATION)
 3 | 
 4 | if(CMAKE_BUILD_TYPE MATCHES Debug)
 5 |   MESSAGE(STATUS "DEBUG defined")
 6 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDEBUG")
 7 | endif()
 8 | 
 9 | if(NOT CMAKE_BUILD_TYPE)
10 |   set(CMAKE_BUILD_TYPE "Release")
11 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG")
12 | endif()
13 | 
14 | if(LSD_RADIX_SORT)
15 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLSD_RADIX_SORT")
16 |   MESSAGE(STATUS "Sorting with LSD_RADIX_SORT")
17 | endif()
18 | 
19 | MESSAGE(STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE})
20 | 
21 | 
22 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
23 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
24 | endif ()
25 | 
26 | if (UNIX AND NOT APPLE)
27 |   MESSAGE(STATUS "Compiling with openmp")
28 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") # for __gnu_parallel::sort
29 | endif()
30 | 
31 | if (UNIX)
32 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
33 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
34 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
35 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
36 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces")
37 | 
38 |   if(TONGRAMS_USE_SANITIZERS)
39 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
40 |   endif()
41 | 
42 | endif()
43 | 
44 | find_package(Boost COMPONENTS iostreams filesystem thread REQUIRED)
45 | include_directories(${Boost_INCLUDE_DIRS})
46 | link_directories(${Boost_LIBRARY_DIRS})
47 | 
48 | include_directories(${TONGRAMS_ESTIMATION_SOURCE_DIR}/include)
49 | include_directories(${TONGRAMS_ESTIMATION_SOURCE_DIR}/external/tongrams/include)
50 | 
51 | add_subdirectory(external/tongrams)
52 | 
53 | file(GLOB SRC_SOURCES src/*.cpp)
54 | foreach(SRC ${SRC_SOURCES})
55 |   get_filename_component (SRC_NAME ${SRC} NAME_WE) # without extension
56 |   add_executable(${SRC_NAME} ${SRC})
57 |   target_link_libraries(${SRC_NAME} ${Boost_LIBRARIES})
58 | endforeach(SRC)
59 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright 2017-2019 Giulio Ermanno Pibiri
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included
13 | in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 | OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Tongrams Estimation
 2 | ===================
 3 | 
 4 | Modified [Kneser-Ney](https://en.wikipedia.org/wiki/Kneser%E2%80%93Ney_smoothing) language model estimation powered by [Tongrams](https://github.com/jermp/tongrams).
 5 | 
 6 | This C++ library implements the 1-Sort algorithm described in the paper
 7 | [*Handling Massive N-Gram Datasets Efficiently*](http://pages.di.unipi.it/pibiri/papers/TOIS19.pdf) by Giulio Ermanno Pibiri and Rossano Venturini, published in ACM TOIS, 2019 [1].
 8 | 
 9 | ### Compiling the code
10 | 
11 | 	git clone --recursive https://github.com/jermp/tongrams_estimation.git
12 | 	mkdir -p build; cd build
13 | 	cmake ..
14 | 	make -j
15 | 
16 | ### Sample usage
17 | 
18 | After installation of dependencies and compilation of the code, you can use
19 | the sample text (first 1M lines from the 1Billion corpus; see the paper for dataset
20 | information) in the directory
21 | `test_data`. The text is gzipped, so it must be first uncompressed.
22 | 
23 | 	cd build
24 | 	gunzip ../test_data/1Billion.1M.gz
25 | 
26 | ##### 1. Estimation
27 | 
28 | Then you can estimate a Kneser-Ney language model of order 5 (using 25% of RAM and whose index is serialized to the file `index.bin`) as follows.
29 | 
30 |     ./estimate ../test_data/1Billion.1M 5 --tmp tmp_dir --ram 0.25 --out index.bin
31 | 
32 | ##### 2. Computing Perplexity
33 | 
34 | With the index built and serialized to `index.bin` you can compute
35 | the perplexity score with:
36 | 
37 |     ./external/tongrams/score index.bin ../test_data/1Billion.1M
38 | 
39 | ##### 3. Counting N-Grams
40 | 
41 | You can also extract n-gram counts. An example follows below, for 3-grams.
42 | 
43 |     ./count ../test_data/1Billion.1M 3 --tmp tmp_dir --ram 0.25 --out 3-grams
44 | 
45 | The output file `3-grams` will list all extracted 3-grams sorted lexicographically
46 | in the following standard format:
47 | 
48 | 	<total_number_of_rows>
49 | 	<gram1> <TAB> <count1>
50 | 	<gram2> <TAB> <count2>
51 | 	<gram3> <TAB> <count3>
52 | 	...
53 | 
54 | where each `<gram>` is a sequence of words separated by a whitespace character.
55 | 
56 | ### Dependencies
57 | 
58 | 1. [boost](https://www.boost.org/)
59 | 2. [sparsehash](https://github.com/sparsehash/sparsehash)
60 | 
61 | ### Bibliography
62 | 
63 | [1] Pibiri, Giulio Ermanno, and Rossano Venturini. "Handling Massive N-Gram Datasets Efficiently." ACM Transactions on Information Systems (TOIS) 37.2 (2019): 1-41.
64 | 


--------------------------------------------------------------------------------
/include/adjusting/adjusting.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "util.hpp"
  4 | #include "constants.hpp"
  5 | #include "stream.hpp"
  6 | #include "statistics.hpp"
  7 | #include "merge_utils.hpp"
  8 | #include "adjusting_writer.hpp"
  9 | 
 10 | namespace tongrams {
 11 | 
 12 | template <typename StreamGenerator>
 13 | struct adjusting {
 14 |     typedef cursor_comparator<context_order_comparator_type>
 15 |         cursor_comparator_type;
 16 | 
 17 |     adjusting(configuration const& config, tmp::data& tmp_data,
 18 |               tmp::statistics& tmp_stats, statistics& stats)
 19 |         : m_config(config)
 20 |         , m_tmp_data(tmp_data)
 21 |         , m_stats(stats)
 22 |         , m_stats_builder(config, tmp_data, tmp_stats)
 23 |         , m_writer(config, constants::file_extension::merged)
 24 |         , m_comparator(config.max_order)
 25 |         , m_cursors(cursor_comparator_type(config.max_order))
 26 |         , m_CPU_time(0.0)
 27 |         , m_I_time(0.0)
 28 |         , m_O_time(0.0)
 29 |         , m_total_smooth_time(0.0)
 30 |         , m_total_time_waiting_for_disk(0.0) {
 31 |         auto start = clock_type::now();
 32 |         size_t vocab_size = m_stats.num_ngrams(1);
 33 |         if (!vocab_size) {
 34 |             throw std::runtime_error("vocabulary size must not be 0");
 35 |         }
 36 |         std::cerr << "vocabulary size: " << vocab_size << std::endl;
 37 |         tmp_stats.resize(1, vocab_size);
 38 |         m_stats_builder.init(vocab_size);
 39 |         auto end = clock_type::now();
 40 |         std::chrono::duration<double> elapsed = end - start;
 41 |         m_CPU_time += elapsed.count();
 42 |     }
 43 | 
 44 |     typedef typename StreamGenerator::block_type input_block_type;
 45 | 
 46 |     void print_stats() const {
 47 |         std::cout << "\"CPU\":" << m_CPU_time << ", ";
 48 |         std::cout << "\"I\":" << m_I_time << ", ";
 49 |         std::cout << "\"O\":" << m_O_time << ", ";
 50 |     }
 51 | 
 52 |     void run() {
 53 |         auto start = clock_type::now();
 54 |         std::vector<std::string> filenames;
 55 |         {
 56 |             essentials::directory tmp_dir(m_config.tmp_dirname);
 57 |             for (auto const& filename : tmp_dir) {
 58 |                 if (filename.extension == constants::file_extension::counts) {
 59 |                     filenames.push_back(filename.fullpath);
 60 |                 }
 61 |             }
 62 |         }
 63 | 
 64 |         size_t num_files_to_merge = filenames.size();
 65 |         assert(num_files_to_merge > 0);
 66 |         std::cerr << "merging " << num_files_to_merge << " files" << std::endl;
 67 | 
 68 |         uint64_t record_size = ngrams_block::record_size(m_config.max_order);
 69 |         uint64_t min_load_size = m_config.RAM / (2 * num_files_to_merge + 1) /
 70 |                                  record_size * record_size;
 71 |         uint64_t default_load_size =
 72 |             (64 * essentials::MiB) / record_size * record_size;
 73 |         uint64_t load_size = default_load_size;
 74 |         if (min_load_size < default_load_size) {
 75 |             std::cerr << "\tusing min. load size of " << min_load_size
 76 |                       << " because not enough RAM is available" << std::endl;
 77 |             load_size = min_load_size;
 78 |         }
 79 |         assert(load_size % record_size == 0);
 80 | 
 81 |         for (auto const& filename : filenames) {
 82 |             m_stream_generators.emplace_back(m_config.max_order);
 83 |             auto& gen = m_stream_generators.back();
 84 |             gen.open(filename);
 85 |             assert(gen.size() == 0);
 86 |             gen.fetch_next_block(load_size);
 87 |         }
 88 | 
 89 |         auto get_block = [](StreamGenerator& gen) {
 90 |             auto* block = gen.get_block();
 91 |             assert(block->template is_sorted<context_order_comparator_type>(
 92 |                 block->begin(), block->end()));
 93 |             return block;
 94 |         };
 95 | 
 96 |         assert(m_cursors.empty());
 97 |         for (uint64_t k = 0; k != m_stream_generators.size(); ++k) {
 98 |             auto& gen = m_stream_generators[k];
 99 |             auto* block = get_block(gen);
100 |             cursor<typename input_block_type::iterator> c(block->begin(),
101 |                                                           block->end(), k);
102 |             m_cursors.push(c);
103 |         }
104 | 
105 |         uint64_t num_ngrams_per_block = load_size / record_size;
106 |         std::cerr << "num_ngrams_per_block = " << num_ngrams_per_block
107 |                   << " ngrams" << std::endl;
108 | 
109 |         uint8_t N = m_config.max_order;
110 |         ngrams_block result(N);
111 |         result.resize_memory(num_ngrams_per_block);
112 |         result.reserve_index(num_ngrams_per_block);
113 |         uint64_t limit = num_ngrams_per_block;
114 | 
115 |         auto compute_left_extensions = [&]() {
116 |             assert(result.template is_sorted<context_order_comparator_type>(
117 |                 result.begin(), result.end()));
118 |             auto start = clock_type::now();
119 |             m_stats_builder.compute_left_extensions(result.begin(),
120 |                                                     result.size());
121 |             auto end = clock_type::now();
122 |             std::chrono::duration<double> elapsed = end - start;
123 |             m_total_smooth_time += elapsed.count();
124 |         };
125 | 
126 |         uint64_t num_Ngrams = 0;
127 |         uint64_t prev_offset = 0;
128 | 
129 |         auto save_offsets = [&]() {
130 |             uint64_t offset = num_Ngrams - prev_offset;
131 |             std::vector<uint64_t> offsets = {offset};
132 |             m_tmp_data.blocks_offsets.push_back(std::move(offsets));
133 |             prev_offset = num_Ngrams;
134 |             limit = num_Ngrams + num_ngrams_per_block;
135 |         };
136 | 
137 |         m_writer.start();
138 | 
139 |         while (!m_cursors.empty()) {
140 |             auto& top = m_cursors.top();
141 |             auto min = *(top.range.begin);
142 | 
143 |             if (!result.size()) {
144 |                 result.push_back(min.data, min.data + N, *(min.value(N)));
145 |                 ++num_Ngrams;
146 |             } else {
147 |                 auto& back = result.back();
148 |                 bool equal = equal_to(min.data, back.data, sizeof_ngram(N));
149 | 
150 |                 if (not equal) {
151 |                     if (num_Ngrams >= limit and
152 |                         compare_i<ngram_pointer>(
153 |                             min, back, m_comparator.begin()) > 0  // greater
154 |                     ) {
155 |                         save_offsets();
156 |                     }
157 | 
158 |                     if (result.size() == num_ngrams_per_block) {
159 |                         compute_left_extensions();
160 |                         auto start = clock_type::now();
161 |                         while (m_writer.size() > 0)
162 |                             ;  // wait for flush
163 |                         auto end = clock_type::now();
164 |                         std::chrono::duration<double> elapsed = end - start;
165 |                         m_total_time_waiting_for_disk += elapsed.count();
166 | 
167 |                         m_writer.push(result);
168 | 
169 |                         result.init(N);
170 |                         result.resize_memory(num_ngrams_per_block);
171 |                         result.reserve_index(num_ngrams_per_block);
172 |                         assert(result.empty());
173 |                     }
174 | 
175 |                     result.push_back(min.data, min.data + N, *(min.value(N)));
176 |                     ++num_Ngrams;
177 | 
178 |                 } else {
179 |                     *(back.value(N)) += *(min.value(N));
180 |                 }
181 |             }
182 | 
183 |             ++(top.range.begin);
184 | 
185 |             if (top.range.begin == top.range.end) {
186 |                 auto& gen = m_stream_generators[top.index];
187 |                 gen.release_block();
188 |                 if (gen.eos()) {
189 |                     assert(gen.empty());
190 |                     gen.close_and_remove();
191 |                     m_cursors.pop();
192 |                 } else {
193 |                     gen.fetch_next_block(load_size);
194 |                     auto* block = get_block(gen);
195 |                     top.range.begin = block->begin();
196 |                     top.range.end = block->end();
197 |                 }
198 |             }
199 | 
200 |             m_cursors.heapify();
201 |         }
202 | 
203 |         std::cerr << "MERGE DONE: " << num_Ngrams << " N-grams" << std::endl;
204 |         std::cerr << "\ttime waiting for disk = "
205 |                   << m_total_time_waiting_for_disk << " [sec]\n";
206 |         std::cerr << "\tsmoothing time: " << m_total_smooth_time << " [sec]"
207 |                   << std::endl;
208 | 
209 |         save_offsets();
210 |         compute_left_extensions();
211 |         m_stats_builder.finalize();
212 | 
213 |         auto end = clock_type::now();
214 |         std::chrono::duration<double> elapsed = end - start;
215 |         m_CPU_time += elapsed.count();
216 | 
217 |         m_writer.push(result);
218 |         m_writer.terminate();
219 | 
220 |         m_CPU_time -= m_total_time_waiting_for_disk;
221 |         for (auto& sg : m_stream_generators) m_I_time += sg.I_time();
222 | 
223 |         start = clock_type::now();
224 |         m_stats_builder.build(m_stats);
225 |         end = clock_type::now();
226 |         elapsed = end - start;
227 |         m_CPU_time += elapsed.count();
228 |         m_CPU_time -= m_I_time;
229 |         m_O_time += m_writer.time();
230 |     }
231 | 
232 | private:
233 |     configuration const& m_config;
234 |     tmp::data& m_tmp_data;
235 |     statistics& m_stats;
236 |     statistics::builder m_stats_builder;
237 |     std::deque<StreamGenerator> m_stream_generators;
238 |     adjusting_writer m_writer;
239 |     context_order_comparator_type m_comparator;
240 | 
241 |     min_heap<cursor<typename input_block_type::iterator>,
242 |              cursor_comparator_type>
243 |         m_cursors;
244 | 
245 |     double m_CPU_time;
246 |     double m_I_time;
247 |     double m_O_time;
248 |     double m_total_smooth_time;
249 |     double m_total_time_waiting_for_disk;
250 | };
251 | 
252 | }  // namespace tongrams
253 | 


--------------------------------------------------------------------------------
/include/adjusting/adjusting_writer.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "configuration.hpp"
  4 | #include "tmp.hpp"
  5 | 
  6 | namespace tongrams {
  7 | 
  8 | struct adjusting_writer {
  9 |     adjusting_writer(configuration const& config,
 10 |                      std::string const& file_extension)
 11 |         : m_num_flushes(0), m_time(0.0) {
 12 |         m_buffer.open();
 13 |         std::string output_filename =
 14 |             filename_generator(config.tmp_dirname, "", file_extension)();
 15 |         m_os.open(output_filename.c_str(), std::ofstream::binary |
 16 |                                                std::ofstream::ate |
 17 |                                                std::ofstream::app);
 18 |     }
 19 | 
 20 |     ~adjusting_writer() {
 21 |         if (!m_buffer.empty()) {
 22 |             std::cerr << "Error: some data still need to be written"
 23 |                       << std::endl;
 24 |             std::terminate();
 25 |         }
 26 |     }
 27 | 
 28 |     void start() {
 29 |         m_thread = std::thread(&adjusting_writer::run, this);
 30 |     }
 31 | 
 32 |     void terminate() {
 33 |         m_buffer.lock();
 34 |         m_buffer.close();
 35 |         m_buffer.unlock();
 36 |         if (m_thread.joinable()) m_thread.join();
 37 |         assert(!m_buffer.active());
 38 |         while (!m_buffer.empty()) flush();
 39 |         m_os.close();
 40 |         std::cerr << "\tadjusting_writer thread stats:\n";
 41 |         std::cerr << "\tflushed blocks: " << m_num_flushes << "\n";
 42 |         std::cerr << "\twrite time: " << m_time << "\n";
 43 |     }
 44 | 
 45 |     void push(ngrams_block& block) {
 46 |         m_buffer.lock();
 47 |         m_buffer.push(block);
 48 |         m_buffer.unlock();
 49 |     }
 50 | 
 51 |     size_t size() {
 52 |         m_buffer.lock();
 53 |         size_t s = m_buffer.size();
 54 |         m_buffer.unlock();
 55 |         return s;
 56 |     }
 57 | 
 58 |     double time() const {
 59 |         return m_time;
 60 |     }
 61 | 
 62 | private:
 63 |     semi_sync_queue<ngrams_block> m_buffer;
 64 |     std::ofstream m_os;
 65 |     std::thread m_thread;
 66 |     uint64_t m_num_flushes;
 67 |     double m_time;
 68 | 
 69 |     void run() {
 70 |         while (m_buffer.active()) flush();
 71 |     }
 72 | 
 73 |     void flush() {
 74 |         m_buffer.lock();
 75 |         if (m_buffer.empty()) {
 76 |             m_buffer.unlock();
 77 |             return;
 78 |         }
 79 |         auto& block = m_buffer.pick();
 80 |         m_buffer.unlock();
 81 | 
 82 |         auto start = clock_type::now();
 83 |         block.write_memory(m_os);
 84 |         auto end = clock_type::now();
 85 |         std::chrono::duration<double> elapsed = end - start;
 86 |         m_time += elapsed.count();
 87 | 
 88 |         block.release();
 89 | 
 90 |         m_buffer.lock();
 91 |         m_buffer.pop();
 92 |         m_buffer.unlock();
 93 |         ++m_num_flushes;
 94 |         if (m_num_flushes % 20 == 0) {
 95 |             std::cerr << "flushed " << m_num_flushes << " blocks" << std::endl;
 96 |         }
 97 |     }
 98 | };
 99 | 
100 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/comparators.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | namespace tongrams {
  4 | 
  5 | template <typename T>
  6 | int compare_i(T const& x, T const& y, int i) {
  7 |     if (x[i] != y[i]) {
  8 |         return x[i] < y[i] ? -1 : 1;
  9 |     }
 10 |     return 0;
 11 | }
 12 | 
 13 | template <typename T>
 14 | struct prefix_order_comparator {
 15 |     prefix_order_comparator() {}
 16 | 
 17 |     void init(uint8_t N) {
 18 |         m_N = N;
 19 |     }
 20 | 
 21 |     prefix_order_comparator(uint8_t N) {
 22 |         init(N);
 23 |     }
 24 | 
 25 |     int order() const {
 26 |         return m_N;
 27 |     }
 28 | 
 29 |     void swap(prefix_order_comparator& other) {
 30 |         std::swap(m_N, other.m_N);
 31 |     }
 32 | 
 33 |     bool operator()(T const& x, T const& y) const {
 34 |         return compare(x, y) < 0;
 35 |     }
 36 | 
 37 |     inline int begin() const {
 38 |         return 0;
 39 |     }
 40 | 
 41 |     inline int end() const {  // last valid index, not one-past the end
 42 |         return m_N - 1;
 43 |     }
 44 | 
 45 |     inline void next(int& i) const {
 46 |         ++i;
 47 |     }
 48 | 
 49 |     inline void advance(int& i, int n) const {
 50 |         i += n;
 51 |     }
 52 | 
 53 |     // returns the length of lcp(x,y)
 54 |     int lcp(T const& x, T const& y) const {
 55 |         for (int i = begin(); i != end(); next(i)) {
 56 |             int cmp = compare_i(x, y, i);
 57 |             if (cmp != 0) return i;
 58 |         }
 59 |         return m_N;
 60 |     }
 61 | 
 62 |     int compare(T const& x, T const& y) const {
 63 |         for (int i = begin(); i < m_N; ++i) {
 64 |             int cmp = compare_i(x, y, i);
 65 |             if (cmp != 0) return cmp;
 66 |         }
 67 |         return 0;
 68 |     }
 69 | 
 70 |     bool equals(T const& x, T const& y) const {
 71 |         return compare(x, y) == 0;
 72 |     }
 73 | 
 74 | private:
 75 |     int m_N;
 76 | };
 77 | 
 78 | template <typename T>
 79 | struct context_order_comparator {
 80 |     context_order_comparator() {}
 81 | 
 82 |     void init(uint8_t N) {
 83 |         m_N = N;
 84 |     }
 85 | 
 86 |     context_order_comparator(uint8_t N) {
 87 |         init(N);
 88 |     }
 89 | 
 90 |     int order() const {
 91 |         return m_N;
 92 |     }
 93 | 
 94 |     void swap(context_order_comparator& other) {
 95 |         std::swap(m_N, other.m_N);
 96 |     }
 97 | 
 98 |     bool operator()(T const& x, T const& y) const {
 99 |         return compare(x, y) < 0;
100 |     }
101 | 
102 |     inline int begin() const {
103 |         return m_N - 2;
104 |     }
105 | 
106 |     inline int end() const {  // last valid index, not one-past the end
107 |         return m_N - 1;
108 |     }
109 | 
110 |     inline void next(int& i) const {
111 |         if (i == 0) {
112 |             i = end();
113 |         } else {
114 |             --i;
115 |         }
116 |     }
117 | 
118 |     inline void advance(int& i, int n) const {
119 |         assert(n <= m_N);
120 |         i -= n;  // i -= n % m_N to fall back
121 |         if (i < 0) {
122 |             i += m_N;
123 |         }
124 |     }
125 | 
126 |     int lcp(T const& x, T const& y) const {
127 |         int l = 0;  // length of lcp(x,y)
128 |         for (int i = begin(); i != end(); next(i)) {
129 |             int cmp = compare_i(x, y, i);
130 |             if (cmp != 0) return l;
131 |             ++l;
132 |         }
133 |         return l;
134 |     }
135 | 
136 |     int compare(T const& x, T const& y) const {
137 |         for (int i = int(begin()); i != -1; --i) {
138 |             int cmp = compare_i(x, y, i);
139 |             if (cmp != 0) return cmp;
140 |         }
141 |         return compare_i(x, y, begin() + 1);
142 |     }
143 | 
144 |     bool equals(T const& x, T const& y) const {
145 |         return compare(x, y) == 0;
146 |     }
147 | 
148 | private:
149 |     int m_N;
150 | };
151 | 
152 | }  // namespace tongrams
153 | 


--------------------------------------------------------------------------------
/include/configuration.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <thread>
 4 | 
 5 | #include "constants.hpp"
 6 | 
 7 | namespace tongrams {
 8 | 
 9 | struct configuration {
10 |     configuration()
11 |         : RAM(1 * essentials::GiB)
12 |         , max_order(5)
13 |         , num_threads(std::thread::hardware_concurrency())
14 |         , text_size(0)
15 |         , tmp_dirname(constants::default_tmp_dirname)
16 |         , vocab_tmp_subdirname(tmp_dirname + "/vocab")
17 |         , vocab_filename("/vocabulary")
18 |         , output_filename(constants::default_output_filename)
19 |         , compress_blocks(false)
20 |         , probs_quantization_bits(global::default_probs_quantization_bits)
21 |         , backoffs_quantization_bits(
22 |               global::default_backoffs_quantization_bits) {}
23 | 
24 |     uint64_t RAM;
25 |     uint64_t max_order;
26 |     uint64_t num_threads;
27 |     uint64_t text_size;
28 |     std::string tmp_dirname;
29 |     std::string vocab_tmp_subdirname;
30 |     std::string vocab_filename;
31 |     std::string text_filename;
32 |     std::string output_filename;
33 |     bool compress_blocks;
34 |     uint8_t probs_quantization_bits;
35 |     uint8_t backoffs_quantization_bits;
36 | };
37 | 
38 | }  // namespace tongrams
39 | 


--------------------------------------------------------------------------------
/include/constants.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "util.hpp"
 4 | #include "util_types.hpp"
 5 | #include "../external/tongrams/include/utils/util_types.hpp"
 6 | 
 7 | namespace tongrams {
 8 | namespace constants {
 9 | 
10 | static const uint64_t invalid_hash = 0;
11 | 
12 | namespace file_extension {
13 | static const std::string counts("c");
14 | static const std::string merged("m");
15 | }  // namespace file_extension
16 | 
17 | static const std::string default_tmp_dirname("./tmp_dir");
18 | static const std::string default_output_filename("out.bin");
19 | 
20 | static const std::string empty_token("</>");
21 | static const word_id empty_token_word_id = 0;
22 | 
23 | static const byte_range empty_token_byte_range{
24 |     reinterpret_cast<uint8_t const*>(empty_token.c_str()),
25 |     reinterpret_cast<uint8_t const*>(empty_token.c_str()) + empty_token.size()};
26 | 
27 | }  // namespace constants
28 | }  // namespace tongrams
29 | 


--------------------------------------------------------------------------------
/include/counter.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "vocabulary.hpp"
 4 | #include "tmp.hpp"
 5 | #include "statistics.hpp"
 6 | #include "stream.hpp"
 7 | #include "counting/counting.hpp"
 8 | #include "merging/merging.hpp"
 9 | 
10 | namespace tongrams {
11 | 
12 | struct counter {
13 |     counter(configuration const& config)
14 |         : m_config(config)
15 |         , m_tmp_data()
16 |         , m_tmp_stats(config.max_order)
17 |         , m_stats(config.max_order) {
18 |         m_timings.reserve(2);
19 |         std::cout << "{";
20 |         std::cout << "\"dataset\":"
21 |                   << boost::filesystem::path(config.text_filename).stem()
22 |                   << ", ";
23 |         std::cout << "\"order\":" << config.max_order << ", ";
24 |         std::cout << "\"RAM\":" << config.RAM << ", ";
25 |         std::cout << "\"threads\":" << config.num_threads;
26 |     }
27 | 
28 |     ~counter() {
29 |         std::cout << "}" << std::endl;
30 |     }
31 | 
32 |     void run() {
33 |         if (m_config.compress_blocks) {
34 |             typedef fc::writer<prefix_order_comparator_type> block_writer_type;
35 |             run<counting<block_writer_type, prefix_order_comparator_type>>(
36 |                 "counting");
37 |         } else {
38 |             run<counting<stream::writer, prefix_order_comparator_type>>(
39 |                 "counting");
40 |         }
41 | 
42 |         m_stats.num_ngrams(1) = m_tmp_data.word_ids.size();
43 |         m_tmp_data.word_ids.clear();
44 |         // write_vocab();
45 | 
46 |         if (m_config.compress_blocks) {
47 |             run<merging<stream::compressed_stream_generator>>("merging");
48 |         } else {
49 |             run<merging<stream::uncompressed_stream_generator>>("merging");
50 |         }
51 |     }
52 | 
53 |     void print_stats() {
54 |         int step = 1;
55 |         for (auto t : m_timings) {
56 |             std::cerr << "step-" << step << ": " << t << " [sec]\n";
57 |             ++step;
58 |         }
59 |     }
60 | 
61 | private:
62 |     configuration const& m_config;
63 |     tmp::data m_tmp_data;
64 |     tmp::statistics m_tmp_stats;
65 |     statistics m_stats;
66 |     std::vector<double> m_timings;
67 | 
68 |     template <typename Step>
69 |     void run(std::string const& name) {
70 |         std::cout << ", ";
71 |         std::cout << "\"" + name + "\": {";
72 |         auto start = clock_type::now();
73 |         Step step(m_config, m_tmp_data, m_tmp_stats, m_stats);
74 |         step.run();
75 |         auto end = clock_type::now();
76 |         std::chrono::duration<double> elapsed = end - start;
77 |         double total_time = elapsed.count();
78 |         m_timings.push_back(total_time);
79 |         std::cout << "\"total\":" << total_time;
80 |         std::cout << "}";
81 |     }
82 | 
83 |     // std::function<void(void)> write_vocab = [&]() {
84 |     //     std::ofstream os(m_config.vocab_tmp_subdirname +
85 |     //                      m_config.vocab_filename);
86 |     //     size_t vocab_size = m_stats.num_ngrams(1);
87 |     //     vocabulary vocab;
88 |     //     m_tmp_data.vocab_builder.build(vocab);
89 |     //     for (size_t id = 0; id != vocab_size; ++id) {
90 |     //         util::write(os, vocab[id]);
91 |     //         os << "\n";
92 |     //     }
93 |     //     os.close();
94 |     // };
95 | };
96 | }  // namespace tongrams
97 | 


--------------------------------------------------------------------------------
/include/counting/counting.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "configuration.hpp"
  4 | #include "tmp.hpp"
  5 | #include "constants.hpp"
  6 | #include "statistics.hpp"
  7 | #include "util.hpp"
  8 | #include "util_types.hpp"
  9 | 
 10 | #include "../external/tongrams/include/utils/util.hpp"
 11 | 
 12 | #include "counting_common.hpp"
 13 | #include "counting_writer.hpp"
 14 | #include "counting_reader.hpp"
 15 | 
 16 | namespace tongrams {
 17 | 
 18 | template <typename BlockWriter, typename Comparator>
 19 | struct counting {
 20 |     counting(configuration const& config, tmp::data& tmp_data, tmp::statistics&,
 21 |              statistics&)
 22 |         : m_config(config)
 23 |         , m_CPU_time(0.0)
 24 |         , m_I_time(0.0)
 25 |         , m_writer(config, tmp_data, constants::file_extension::counts)
 26 |         , m_reader(config, tmp_data, m_writer) {
 27 |         tmp_data.vocab_builder.push_empty();
 28 |         tmp_data.word_ids[hash_utils::hash_empty_token] =
 29 |             constants::empty_token_word_id;
 30 |     }
 31 | 
 32 |     void run() {
 33 |         bool file_begin = true;
 34 |         bool file_end = false;
 35 |         static constexpr uint64_t mm_region_size = 1 * essentials::GiB;
 36 |         uint64_t blocks = util::ceil_div(m_config.text_size, mm_region_size);
 37 |         uint64_t page_size = sysconf(_SC_PAGESIZE);
 38 |         assert(mm_region_size >= page_size and mm_region_size % page_size == 0);
 39 | 
 40 |         m_writer.start();
 41 | 
 42 |         for (uint64_t block = 0,
 43 |                       page_id = 0;  // disk page containing the beginning of
 44 |                                     // current file block
 45 |              block != blocks; ++block) {
 46 |             uint64_t chunk_size = mm_region_size;
 47 |             uint64_t offset = page_id * page_size;
 48 |             if (offset + chunk_size > m_config.text_size) {
 49 |                 file_end = true;
 50 |                 chunk_size = m_config.text_size - offset;
 51 |             }
 52 | 
 53 |             m_data = util::open_file_partition(m_file, m_config.text_filename,
 54 |                                                chunk_size, offset);
 55 |             uint64_t begin = 0;
 56 |             uint64_t end = m_file.size();
 57 |             assert(end > 0);
 58 | 
 59 |             util::optimize_sequential_access(m_data, end);
 60 | 
 61 |             if (!file_begin) align_forward(begin);
 62 |             std::string boundary = m_boundary;
 63 |             m_boundary.clear();
 64 |             if (!is_aligned(end - 1)) align_backward(begin, --end);
 65 | 
 66 |             uint64_t n = end;
 67 |             assert(n != 0 and n <= mm_region_size);
 68 |             m_reader.init(m_data, boundary, begin, end, file_begin, file_end);
 69 |             m_reader.run();
 70 |             file_begin = false;
 71 | 
 72 |             uint64_t num_pages = util::ceil_div(n, page_size);
 73 |             assert(num_pages > 0);
 74 |             page_id += num_pages;
 75 |             m_file.close();
 76 |         }
 77 | 
 78 |         m_CPU_time += m_reader.CPU_time();
 79 |         m_I_time += m_reader.I_time();
 80 |         m_writer.terminate();
 81 |         m_reader.print_stats();
 82 |     }
 83 | 
 84 |     void print_stats() const {
 85 |         std::cout << "\"CPU\":" << m_CPU_time << ", ";
 86 |         std::cout << "\"I\":" << m_I_time << ", ";
 87 |         std::cout << "\"O\":" << m_writer.O_time() << ", ";
 88 |     }
 89 | 
 90 | private:
 91 |     bool is_aligned(uint64_t pos) const {
 92 |         return m_data[pos] == ' ' or m_data[pos] == '\n';
 93 |     }
 94 | 
 95 |     void align_forward(uint64_t& begin) {
 96 |         for (;; ++begin) {
 97 |             auto c = m_data[begin];
 98 |             if (c == ' ' or c == '\n') {
 99 |                 ++begin;  // first char after a whitespace
100 |                 break;
101 |             }
102 |             m_boundary.push_back(c);
103 |         }
104 |     }
105 | 
106 |     void align_backward(uint64_t begin, uint64_t& end) {
107 |         for (; begin != end; --end) {
108 |             auto c = m_data[end];
109 |             if (c == ' ' or c == '\n') {
110 |                 ++end;  // one-past
111 |                 std::reverse(m_boundary.begin(), m_boundary.end());
112 |                 break;
113 |             }
114 |             m_boundary.push_back(c);
115 |         }
116 |     }
117 | 
118 |     configuration const& m_config;
119 |     boost::iostreams::mapped_file_source m_file;
120 |     uint8_t const* m_data;
121 | 
122 |     std::string m_boundary;
123 | 
124 |     double m_CPU_time;
125 |     double m_I_time;
126 | 
127 |     typedef counting_writer<BlockWriter, Comparator> counting_writer_type;
128 |     typedef counting_reader<counting_writer_type> counting_reader_type;
129 |     counting_writer_type m_writer;
130 |     counting_reader_type m_reader;
131 | };
132 | 
133 | }  // namespace tongrams
134 | 


--------------------------------------------------------------------------------
/include/counting/counting_common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "ngrams_block.hpp"
 4 | #include "ngrams_hash_block.hpp"
 5 | #include "hash_utils.hpp"
 6 | 
 7 | namespace tongrams {
 8 | namespace counting_step {
 9 | 
10 | typedef ngrams_hash_block<> block_type;
11 | 
12 | }  // namespace counting_step
13 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/counting/counting_reader.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "counting_common.hpp"
  4 | #include "configuration.hpp"
  5 | #include "tmp.hpp"
  6 | #include "sliding_window.hpp"
  7 | 
  8 | namespace tongrams {
  9 | 
 10 | template <typename Writer>
 11 | struct counting_reader {
 12 |     counting_reader(configuration const& config, tmp::data& tmp_data,
 13 |                     Writer& thread)
 14 |         : m_tmp_data(tmp_data)
 15 |         , m_window(config.max_order)
 16 |         , m_max_order(config.max_order)
 17 |         , m_writer(thread)
 18 |         , m_next_word_id(constants::empty_token_word_id + 1)
 19 |         , m_CPU_time(0.0) {
 20 |         m_window.fill(constants::empty_token_word_id);
 21 |         static constexpr double weight = 0.9;
 22 |         size_t bytes_per_ngram = sizeof_ngram(config.max_order) +
 23 |                                  sizeof(count_type) +  // payload
 24 |                                  sizeof(word_id*) +    // pointer
 25 |                                  sizeof(ngram_id);     // hashset
 26 |         m_num_ngrams_per_block = ((weight * config.RAM) /
 27 |                                   (2 * hash_utils::probing_space_multiplier)) /
 28 |                                  bytes_per_ngram;
 29 |     }
 30 | 
 31 |     void init(uint8_t const* data, std::string const& boundary,
 32 |               uint64_t partition_begin, uint64_t partition_end, bool file_begin,
 33 |               bool file_end) {
 34 |         auto s = clock_type::now();
 35 |         m_partition_end = partition_end;
 36 |         m_file_begin = file_begin;
 37 |         m_file_end = file_end;
 38 |         assert(partition_begin <= partition_end);
 39 |         m_counts.init(m_max_order, m_num_ngrams_per_block);
 40 |         if (file_begin) count();  // count empty window
 41 |         m_window.init({data + partition_begin, data + m_partition_end},
 42 |                       partition_begin);
 43 | 
 44 |         if (!boundary.empty()) {
 45 |             m_window.shift();
 46 |             stl_string_adaptor adaptor;
 47 |             byte_range range = adaptor(boundary);
 48 |             uint64_t hash = hash_utils::byte_range_hash64(range);
 49 |             auto id = find_or_insert(range, hash);
 50 |             m_window.eat(id);
 51 |             count();
 52 |         }
 53 | 
 54 |         auto e = clock_type::now();
 55 |         std::chrono::duration<double> diff = e - s;
 56 |         m_CPU_time += diff.count();
 57 |     }
 58 | 
 59 |     void print_stats() const {
 60 |         std::cerr << "\treader thread stats:\n";
 61 |         std::cerr << "\tCPU time: " << m_CPU_time << " [sec]\n";
 62 |         std::cerr << "\tI time: " << m_window.time() << " [sec]" << std::endl;
 63 |     }
 64 | 
 65 |     void run() {
 66 |         auto s = clock_type::now();
 67 |         while (advance()) count();
 68 | 
 69 |         // NOTE: if we are at the end of file,
 70 |         // add [m_max_order - 1] ngrams padded with empty tokens,
 71 |         // i.e., for max_order = 5 and m text words:
 72 |         // w_{m-3} w_{m-2} w_{m-1} w_m </>
 73 |         // w_{m-2} w_{m-1} w_m </> </>
 74 |         // w_{m-1} w_m </> </> </>
 75 |         // w_m </> </> </> </>
 76 |         if (m_file_end) {
 77 |             assert(m_max_order > 0);
 78 |             for (uint8_t i = 0; i != m_max_order - 1; ++i) {
 79 |                 m_window.shift();
 80 |                 m_window.eat(constants::empty_token_word_id);
 81 |                 count();
 82 |             }
 83 |         }
 84 | 
 85 |         push_block();
 86 | 
 87 |         auto e = clock_type::now();
 88 |         std::chrono::duration<double> diff = e - s;
 89 |         m_CPU_time += diff.count();
 90 |         m_CPU_time -= I_time();
 91 |     }
 92 | 
 93 |     double CPU_time() const {
 94 |         return m_CPU_time;
 95 |     }
 96 | 
 97 |     double I_time() const {
 98 |         return m_window.time();
 99 |     }
100 | 
101 | private:
102 |     tmp::data& m_tmp_data;
103 |     sliding_window m_window;
104 |     uint8_t m_max_order;
105 |     Writer& m_writer;
106 |     word_id m_next_word_id;
107 |     double m_CPU_time;
108 | 
109 |     uint64_t m_partition_end;
110 |     uint64_t m_num_ngrams_per_block;
111 |     bool m_file_begin, m_file_end;
112 |     counting_step::block_type m_counts;
113 | 
114 |     word_id find_or_insert(byte_range range, uint64_t hash) {
115 |         word_id id = m_next_word_id;
116 |         auto it = m_tmp_data.word_ids.find(hash);
117 |         if (it == m_tmp_data.word_ids.end()) {
118 |             m_tmp_data.word_ids[hash] = m_next_word_id;
119 |             m_tmp_data.vocab_builder.push_back(range);
120 |             ++m_next_word_id;
121 |         } else {
122 |             id = (*it).second;
123 |         }
124 |         assert(id < m_next_word_id);
125 |         return id;
126 |     }
127 | 
128 |     bool advance() {
129 |         if (!m_window.advance()) return false;
130 |         auto const& word = m_window.last();
131 |         assert(word.hash != constants::invalid_hash);
132 |         auto id = find_or_insert(word.range, word.hash);
133 |         assert(id < m_next_word_id);
134 |         m_window.eat(id);
135 |         return true;
136 |     }
137 | 
138 |     void count() {
139 |         uint64_t hash =
140 |             hash_utils::hash64(m_window.data(), sizeof_ngram(m_max_order));
141 |         auto [found, at] = m_counts.find_or_insert(m_window.get(), hash);
142 |         if (found) {
143 |             auto count = ++m_counts[at];
144 |             auto& max_count = m_counts.statistics().max_count;
145 |             if (count > max_count) max_count = count;
146 |         }
147 |         if (m_counts.size() == m_num_ngrams_per_block) push_block();
148 |     }
149 | 
150 |     void push_block() {
151 |         while (m_writer.size() > 0)
152 |             ;  // wait for flush
153 |         counting_step::block_type tmp;
154 |         tmp.init(m_max_order, m_num_ngrams_per_block);
155 |         tmp.swap(m_counts);
156 |         tmp.release_hash_index();
157 |         m_writer.push(tmp);
158 |     }
159 | };
160 | 
161 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/counting/counting_writer.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "counting_common.hpp"
  4 | #include "configuration.hpp"
  5 | #include "tmp.hpp"
  6 | #include "comparators.hpp"
  7 | 
  8 | namespace tongrams {
  9 | 
 10 | template <typename BlockWriter, typename Comparator>
 11 | struct counting_writer {
 12 |     counting_writer(configuration const& config, tmp::data& tmp_data,
 13 |                     std::string const& file_extension)
 14 |         : m_tmp_data(tmp_data)
 15 |         , m_filename_gen(config.tmp_dirname, "", file_extension)
 16 |         , m_O_time(0.0)
 17 |         , m_CPU_time(0.0)
 18 |         , m_num_flushes(0)
 19 |         , m_writer(config.max_order)
 20 |         , m_comparator(config.max_order) {
 21 |         m_buffer.open();
 22 |     }
 23 | 
 24 |     ~counting_writer() {
 25 |         if (!m_buffer.empty()) {
 26 |             std::cerr << "Error: some data still need to be written"
 27 |                       << std::endl;
 28 |             std::terminate();
 29 |         }
 30 |     }
 31 | 
 32 |     void start() {
 33 |         m_thread = std::thread(&counting_writer::run, this);
 34 |     }
 35 | 
 36 |     void terminate() {
 37 |         m_buffer.lock();
 38 |         m_buffer.close();
 39 |         m_buffer.unlock();
 40 |         if (m_thread.joinable()) m_thread.join();
 41 |         assert(!m_buffer.active());
 42 |         while (!m_buffer.empty()) flush();
 43 |         std::cerr << "\tcounting_writer thread stats:\n";
 44 |         std::cerr << "\tflushed blocks: " << m_num_flushes << "\n";
 45 |         std::cerr << "\tO time: " << m_O_time << "\n";
 46 |         std::cerr << "\tCPU time: " << m_CPU_time << "\n";
 47 |     }
 48 | 
 49 |     void push(counting_step::block_type& block) {
 50 |         m_buffer.lock();
 51 |         m_buffer.push(block);
 52 |         m_buffer.unlock();
 53 |     }
 54 | 
 55 |     size_t size() {
 56 |         m_buffer.lock();
 57 |         size_t s = m_buffer.size();
 58 |         m_buffer.unlock();
 59 |         return s;
 60 |     }
 61 | 
 62 |     double CPU_time() const {
 63 |         return m_CPU_time;
 64 |     }
 65 | 
 66 |     double O_time() const {
 67 |         return m_O_time;
 68 |     }
 69 | 
 70 | private:
 71 |     tmp::data& m_tmp_data;
 72 |     semi_sync_queue<counting_step::block_type> m_buffer;
 73 |     std::thread m_thread;
 74 |     filename_generator m_filename_gen;
 75 |     double m_O_time;
 76 |     double m_CPU_time;
 77 |     uint64_t m_num_flushes;
 78 |     BlockWriter m_writer;
 79 |     Comparator m_comparator;
 80 | 
 81 |     void run() {
 82 |         while (m_buffer.active()) flush();
 83 |     }
 84 | 
 85 |     void flush() {
 86 |         m_buffer.lock();
 87 |         if (m_buffer.empty()) {
 88 |             m_buffer.unlock();
 89 |             return;
 90 |         }
 91 |         auto& block = m_buffer.pick();
 92 |         m_buffer.unlock();
 93 | 
 94 |         block.statistics().max_word_id = m_tmp_data.word_ids.size();
 95 | 
 96 |         auto start = clock_type::now();
 97 |         block.sort(m_comparator);
 98 |         auto end = clock_type::now();
 99 |         std::chrono::duration<double> elapsed = end - start;
100 |         m_CPU_time += elapsed.count();
101 |         std::cerr << "sorting took " << elapsed.count() << " [sec]"
102 |                   << std::endl;
103 | 
104 |         start = clock_type::now();
105 |         std::string filename = m_filename_gen();
106 |         std::ofstream os(filename.c_str(), std::ofstream::binary |
107 |                                                std::ofstream::ate |
108 |                                                std::ofstream::app);
109 | 
110 |         m_writer.write_block(os, block.begin(), block.end(), block.size(),
111 |                              block.statistics());
112 | 
113 |         os.close();
114 |         end = clock_type::now();
115 |         elapsed = end - start;
116 |         m_O_time += elapsed.count();
117 | 
118 |         block.release();
119 | 
120 |         m_buffer.lock();
121 |         m_buffer.pop();
122 |         m_buffer.unlock();
123 |         ++m_num_flushes;
124 |         m_filename_gen.next();
125 |     }
126 | };
127 | 
128 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/counting/hash_utils.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "constants.hpp"
  4 | 
  5 | namespace tongrams {
  6 | namespace hash_utils {
  7 | 
  8 | /*
  9 |     This code is an adaptation from
 10 |     https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
 11 |         by Austin Appleby
 12 | */
 13 | uint64_t murmur_hash64(const void* key, size_t len, uint64_t seed) {
 14 |     const uint64_t m = 0xc6a4a7935bd1e995ULL;
 15 |     const int r = 47;
 16 | 
 17 |     uint64_t h = seed ^ (len * m);
 18 | 
 19 | #if defined(__arm) || defined(__arm__)
 20 |     const size_t ksize = sizeof(uint64_t);
 21 |     const unsigned char* data = (const unsigned char*)key;
 22 |     const unsigned char* end = data + (std::size_t)(len / 8) * ksize;
 23 | #else
 24 |     const uint64_t* data = (const uint64_t*)key;
 25 |     const uint64_t* end = data + (len / 8);
 26 | #endif
 27 | 
 28 |     while (data != end) {
 29 | #if defined(__arm) || defined(__arm__)
 30 |         uint64_t k;
 31 |         memcpy(&k, data, ksize);
 32 |         data += ksize;
 33 | #else
 34 |         uint64_t k = *data++;
 35 | #endif
 36 | 
 37 |         k *= m;
 38 |         k ^= k >> r;
 39 |         k *= m;
 40 | 
 41 |         h ^= k;
 42 |         h *= m;
 43 |     }
 44 | 
 45 |     const unsigned char* data2 = (const unsigned char*)data;
 46 | 
 47 |     switch (len & 7) {
 48 |         // fall through
 49 |         case 7:
 50 |             h ^= uint64_t(data2[6]) << 48;
 51 |         // fall through
 52 |         case 6:
 53 |             h ^= uint64_t(data2[5]) << 40;
 54 |         // fall through
 55 |         case 5:
 56 |             h ^= uint64_t(data2[4]) << 32;
 57 |         // fall through
 58 |         case 4:
 59 |             h ^= uint64_t(data2[3]) << 24;
 60 |         // fall through
 61 |         case 3:
 62 |             h ^= uint64_t(data2[2]) << 16;
 63 |         // fall through
 64 |         case 2:
 65 |             h ^= uint64_t(data2[1]) << 8;
 66 |         // fall through
 67 |         case 1:
 68 |             h ^= uint64_t(data2[0]);
 69 |             h *= m;
 70 |     };
 71 | 
 72 |     h ^= h >> r;
 73 |     h *= m;
 74 |     h ^= h >> r;
 75 | 
 76 |     return h;
 77 | }
 78 | 
 79 | static inline uint64_t byte_range_hash64(byte_range const& br) {
 80 |     return murmur_hash64(br.first, br.second - br.first, 0);
 81 | }
 82 | 
 83 | static inline uint64_t hash64(const void* data, size_t bytes) {
 84 |     return murmur_hash64(data, bytes, 0);
 85 | }
 86 | 
 87 | static const uint64_t hash_empty_token =
 88 |     byte_range_hash64(constants::empty_token_byte_range);
 89 | static constexpr float probing_space_multiplier = 1.5;
 90 | 
 91 | struct linear_prober {
 92 |     linear_prober(iterator position, uint64_t universe)
 93 |         : m_position(position % universe), m_universe(universe) {}
 94 | 
 95 |     inline iterator operator*() {
 96 |         if (m_position == m_universe) m_position = 0;  // fall back
 97 |         return m_position;
 98 |     }
 99 | 
100 |     inline void operator++() {
101 |         ++m_position;
102 |     }
103 | 
104 | private:
105 |     iterator m_position;
106 |     uint64_t m_universe;
107 | };
108 | 
109 | }  // namespace hash_utils
110 | }  // namespace tongrams
111 | 


--------------------------------------------------------------------------------
/include/counting/ngrams_hash_block.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #ifndef __APPLE__
  4 | #include <parallel/algorithm>
  5 | #endif
  6 | 
  7 | #include "util.hpp"
  8 | #include "hash_utils.hpp"
  9 | #include "ngrams_block.hpp"
 10 | #include "parallel_radix_sort.hpp"
 11 | 
 12 | namespace tongrams {
 13 | 
 14 | template <typename Prober = hash_utils::linear_prober>
 15 | struct ngrams_hash_block {
 16 |     static constexpr ngram_id invalid_ngram_id = ngram_id(-1);
 17 | 
 18 |     ngrams_hash_block() : m_size(0), m_num_bytes(0) {
 19 |         resize(0);
 20 |     }
 21 | 
 22 |     void init(uint8_t ngram_order, uint64_t size) {
 23 |         m_num_bytes = ngram_order * sizeof(word_id);
 24 |         m_block.init(ngram_order);
 25 |         resize(size);
 26 |     }
 27 | 
 28 |     void resize(uint64_t size) {
 29 |         uint64_t buckets = size * hash_utils::probing_space_multiplier;
 30 |         m_data.resize(buckets, invalid_ngram_id);
 31 |         m_block.resize_memory(size);
 32 |         m_block.resize_index(size);
 33 |     }
 34 | 
 35 |     std::pair<bool, ngram_id> find_or_insert(ngram_type const& key,
 36 |                                              iterator hint) {
 37 |         assert(buckets());
 38 |         Prober prober(hint, buckets());
 39 |         iterator start = *prober;
 40 |         iterator it = start;
 41 |         ngram_id at = invalid_ngram_id;
 42 | 
 43 |         while (m_data[it] != invalid_ngram_id) {
 44 |             assert(it < buckets());
 45 |             if (equal_to(m_block[m_data[it]].data, key.data(), m_num_bytes)) {
 46 |                 at = m_data[it];
 47 |                 return {true, at};
 48 |             }
 49 |             ++prober;
 50 |             it = *prober;
 51 |             if (it == start) {  // back to starting point:
 52 |                                 // thus all positions have been checked
 53 |                 std::cerr << "ERROR: all positions have been checked"
 54 |                           << std::endl;
 55 |                 at = invalid_ngram_id;
 56 |                 return {false, at};
 57 |             }
 58 |         }
 59 | 
 60 |         // insert
 61 |         m_data[it] = m_size++;
 62 |         at = m_data[it];
 63 |         m_block.set(at, key.begin(), key.end(), 1);
 64 |         return {false, at};
 65 |     }
 66 | 
 67 |     template <typename Comparator>
 68 |     void sort(Comparator const& comparator) {
 69 |         std::cerr << "block size = " << m_size << std::endl;
 70 |         auto begin = m_block.begin();
 71 |         auto end = begin + size();
 72 | 
 73 | #ifdef LSD_RADIX_SORT
 74 |         (void)comparator;
 75 |         uint32_t max_digit = statistics().max_word_id;
 76 |         uint32_t num_digits = m_block.order();
 77 |         // std::cerr << "max_digit = " << max_digit
 78 |         //           << "; num_digits = " << num_digits << std::endl;
 79 |         parallel_lsd_radix_sorter<typename ngrams_block::iterator> sorter(
 80 |             max_digit, num_digits);
 81 |         sorter.sort(begin, end);
 82 | #else
 83 | 
 84 | #ifdef __APPLE__
 85 |         std::sort
 86 | #else
 87 |         __gnu_parallel::sort
 88 | #endif
 89 |             (begin, end, [&](auto l, auto r) { return comparator(l, r); });
 90 | #endif
 91 | 
 92 |         assert(m_block.template is_sorted<Comparator>(begin, end));
 93 |     }
 94 | 
 95 |     inline count_type& operator[](ngram_id at) {
 96 |         assert(at < size());
 97 |         return m_block.value(at);
 98 |     }
 99 | 
100 |     inline uint64_t size() const {
101 |         return m_size;
102 |     }
103 | 
104 |     inline bool empty() const {
105 |         return size() == 0;
106 |     }
107 | 
108 |     inline uint64_t buckets() const {
109 |         return m_data.size();
110 |     }
111 | 
112 |     double load_factor() const {
113 |         return static_cast<double>(size()) / buckets();
114 |     }
115 | 
116 |     auto begin() {
117 |         return enumerator(m_block);
118 |     }
119 | 
120 |     auto end() {
121 |         return enumerator(m_block, size());
122 |     }
123 | 
124 |     struct enumerator {
125 |         enumerator(ngrams_block& block, size_t pos = 0)
126 |             : m_pos(pos), m_block(block) {}
127 | 
128 |         bool operator==(enumerator const& rhs) {
129 |             return m_pos == rhs.m_pos;
130 |         }
131 | 
132 |         bool operator!=(enumerator const& rhs) {
133 |             return not(*this == rhs);
134 |         }
135 | 
136 |         void operator++() {
137 |             ++m_pos;
138 |         }
139 | 
140 |         auto operator*() {
141 |             return m_block[m_pos];
142 |         }
143 | 
144 |     private:
145 |         size_t m_pos;
146 |         ngrams_block& m_block;
147 |     };
148 | 
149 |     void swap(ngrams_hash_block<Prober>& other) {
150 |         std::swap(m_size, other.m_size);
151 |         std::swap(m_num_bytes, other.m_num_bytes);
152 |         m_data.swap(other.m_data);
153 |         m_block.swap(other.m_block);
154 |     }
155 | 
156 |     void release_hash_index() {
157 |         std::vector<ngram_id>().swap(m_data);
158 |     }
159 | 
160 |     void release() {
161 |         ngrams_hash_block().swap(*this);
162 |     }
163 | 
164 |     auto& statistics() {
165 |         return m_block.stats;
166 |     }
167 | 
168 | private:
169 |     uint64_t m_size;
170 |     size_t m_num_bytes;
171 |     std::vector<ngram_id> m_data;
172 |     ngrams_block m_block;
173 | };
174 | 
175 | }  // namespace tongrams
176 | 


--------------------------------------------------------------------------------
/include/counting/parallel_radix_sort.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | namespace tongrams {
  4 | 
  5 | template <typename ForwardIterator>
  6 | struct parallel_lsd_radix_sorter {
  7 |     parallel_lsd_radix_sorter(
  8 |         uint32_t max_digit, uint32_t num_digits,
  9 |         uint32_t num_threads = std::thread::hardware_concurrency())
 10 |         : m_max_digit(max_digit)
 11 |         , m_num_digits(num_digits)
 12 |         , m_num_threads(num_threads) {}
 13 | 
 14 |     void sort(ForwardIterator begin, ForwardIterator end) const {
 15 |         uint32_t first_column_index = m_num_digits;
 16 |         for (uint32_t column_index = first_column_index;
 17 |              column_index - first_column_index < m_num_digits; ++column_index) {
 18 |             uint32_t k = column_index - 1;
 19 |             if (column_index > m_num_digits) {
 20 |                 k -= first_column_index;
 21 |             }
 22 |             parallel_counting_sort(begin, end, k);
 23 |         }
 24 |     }
 25 | 
 26 | private:
 27 |     uint32_t m_max_digit;
 28 |     uint32_t m_num_digits;
 29 |     uint32_t m_num_threads;
 30 | 
 31 |     void parallel_counting_sort(ForwardIterator begin, ForwardIterator end,
 32 |                                 uint32_t column_index) const {
 33 |         std::vector<std::vector<uint32_t>> counts(
 34 |             m_num_threads + 1, std::vector<uint32_t>(m_max_digit, 0));
 35 |         size_t n = end - begin;
 36 |         uint64_t batch_size = n / m_num_threads;
 37 |         if (!batch_size) throw std::runtime_error("too many threads");
 38 | 
 39 |         parallel_executor p(m_num_threads);
 40 |         task_region(*(p.executor), [&](task_region_handle& trh) {
 41 |             for (uint64_t i = 0; i < m_num_threads; ++i) {
 42 |                 trh.run([&, i] {
 43 |                     auto b = begin + i * batch_size;
 44 |                     auto e = b + batch_size;
 45 |                     if (i == m_num_threads - 1) e = end;
 46 |                     std::for_each(b, e, [&](auto const& x) {
 47 |                         uint32_t id = x[column_index];
 48 |                         assert(id < m_max_digit);
 49 |                         ++counts[i + 1][id];
 50 |                     });
 51 |                 });
 52 |             }
 53 |         });
 54 | 
 55 |         // prefix sum
 56 |         for (uint32_t j = 0, sum = 0; j < m_max_digit; ++j) {
 57 |             for (uint32_t i = 0; i < m_num_threads + 1; ++i) {
 58 |                 uint32_t occ = counts[i][j];
 59 |                 counts[i][j] = sum;
 60 |                 sum += occ;
 61 |             }
 62 |         }
 63 | 
 64 |         // for (auto const& positions: counts) {
 65 |         //     for (auto pos: positions) {
 66 |         //         std::cerr << pos << " ";
 67 |         //     }
 68 |         //     std::cerr << std::endl;
 69 |         // }
 70 | 
 71 |         std::vector<typename ForwardIterator::value_type> tmp_index(n);
 72 |         task_region(*(p.executor), [&](task_region_handle& trh) {
 73 |             for (uint64_t i = 0; i < m_num_threads; ++i) {
 74 |                 trh.run([&, i] {
 75 |                     auto b = begin + i * batch_size;
 76 |                     auto e = b + batch_size;
 77 |                     if (i == m_num_threads - 1) e = end;
 78 |                     auto& partition_counts = counts[i + 1];
 79 |                     std::for_each(b, e, [&](auto const& x) {
 80 |                         uint32_t id = x[column_index];
 81 |                         assert(id < m_max_digit);
 82 |                         tmp_index[partition_counts[id]++] = x;
 83 |                     });
 84 |                 });
 85 |             }
 86 |         });
 87 | 
 88 |         task_region(*(p.executor), [&](task_region_handle& trh) {
 89 |             for (uint64_t i = 0; i < m_num_threads; ++i) {
 90 |                 trh.run([&, i] {
 91 |                     auto b = tmp_index.begin() + i * batch_size;
 92 |                     auto output = begin + i * batch_size;
 93 |                     auto e = b + batch_size;
 94 |                     if (i == m_num_threads - 1) e = tmp_index.end();
 95 |                     std::for_each(b, e, [&](auto const& x) {
 96 |                         *output = x;
 97 |                         ++output;
 98 |                     });
 99 |                 });
100 |             }
101 |         });
102 |     }
103 | };
104 | }  // namespace tongrams
105 | 


--------------------------------------------------------------------------------
/include/counting/sliding_window.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cstring>  // for std::memmove
  4 | 
  5 | #include "../external/tongrams/include/utils/iterators.hpp"
  6 | 
  7 | #include "util.hpp"
  8 | #include "hash_utils.hpp"
  9 | #include "constants.hpp"
 10 | 
 11 | namespace tongrams {
 12 | 
 13 | struct sliding_window {
 14 |     sliding_window(uint8_t capacity)
 15 |         : m_end(2), m_buff(capacity), m_time(0.0) {}
 16 | 
 17 |     void init(byte_range text, uint64_t pos = 2) {
 18 |         m_end = pos;
 19 |         m_iterator.init(text);
 20 |         m_time = 0.0;
 21 |     }
 22 | 
 23 |     void fill(word_id id) {
 24 |         m_buff.assign(m_buff.size(), id);
 25 |     }
 26 | 
 27 |     void print() const {
 28 |         for (auto x : m_buff) {
 29 |             std::cout << x << " ";
 30 |         }
 31 |         std::cout << std::endl;
 32 |     }
 33 | 
 34 |     struct word {
 35 |         void init(uint64_t h, byte_range br) {
 36 |             hash = h;
 37 |             range = br;
 38 |         }
 39 | 
 40 |         uint64_t hash;
 41 |         byte_range range;
 42 |     };
 43 | 
 44 |     inline void shift() {
 45 |         std::memmove(&m_buff[0], &m_buff[1],
 46 |                      sizeof_ngram(m_buff.size() - 1));  // shift left by one
 47 |     }
 48 | 
 49 |     bool advance() {
 50 |         if (!m_iterator.has_next()) return false;
 51 | 
 52 |         shift();
 53 |         uint64_t hash = hash_utils::hash_empty_token;
 54 |         byte_range range = constants::empty_token_byte_range;
 55 |         size_t range_len = 0;
 56 | 
 57 |         while (range_len == 0) {  // skip blank lines
 58 |             if (m_iterator.has_next()) {
 59 |                 auto start = clock_type::now();
 60 |                 range = m_iterator.next();
 61 |                 auto end = clock_type::now();
 62 |                 std::chrono::duration<double> elapsed = end - start;
 63 |                 m_time += elapsed.count();
 64 |                 range_len = range.second - range.first;
 65 |             } else {
 66 |                 m_end += 2;
 67 |                 m_last.init(hash, range);
 68 |                 return false;
 69 |             }
 70 |         }
 71 | 
 72 |         ++range_len;
 73 |         hash = hash_utils::byte_range_hash64(range);
 74 |         m_end += range_len;
 75 |         m_last.init(hash, range);
 76 | 
 77 |         return true;
 78 |     }
 79 | 
 80 |     void eat(word_id id) {
 81 |         m_buff.back() = id;
 82 |     }
 83 | 
 84 |     ngram_type const& get() {
 85 |         return m_buff;
 86 |     }
 87 | 
 88 |     word_id const* data() {
 89 |         return m_buff.data();
 90 |     }
 91 | 
 92 |     inline auto const& last() const {
 93 |         return m_last;
 94 |     }
 95 | 
 96 |     inline word_id front() const {
 97 |         return m_buff.back();
 98 |     }
 99 | 
100 |     inline word_id back() const {
101 |         return m_buff.front();
102 |     }
103 | 
104 |     double time() const {
105 |         return m_time;
106 |     }
107 | 
108 | private:
109 |     uint64_t m_end;  // beginning of next word
110 |     word m_last;
111 |     forward_byte_range_iterator m_iterator;
112 |     ngram_type m_buff;
113 |     double m_time;
114 | };
115 | 
116 | }  // namespace tongrams
117 | 


--------------------------------------------------------------------------------
/include/estimation.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "vocabulary.hpp"
  4 | #include "tmp.hpp"
  5 | #include "statistics.hpp"
  6 | #include "stream.hpp"
  7 | #include "counting/counting.hpp"
  8 | #include "adjusting/adjusting.hpp"
  9 | #include "last/last.hpp"
 10 | #include "last/write.hpp"
 11 | 
 12 | namespace tongrams {
 13 | 
 14 | struct estimation {
 15 |     estimation(configuration const& config)
 16 |         : m_config(config)
 17 |         , m_tmp_data()
 18 |         , m_tmp_stats(config.max_order)
 19 |         , m_stats(config.max_order) {
 20 |         m_timings.reserve(3);
 21 |         std::cout << "{";
 22 |         std::cout << "\"dataset\":"
 23 |                   << boost::filesystem::path(config.text_filename).stem()
 24 |                   << ", ";
 25 |         std::cout << "\"order\":" << config.max_order << ", ";
 26 |         std::cout << "\"RAM\":" << config.RAM << ", ";
 27 |         std::cout << "\"threads\":" << config.num_threads;
 28 |     }
 29 | 
 30 |     ~estimation() {
 31 |         std::cout << "}" << std::endl;
 32 |     }
 33 | 
 34 |     void run() {
 35 |         if (m_config.compress_blocks) {
 36 |             typedef fc::writer<context_order_comparator_type> block_writer_type;
 37 |             run<counting<block_writer_type, context_order_comparator_type>>(
 38 |                 "counting");
 39 |         } else {
 40 |             run<counting<stream::writer, context_order_comparator_type>>(
 41 |                 "counting");
 42 |         }
 43 | 
 44 |         m_stats.num_ngrams(1) = m_tmp_data.word_ids.size();
 45 |         m_tmp_data.word_ids.clear();
 46 |         auto handle = util::async_call(write_vocab);
 47 | 
 48 |         if (m_config.compress_blocks) {
 49 |             run<adjusting<stream::compressed_stream_generator>>("adjusting");
 50 |         } else {
 51 |             run<adjusting<stream::uncompressed_stream_generator>>("adjusting");
 52 |         }
 53 | 
 54 |         util::wait(handle);
 55 | 
 56 |         run<last>("last");
 57 | 
 58 |         // util::clean_temporaries(m_config.tmp_dirname);
 59 |     }
 60 | 
 61 |     void print_stats() {
 62 |         std::cerr
 63 |             << "==== STATISTICS =======================================\n";
 64 |         std::cerr << "total num. of words = " << m_stats.total_words() << "\n";
 65 |         std::cerr << "total num. of grams = " << m_stats.total_grams() << "\n";
 66 |         std::cerr << "probability of <unk> word = " << m_stats.unk_prob()
 67 |                   << "\n";
 68 |         m_stats.print();
 69 |         int step = 1;
 70 |         for (auto t : m_timings) {
 71 |             std::cerr << "step-" << step << ": " << t << " [sec]\n";
 72 |             ++step;
 73 |         }
 74 |         std::cerr << "======================================================="
 75 |                   << std::endl;
 76 |     }
 77 | 
 78 | private:
 79 |     configuration const& m_config;
 80 |     tmp::data m_tmp_data;
 81 |     tmp::statistics m_tmp_stats;
 82 |     statistics m_stats;
 83 |     std::vector<double> m_timings;
 84 | 
 85 |     template <typename Step>
 86 |     void run(std::string const& name) {
 87 |         std::cout << ", ";
 88 |         std::cout << "\"" + name + "\": {";
 89 |         auto start = clock_type::now();
 90 |         Step step(m_config, m_tmp_data, m_tmp_stats, m_stats);
 91 |         step.run();
 92 |         auto end = clock_type::now();
 93 |         std::chrono::duration<double> elapsed = end - start;
 94 |         double total_time = elapsed.count();
 95 |         m_timings.push_back(total_time);
 96 |         step.print_stats();
 97 |         std::cout << "\"total\":" << total_time;
 98 |         std::cout << "}";
 99 |     }
100 | 
101 |     std::function<void(void)> write_vocab = [&]() {
102 |         std::ofstream os(m_config.vocab_tmp_subdirname +
103 |                          m_config.vocab_filename);
104 |         size_t vocab_size = m_stats.num_ngrams(1);
105 |         vocabulary vocab;
106 |         m_tmp_data.vocab_builder.build(vocab);
107 |         for (size_t id = 0; id != vocab_size; ++id) {
108 |             util::write(os, vocab[id]);
109 |             os << "\n";
110 |         }
111 |         os.close();
112 |     };
113 | };
114 | }  // namespace tongrams
115 | 


--------------------------------------------------------------------------------
/include/front_coding.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "util_types.hpp"
  4 | #include "../external/tongrams/include/vectors/bit_vector.hpp"
  5 | 
  6 | #include <fstream>
  7 | 
  8 | namespace tongrams {
  9 | namespace fc {
 10 | 
 11 | const static std::streamsize BLOCK_BYTES = 64 * essentials::MiB;
 12 | const static std::streamsize BLOCK_BITS = BLOCK_BYTES * 8;
 13 | 
 14 | template <typename Comparator>
 15 | struct writer {
 16 |     writer(uint8_t N) : m_comparator(N) {}
 17 | 
 18 |     template <typename Iterator>
 19 |     void write_block(std::ofstream& os, Iterator begin, Iterator end, size_t n,
 20 |                      ngrams_block_statistics const& stats) {
 21 |         // in bytes
 22 |         uint8_t l = 1;
 23 |         uint8_t w = (util::ceil_log2(stats.max_word_id + 1) + 7) / 8;
 24 |         uint8_t v = (util::ceil_log2(stats.max_count + 1) + 7) / 8;
 25 |         essentials::save_pod(os, w);
 26 |         essentials::save_pod(os, v);
 27 |         // in bits
 28 |         l *= 8;
 29 |         w *= 8;
 30 |         v *= 8;
 31 |         uint8_t N = m_comparator.order();
 32 |         size_t max_record_size = l + N * w + v;  // in bits
 33 | 
 34 |         m_buffer.init();
 35 |         m_buffer.reserve(BLOCK_BITS);
 36 | 
 37 |         auto explicit_write = [&](ngram_pointer ptr) {
 38 |             for (int i = 0; i < N; ++i) {
 39 |                 m_buffer.append_bits(ptr[i], w);
 40 |             }
 41 |             m_buffer.append_bits(*(ptr.value(N)), v);
 42 |         };
 43 | 
 44 |         auto prev_ptr = *begin;
 45 |         explicit_write(prev_ptr);
 46 |         ++begin;
 47 | 
 48 |         uint64_t written = 0;
 49 |         uint64_t num_ngrams_in_block = 1;  // first is written explicitly
 50 |         for (uint64_t encoded = 0; begin != end;
 51 |              ++begin, ++encoded, ++num_ngrams_in_block) {
 52 |             int lcp = 0;
 53 |             auto ptr = *begin;
 54 | 
 55 |             if (BLOCK_BITS - m_buffer.size() < max_record_size) {
 56 |                 // flush current buffer, inserting padding
 57 |                 // always flush exactly BLOCK_BYTES bytes
 58 |                 flush_buffer(os, BLOCK_BYTES, num_ngrams_in_block);
 59 |                 m_buffer.init();
 60 |                 m_buffer.reserve(BLOCK_BITS);
 61 |                 written = encoded;
 62 |                 num_ngrams_in_block = 0;
 63 |                 explicit_write(ptr);
 64 |             } else {
 65 |                 lcp = m_comparator.lcp(ptr, prev_ptr);
 66 |                 assert(lcp < N);
 67 |                 m_buffer.append_bits(lcp, l);
 68 |                 if (lcp == 0) {
 69 |                     explicit_write(ptr);
 70 |                 } else {
 71 |                     int i = m_comparator.begin();
 72 |                     m_comparator.advance(i, lcp);
 73 |                     for (;; m_comparator.next(i)) {
 74 |                         m_buffer.append_bits(ptr[i], w);
 75 |                         if (i == m_comparator.end()) break;
 76 |                     }
 77 |                     m_buffer.append_bits(*(ptr.value(N)), v);
 78 |                 }
 79 |             }
 80 | 
 81 |             prev_ptr = ptr;
 82 |         }
 83 | 
 84 |         // save last block if needed
 85 |         if (written != n) {
 86 |             size_t bytes = (m_buffer.size() + 7) / 8;
 87 |             flush_buffer(os, bytes, num_ngrams_in_block);
 88 |         }
 89 |     }
 90 | 
 91 | private:
 92 |     Comparator m_comparator;
 93 |     bit_vector_builder m_buffer;  // NOTE: need a buffer beacuse we do not know
 94 |                                   // how many ngrams we can compress in a block
 95 | 
 96 |     void flush_buffer(std::ofstream& os, size_t bytes,
 97 |                       uint64_t num_ngrams_in_block) {
 98 |         assert(num_ngrams_in_block > 0);
 99 |         essentials::save_pod(os, num_ngrams_in_block);
100 |         os.write(reinterpret_cast<char const*>(m_buffer.data().data()), bytes);
101 |     }
102 | };
103 | 
104 | struct cache {
105 |     cache() : pos(nullptr), m_begin(nullptr), m_data(0, 0) {}
106 | 
107 |     cache(uint8_t N) : m_data(ngrams_block::record_size(N), 0) {
108 |         init();
109 |     }
110 | 
111 |     inline void init() {
112 |         m_begin = m_data.data();
113 |         pos = m_begin;
114 |     }
115 | 
116 |     inline uint8_t* begin() const {
117 |         return m_begin;
118 |     }
119 | 
120 |     void store(uint8_t const* src, size_t n) {
121 |         std::memcpy(pos, src, n);
122 |     }
123 | 
124 |     void swap(cache& other) {
125 |         std::swap(pos, other.pos);
126 |         std::swap(m_begin, other.m_begin);
127 |         m_data.swap(other.m_data);
128 |     }
129 | 
130 |     uint8_t* pos;
131 | 
132 | private:
133 |     uint8_t* m_begin;
134 |     std::vector<uint8_t> m_data;
135 | };
136 | 
137 | template <typename Comparator>
138 | struct ngrams_block {
139 |     ngrams_block() {}
140 | 
141 |     struct fc_iterator {
142 |         const static size_t W = sizeof(word_id);
143 | 
144 |         fc_iterator(uint8_t N, size_t pos, size_t size,
145 |                     ngrams_block<Comparator>& m_block)
146 |             : m_it(m_block.m_memory.data())
147 |             , m_comparator(N)
148 |             , m_back(N)
149 |             , m_pos(pos)
150 |             , m_size(size)
151 |             , m_w(m_block.m_w)
152 |             , m_v(m_block.m_v) {
153 |             if (pos != size) decode_explicit();
154 |         }
155 | 
156 |         void swap(fc_iterator& other) {
157 |             std::swap(m_it, other.m_it);
158 |             m_comparator.swap(other.m_comparator);
159 |             m_back.swap(other.m_back);
160 |             m_back.init();
161 |             std::swap(m_pos, other.m_pos);
162 |             std::swap(m_size, other.m_size);
163 |             std::swap(m_w, other.m_w);
164 |             std::swap(m_v, other.m_v);
165 |         }
166 | 
167 |         fc_iterator(fc_iterator&& rhs) {
168 |             *this = std::move(rhs);
169 |         }
170 | 
171 |         inline fc_iterator& operator=(fc_iterator&& rhs) {
172 |             if (this != &rhs) swap(rhs);
173 |             return *this;
174 |         };
175 | 
176 |         fc_iterator(fc_iterator const& rhs) {
177 |             *this = rhs;
178 |         }
179 | 
180 |         fc_iterator& operator=(fc_iterator const& rhs) {
181 |             if (this != &rhs) {
182 |                 m_it = rhs.m_it;
183 |                 m_comparator = rhs.m_comparator;
184 |                 m_back = rhs.m_back;
185 |                 m_back.init();
186 |                 m_pos = rhs.m_pos;
187 |                 m_size = rhs.m_size;
188 |                 m_w = rhs.m_w;
189 |                 m_v = rhs.m_v;
190 |             }
191 |             return *this;
192 |         };
193 | 
194 |         bool operator==(fc_iterator const& rhs) {
195 |             return m_pos == rhs.m_pos;
196 |         }
197 | 
198 |         bool operator!=(fc_iterator const& rhs) {
199 |             return not(*this == rhs);
200 |         }
201 | 
202 |         inline ngram_pointer operator*() const {
203 |             ngram_pointer ptr;
204 |             ptr.data = reinterpret_cast<word_id*>(m_back.begin());
205 |             return ptr;
206 |         }
207 | 
208 |         void operator++() {
209 |             if (m_pos == m_size - 1) {
210 |                 ++m_pos;  // one-past the end
211 |                 return;
212 |             }
213 |             decode();
214 |             ++m_pos;
215 |         }
216 | 
217 |     private:
218 |         uint8_t const* m_it;
219 |         Comparator m_comparator;
220 |         cache m_back;
221 |         size_t m_pos, m_size;
222 |         uint8_t m_w, m_v;
223 | 
224 |         void decode_value() {
225 |             m_back.store(m_it, m_v);
226 |             m_back.pos += sizeof(count_type);
227 |             m_it += m_v;
228 |         }
229 | 
230 |         void decode_explicit() {
231 |             uint8_t N = m_comparator.order();
232 |             assert(m_back.pos == m_back.begin());
233 |             for (uint8_t i = 0; i < N; ++i) {
234 |                 m_back.store(m_it, m_w);
235 |                 m_back.pos += W;
236 |                 m_it += m_w;
237 |             }
238 |             decode_value();
239 |         }
240 | 
241 |         void decode() {
242 |             m_back.init();
243 | 
244 |             uint8_t lcp = *m_it++;
245 |             if (lcp == 0) {
246 |                 decode_explicit();
247 |                 return;
248 |             }
249 | 
250 |             int i = m_comparator.begin();
251 |             m_comparator.advance(i, lcp);
252 |             m_back.pos = m_back.begin() + i * W;
253 |             uint8_t N = m_comparator.order();
254 |             assert(lcp < N);
255 | 
256 |             // store into [m_back] the other [N] - [lcp] word_ids
257 |             for (int j = 0; j < N - lcp; ++j) {
258 |                 m_back.store(m_it, m_w);
259 |                 m_comparator.next(i);
260 |                 m_back.pos = m_back.begin() + i * W;
261 |                 m_it += m_w;
262 |             }
263 | 
264 |             m_back.pos = m_back.begin() + N * W;
265 |             decode_value();
266 |         }
267 |     };
268 | 
269 |     typedef fc_iterator iterator;
270 | 
271 |     ngrams_block(uint8_t N, size_t size, uint8_t w, uint8_t v)
272 |         : m_size(size), m_N(N), m_w(w), m_v(v) {}
273 | 
274 |     void read(std::ifstream& is, size_t bytes) {
275 |         m_memory.resize(bytes);
276 |         is.read(reinterpret_cast<char*>(m_memory.data()), bytes);
277 |     }
278 | 
279 |     template <typename C>
280 |     bool is_sorted(iterator begin, iterator end) {
281 |         C comparator(m_N);
282 |         auto it = begin;
283 | 
284 |         size_t record_bytes = tongrams::ngrams_block::record_size(m_N);
285 |         cache prev(m_N);
286 |         prev.init();
287 |         prev.store(reinterpret_cast<uint8_t const*>((*it).data), record_bytes);
288 |         ngram_pointer prev_ptr;
289 |         prev_ptr.data = reinterpret_cast<word_id*>(prev.begin());
290 | 
291 |         ++it;
292 |         bool ret = true;
293 |         for (size_t i = 1; it != end; ++i, ++it) {
294 |             auto curr_ptr = *it;
295 |             int cmp = comparator.compare(prev_ptr, curr_ptr);
296 |             if (cmp == 0) {
297 |                 std::cerr << "Error at " << i << "/" << size() << ":\n";
298 |                 prev_ptr.print(m_N);
299 |                 curr_ptr.print(m_N);
300 |                 std::cerr << "Repeated ngrams" << std::endl;
301 |             }
302 | 
303 |             if (cmp > 0) {
304 |                 std::cerr << "Error at " << i << "/" << size() << ":\n";
305 |                 prev_ptr.print(m_N);
306 |                 curr_ptr.print(m_N);
307 |                 std::cerr << std::endl;
308 |                 ret = false;
309 |             }
310 |             prev.init();
311 |             prev.store(reinterpret_cast<uint8_t const*>(curr_ptr.data),
312 |                        record_bytes);
313 |             prev_ptr.data = reinterpret_cast<word_id*>(prev.begin());
314 |         }
315 |         return ret;
316 |     }
317 | 
318 |     void materialize_index() {}
319 | 
320 |     void swap(ngrams_block<Comparator>& other) {
321 |         m_memory.swap(other.m_memory);
322 |         std::swap(m_size, other.m_size);
323 |         std::swap(m_N, other.m_N);
324 |         std::swap(m_w, other.m_w);
325 |         std::swap(m_v, other.m_v);
326 |     }
327 | 
328 |     void release() {
329 |         fc::ngrams_block<Comparator>().swap(*this);
330 |     }
331 | 
332 |     friend struct fc_iterator;
333 | 
334 |     inline auto begin() {
335 |         return fc_iterator(m_N, 0, m_size, *this);
336 |     }
337 | 
338 |     inline auto end() {
339 |         return fc_iterator(m_N, m_size, m_size, *this);
340 |     }
341 | 
342 |     size_t size() const {
343 |         return m_size;
344 |     }
345 | 
346 | private:
347 |     std::vector<uint8_t> m_memory;
348 |     size_t m_size;
349 |     uint8_t m_N;
350 |     uint8_t m_w, m_v;
351 | };
352 | 
353 | }  // namespace fc
354 | }  // namespace tongrams
355 | 


--------------------------------------------------------------------------------
/include/last/estimation_builder.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "../external/tongrams/include/trie_prob_lm.hpp"
  4 | 
  5 | namespace tongrams {
  6 | 
  7 | template <typename Vocabulary, typename Mapper, typename Values, typename Ranks,
  8 |           typename Grams, typename Pointers>
  9 | struct trie_prob_lm<Vocabulary, Mapper, Values, Ranks, Grams,
 10 |                     Pointers>::estimation_builder {
 11 |     estimation_builder() {}
 12 | 
 13 |     estimation_builder(uint64_t order, configuration const& config,
 14 |                        statistics& stats)
 15 |         : m_order(order)
 16 |         , m_unk_prob(stats.unk_prob())
 17 |         , m_arrays(order)
 18 |         , m_next_positions(order, 0) {
 19 |         building_util::check_order(m_order);
 20 | 
 21 |         uint64_t vocab_size = stats.num_ngrams(1);
 22 |         size_t log_vocab_size = util::ceil_log2(vocab_size + 1);
 23 |         m_vocab_values.resize(vocab_size,
 24 |                               64);  // values are not quantized
 25 | 
 26 |         m_arrays.front().pointers.resize(
 27 |             vocab_size + 1, util::ceil_log2(stats.num_ngrams(2) + 1));
 28 |         m_probs.resize(order - 1);
 29 |         m_backoffs.resize(order - 2);
 30 | 
 31 |         std::vector<float> probs;
 32 |         std::vector<float> backoffs;
 33 |         size_t probs_levels = uint64_t(1) << config.probs_quantization_bits;
 34 |         size_t backoffs_levels = uint64_t(1)
 35 |                                  << config.backoffs_quantization_bits;
 36 |         double prob_quantum = 1.0 / probs_levels;
 37 |         double backoff_quantum = 1.0 / backoffs_levels;
 38 | 
 39 |         for (uint64_t ord = 2; ord <= m_order; ++ord) {
 40 |             uint64_t n = stats.num_ngrams(ord);
 41 |             auto& level = m_arrays[ord - 1];
 42 |             level.word_ids.resize(n, log_vocab_size);
 43 |             level.probs_backoffs_ranks.resize(
 44 |                 n,
 45 |                 config.probs_quantization_bits +
 46 |                     ((ord != m_order) ? config.backoffs_quantization_bits : 0));
 47 |             probs.resize(probs_levels, 0.0);
 48 |             for (uint64_t i = 1; i != probs_levels + 1; ++i) {
 49 |                 probs[i - 1] = std::log10(i * prob_quantum);
 50 |             }
 51 |             m_probs.add_sequence(ord - 1, config.probs_quantization_bits,
 52 |                                  probs);
 53 | 
 54 |             if (ord != m_order) {
 55 |                 backoffs.resize(backoffs_levels + 1, 0.0);
 56 |                 for (uint64_t i = 1; i != backoffs_levels + 1; ++i) {
 57 |                     backoffs[i] = std::log10(i * backoff_quantum);
 58 |                 }
 59 |                 m_backoffs.add_sequence(
 60 |                     ord - 1, config.backoffs_quantization_bits, backoffs);
 61 |                 uint64_t pointer_bits =
 62 |                     util::ceil_log2(stats.num_ngrams(ord + 1) + 1);
 63 |                 level.pointers.resize(n + 1, pointer_bits);
 64 |             }
 65 |         }
 66 |     }
 67 | 
 68 |     void set_next_word(uint64_t n, word_id id) {
 69 |         assert(n >= 2 and n <= m_order);
 70 |         m_arrays[n - 1].word_ids.push_back(id);
 71 |     }
 72 | 
 73 |     void set_next_pointer(uint64_t n, uint64_t pointer) {
 74 |         assert(n >= 1 and n < m_order);
 75 |         m_arrays[n - 1].pointers.push_back(pointer);
 76 |     }
 77 | 
 78 |     void set_next_backoff(uint64_t n, float backoff) {
 79 |         assert(n >= 2 and n < m_order);
 80 |         uint64_t backoff_rank =
 81 |             m_backoffs.rank(n - 2, std::log10(backoff), 1  // reserved
 82 |             );
 83 |         uint64_t& next_pos = m_next_positions[n - 1];
 84 |         uint64_t prob_backoff_rank =
 85 |             m_arrays[n - 1].probs_backoffs_ranks[next_pos];
 86 |         uint64_t probs_quantization_bits = m_probs.quantization_bits(n - 2);
 87 |         assert(probs_quantization_bits);
 88 |         prob_backoff_rank |= (backoff_rank << probs_quantization_bits);
 89 |         m_arrays[n - 1].probs_backoffs_ranks.push_back(prob_backoff_rank);
 90 |         ++next_pos;
 91 |     }
 92 | 
 93 |     void set_backoff(uint64_t n, uint64_t pos, float backoff) {
 94 |         assert(n >= 2 and n < m_order);
 95 |         uint64_t backoff_rank =
 96 |             m_backoffs.rank(n - 2, std::log10(backoff), 1  // reserved
 97 |             );
 98 |         uint64_t prob_backoff_rank = m_arrays[n - 1].probs_backoffs_ranks[pos];
 99 |         uint64_t probs_quantization_bits = m_probs.quantization_bits(n - 2);
100 |         assert(probs_quantization_bits);
101 |         prob_backoff_rank |= (backoff_rank << probs_quantization_bits);
102 |         m_arrays[n - 1].probs_backoffs_ranks.set(pos, prob_backoff_rank);
103 |     }
104 | 
105 |     void set_next_unigram_values(float prob, float backoff) {
106 |         uint64_t packed = 0;
107 |         bits::pack(packed, std::log10(prob), std::log10(backoff));
108 |         m_vocab_values.push_back(packed);
109 |     }
110 | 
111 |     void set_unigram_values(uint64_t pos, float prob, float backoff) {
112 |         uint64_t packed = 0;
113 |         bits::pack(packed, std::log10(prob), std::log10(backoff));
114 |         m_vocab_values.set(pos, packed);
115 |     }
116 | 
117 |     void set_word(uint64_t n, uint64_t pos, word_id id) {
118 |         assert(n >= 2 and n <= m_order);
119 |         m_arrays[n - 1].word_ids.set(pos, id);
120 |     }
121 | 
122 |     void set_pointer(uint64_t n, uint64_t pos, uint64_t pointer) {
123 |         assert(n >= 1 and n < m_order);
124 |         m_arrays[n - 1].pointers.set(pos, pointer);
125 |     }
126 | 
127 |     void set_prob(uint64_t n, uint64_t pos, float prob) {
128 |         assert(n >= 2 and n <= m_order);
129 |         uint64_t prob_backoff_rank = m_arrays[n - 1].probs_backoffs_ranks[pos];
130 |         uint64_t prob_rank = m_probs.rank(n - 2, std::log10(prob), 0);
131 |         prob_backoff_rank |= prob_rank;
132 |         m_arrays[n - 1].probs_backoffs_ranks.set(pos, prob_backoff_rank);
133 |     }
134 | 
135 |     void build(trie_prob_lm& trie, configuration const& config) {
136 |         trie.m_order = m_order;
137 |         trie.m_unk_prob = std::log10(m_unk_prob);
138 | 
139 |         parallel_executor p(2);
140 |         task_region(*(p.executor), [&](task_region_handle& trh) {
141 |             trh.run([&] {
142 |                 essentials::logger("building vocabulary");
143 |                 uint64_t vocab_size = m_vocab_values.size();
144 |                 vocabulary vocab;
145 |                 {
146 |                     size_t num_bytes =
147 |                         sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
148 |                     vocabulary::builder vocab_builder(vocab_size, num_bytes);
149 |                     vocab_builder.load(config.vocab_tmp_subdirname +
150 |                                        config.vocab_filename);
151 |                     vocab_builder.build(vocab);
152 |                 }
153 | 
154 |                 std::vector<byte_range> bytes;
155 |                 bytes.reserve(vocab_size);
156 |                 compact_vector::builder vocab_ids(
157 |                     vocab_size, util::ceil_log2(vocab_size + 1));
158 |                 for (uint64_t id = 0; id < vocab_size; ++id) {
159 |                     bytes.emplace_back(vocab[id]);
160 |                     vocab_ids.push_back(id);
161 |                 }
162 | 
163 |                 trie.m_vocab.build(bytes,
164 |                                    compact_vector(),  // use default hash-keys
165 |                                    compact_vector(vocab_ids),
166 |                                    compact_vector(m_vocab_values),
167 |                                    identity_adaptor());
168 |             });
169 | 
170 |             trh.run([&] {
171 |                 m_probs.build(trie.m_probs_averages);
172 |                 m_backoffs.build(trie.m_backoffs_averages);
173 | 
174 |                 trie.m_arrays.resize(m_order);
175 | 
176 |                 // #pragma omp parallel for
177 |                 for (uint64_t n = 2; n <= m_order; ++n) {
178 |                     if (n == m_order) {
179 |                         // prefix sums pointers for N-grams
180 |                         // std::cerr << "prefix summing pointers for "
181 |                         //           << int(m_order) << "-grams" << std::endl;
182 |                         auto& pointers = m_arrays[n - 2].pointers;
183 |                         uint64_t prev = 0;
184 |                         for (uint64_t pos = 1; pos < pointers.size(); ++pos) {
185 |                             prev += pointers[pos];
186 |                             pointers.set(pos, prev);
187 |                         }
188 |                     }
189 | 
190 |                     // std::cerr << "building " << int(n) << "-level word_ids"
191 |                     //           << std::endl;
192 |                     // std::cerr << "m_arrays[" << int(n) - 2
193 |                     //           << "].pointers.back() = "
194 |                     //           << m_arrays[n - 2].pointers.back() << "; ";
195 |                     // std::cerr << "m_arrays[" << int(n) - 1
196 |                     //           << "].word_ids.size() = "
197 |                     //           << m_arrays[n - 1].word_ids.size() <<
198 |                     //           std::endl;
199 |                     assert(m_arrays[n - 2].pointers.back() ==
200 |                            m_arrays[n - 1].word_ids.size());
201 |                     m_arrays[n - 1].build_word_ids(n, trie.m_arrays[n - 1],
202 |                                                    m_arrays[n - 2].pointers);
203 |                     m_arrays[n - 1].build_probs_backoffs_ranks(
204 |                         trie.m_arrays[n - 1]);
205 |                     // std::cerr << "DONE" << std::endl;
206 |                 }
207 | 
208 |                 // #pragma omp parallel for
209 |                 for (uint64_t n = 1; n < m_order; ++n) {
210 |                     // std::cerr << "building " << int(n) << "-level pointers"
211 |                     //           << std::endl;
212 |                     m_arrays[n - 1].build_pointers(trie.m_arrays[n - 1]);
213 |                     // std::cerr << "DONE" << std::endl;
214 |                 }
215 |             });
216 |         });
217 | 
218 |         estimation_builder().swap(*this);
219 |     }
220 | 
221 |     // for bebug
222 |     // void print_stats() const {
223 |     //     int n = 1;
224 |     //     for (auto& l : m_arrays) {
225 |     //         std::cerr << "===========\n";
226 |     //         std::cerr << "level-" << n << " statistics:\n";
227 |     //         for (auto x : l.word_ids) {
228 |     //             std::cerr << x << " ";
229 |     //         }
230 |     //         std::cerr << std::endl;
231 |     //         for (auto x : l.probs_backoffs_ranks) {
232 |     //             std::cerr << x << " ";
233 |     //         }
234 |     //         std::cerr << std::endl;
235 |     //         for (auto x : l.pointers) {
236 |     //             std::cerr << x << " ";
237 |     //         }
238 |     //         std::cerr << std::endl;
239 |     //         ++n;
240 |     //     }
241 |     //     std::cerr << std::endl;
242 |     // }
243 | 
244 |     void swap(estimation_builder& other) {
245 |         std::swap(m_order, other.m_order);
246 |         std::swap(m_unk_prob, other.m_unk_prob);
247 |         m_vocab_values.swap(other.m_vocab_values);
248 |         m_probs.swap(other.m_probs);
249 |         m_backoffs.swap(other.m_backoffs);
250 |         m_arrays.swap(other.m_arrays);
251 |     }
252 | 
253 | private:
254 |     uint64_t m_order;
255 |     float m_unk_prob;
256 |     compact_vector::builder m_vocab_values;
257 |     typename Values::builder m_probs;
258 |     typename Values::builder m_backoffs;
259 |     std::vector<typename sorted_array_type::estimation_builder> m_arrays;
260 |     std::vector<uint64_t> m_next_positions;
261 | };
262 | 
263 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/last/index_types.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "../external/tongrams/include/lm_types.hpp"
 4 | 
 5 | namespace tongrams {
 6 | 
 7 | typedef trie_prob_lm<double_valued_mpht64,           // vocabulary
 8 |                      identity_mapper,                // mapper
 9 |                      quantized_sequence_collection,  // values
10 |                      compact_vector,                 // ranks
11 |                      pef::uniform_pef_sequence,      // word ids
12 |                      ef_sequence                     // pointers
13 |                      >
14 |     reversed_trie_index;
15 | 
16 | }  // namespace tongrams
17 | 


--------------------------------------------------------------------------------
/include/last/last.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "../external/tongrams/include/utils/util.hpp"
  4 | 
  5 | #include "constants.hpp"
  6 | #include "util.hpp"
  7 | #include "stream.hpp"
  8 | #include "estimation_builder.hpp"
  9 | #include "index_types.hpp"
 10 | 
 11 | namespace tongrams {
 12 | 
 13 | struct last {
 14 |     typedef stream::floats_vec<> float_vector_type;
 15 | 
 16 |     last(configuration const& config, tmp::data& tmp_data,
 17 |          tmp::statistics& tmp_stats, statistics& stats)
 18 |         : m_config(config)
 19 |         , m_stream_generator(config.max_order)
 20 |         , m_tmp_data(tmp_data)
 21 |         , m_stats(stats)
 22 |         , m_tmp_stats(tmp_stats)
 23 |         , m_record_size(ngrams_block::record_size(config.max_order))
 24 |         , m_pointers(config.max_order - 1, 0)
 25 |         , m_probs(config.max_order, float_vector_type(0))
 26 |         , m_index_builder(config.max_order, config, stats)
 27 |         , m_current_block_id(0)
 28 |         , m_fetched_block_id(0)
 29 |         , m_num_blocks(tmp_data.blocks_offsets.size())
 30 |         , m_CPU_time(0.0)
 31 |         , m_I_time(0.0)
 32 |         , m_O_time(0.0) {
 33 |         assert(m_num_blocks);
 34 |         std::cout << "processing " << m_num_blocks << " blocks" << std::endl;
 35 |         uint8_t N = m_config.max_order;
 36 |         {
 37 |             essentials::directory tmp_dir(m_config.tmp_dirname);
 38 |             for (auto const& filename : tmp_dir) {
 39 |                 if (filename.extension == constants::file_extension::merged) {
 40 |                     m_stream_generator.open(filename.fullpath);
 41 |                     async_fetch_next_block();
 42 |                     break;
 43 |                 }
 44 |             }
 45 |         }
 46 | 
 47 |         auto start = clock_type::now();
 48 |         size_t vocab_size = m_stats.num_ngrams(1);
 49 |         for (uint8_t n = 2; n < N; ++n) {
 50 |             m_tmp_stats.resize(n, vocab_size);
 51 |             m_index_builder.set_next_pointer(n - 1, 0);
 52 |         }
 53 |         m_index_builder.set_next_pointer(N - 1, 0);
 54 |         auto end = clock_type::now();
 55 |         std::chrono::duration<double> elapsed = end - start;
 56 |         m_CPU_time += elapsed.count();
 57 |     }
 58 | 
 59 |     void print_stats() const {
 60 |         std::cout << "\"CPU\":" << m_CPU_time << ", ";
 61 |         std::cout << "\"I\":" << m_I_time << ", ";
 62 |         std::cout << "\"O\":" << m_O_time << ", ";
 63 |     }
 64 | 
 65 |     void async_fetch_next_block() {
 66 |         if (m_fetched_block_id != m_num_blocks) {
 67 |             auto const& offsets = m_tmp_data.blocks_offsets[m_fetched_block_id];
 68 |             // std::cout << "offsets:\n";
 69 |             // for (auto off: offsets) {
 70 |             //     std::cout << off << std::endl;
 71 |             // }
 72 |             size_t n = offsets.back();
 73 |             assert(n > 0);
 74 |             m_stream_generator.async_fetch_next_block(n * m_record_size);
 75 |             ++m_fetched_block_id;
 76 |         }
 77 |     }
 78 | 
 79 |     void run() {
 80 |         auto start = clock_type::now();
 81 | 
 82 |         for (; m_current_block_id < m_num_blocks;) {
 83 |             auto* block = m_stream_generator.get_block();
 84 |             async_fetch_next_block();
 85 |             uint8_t N = block->order();
 86 | 
 87 |             for (uint8_t n = 1; n < N; ++n) {
 88 |                 m_probs[n - 1].reserve(block->size());
 89 |             }
 90 | 
 91 |             /*
 92 |                 - n = 1: (empty context) the denominator is equal to the number
 93 |                of bi-grams;
 94 |                 - n = N: the denominator is equal to the sum of the raw
 95 |                 counts of N-grams having the same context;
 96 |                 - 1 < n < N (otherwise):
 97 |                 the denominator is equal to the sum of the modified counts of
 98 |                all n-grams having the same context.
 99 |             */
100 |             auto begin = block->begin();
101 |             auto end = block->end();
102 |             state s(N, begin, end);
103 | 
104 |             m_tmp_stats.clear();
105 |             // m_tmp_stats.print_stats();
106 | 
107 |             while (s.iterators.back() != end) {
108 |                 for (uint8_t n = 2; n <= N; ++n) {
109 |                     auto& it = s.iterators[n - 1];
110 |                     if (it == end) continue;
111 |                     auto prev_ptr = *it;
112 |                     for (; it != end; ++it) {
113 |                         auto ptr = *it;
114 |                         // std::cout << "scanning " << int(n) << ": ";
115 |                         // ptr.print(N);
116 | 
117 |                         bool context_changes =
118 |                             !ptr.equal_to(prev_ptr, N - n, N - 1);
119 |                         if (context_changes) break;
120 | 
121 |                         ++s.range_lengths[n - 1];
122 |                         auto right = ptr[N - 1];
123 | 
124 |                         if (n == N) {
125 |                             uint64_t count = *(ptr.value(N));
126 |                             s.N_gram_denominator += count;
127 |                             if (count < 5) {
128 |                                 ++m_tmp_stats.r[N - 1][count - 1];
129 |                             } else {
130 |                                 ++m_tmp_stats.r[N - 1].back();
131 |                             }
132 |                         } else {
133 |                             if (n == 2) {
134 |                                 float u = unigram_prob(right);
135 |                                 m_probs[0].push_back(u);
136 |                             }
137 | 
138 |                             auto left = ptr[N - n - 1];
139 |                             m_tmp_stats.update(n, left, right);
140 |                             auto prev_left = prev_ptr[N - n - 1];
141 |                             if (left != prev_left) ++m_pointers[n - 2];
142 |                         }
143 | 
144 |                         prev_ptr = ptr;
145 |                     }
146 |                     write(n, s);
147 |                 }
148 |             }
149 | 
150 |             // write last entries since [begin, end) is aligned
151 |             // according to unigrams' boundaries
152 |             for (uint8_t n = 2; n <= N; ++n) write(n, s);
153 |             for (auto& p : m_probs) p.clear();
154 | 
155 |             ++m_current_block_id;
156 |             if (m_current_block_id % 20 == 0) {
157 |                 std::cerr << "processed " << m_current_block_id << "/"
158 |                           << m_num_blocks << " blocks" << std::endl;
159 |             }
160 | 
161 |             m_stream_generator.release_block();
162 |         }
163 | 
164 |         auto end = clock_type::now();
165 |         std::chrono::duration<double> elapsed = end - start;
166 |         m_CPU_time += elapsed.count();
167 | 
168 |         std::cerr << "processed " << m_current_block_id << "/" << m_num_blocks
169 |                   << " blocks" << std::endl;
170 | 
171 |         std::vector<float_vector_type>().swap(m_probs);
172 | 
173 |         // Close but do not destroy: deleting large file from disk is expensive
174 |         // and we can do this after construction is over.
175 |         m_stream_generator.close();
176 |         // m_index_builder.print_stats();
177 | 
178 |         essentials::logger("compressing index");
179 |         start = clock_type::now();
180 |         reversed_trie_index index;
181 |         m_index_builder.build(index, m_config);
182 |         end = clock_type::now();
183 |         elapsed = end - start;
184 |         std::cerr << "compressing index took: " << elapsed.count() << " [sec]"
185 |                   << std::endl;
186 |         m_CPU_time += elapsed.count();
187 | 
188 |         essentials::logger("writing index");
189 |         start = clock_type::now();
190 |         binary_header bin_header;
191 |         bin_header.remapping_order = 0;
192 |         bin_header.data_structure_t = data_structure_type::pef_trie;
193 |         bin_header.value_t = value_type::prob_backoff;
194 |         util::save(bin_header.get(), index, m_config.output_filename.c_str());
195 |         end = clock_type::now();
196 |         elapsed = end - start;
197 |         std::cerr << "flushing index took: " << elapsed.count() << " [sec]"
198 |                   << std::endl;
199 |         m_O_time = elapsed.count();
200 |         m_I_time = m_stream_generator.I_time();
201 |     }
202 | 
203 | private:
204 |     configuration const& m_config;
205 |     stream::uncompressed_stream_generator m_stream_generator;
206 |     tmp::data& m_tmp_data;
207 |     statistics& m_stats;
208 | 
209 |     tmp::statistics m_tmp_stats;
210 |     size_t m_record_size;
211 | 
212 |     std::vector<uint64_t> m_pointers;
213 |     std::vector<float_vector_type> m_probs;  // buffer of uncompressed probs
214 | 
215 |     reversed_trie_index::estimation_builder m_index_builder;
216 | 
217 |     uint64_t m_current_block_id;
218 |     uint64_t m_fetched_block_id;
219 |     uint64_t m_num_blocks;
220 |     double m_CPU_time;
221 |     double m_I_time;
222 |     double m_O_time;
223 | 
224 |     struct state {
225 |         state(uint8_t N, ngrams_block::iterator begin,
226 |               ngrams_block::iterator end)
227 |             : range_lengths(N, 0)
228 |             , probs_offsets(N, 0)
229 |             , iterators(N, begin)
230 |             , end(end)
231 |             , N_gram_denominator(0) {}
232 |         std::vector<uint64_t> range_lengths;
233 |         std::vector<uint64_t> probs_offsets;
234 |         std::vector<ngrams_block::iterator> iterators;
235 |         const ngrams_block::iterator end;
236 |         uint64_t N_gram_denominator;
237 |     };
238 | 
239 |     float unigram_prob(word_id w);
240 |     void write(uint8_t n, state& s);
241 | };
242 | 
243 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/last/write.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "last.hpp"
  4 | 
  5 | namespace tongrams {
  6 | 
  7 | float last::unigram_prob(word_id w) {
  8 |     uint64_t uni_gram_count = m_tmp_stats.occs[0][w];
  9 |     uint64_t uni_gram_denominator = m_stats.num_ngrams(2);
 10 |     float u =
 11 |         (static_cast<float>(uni_gram_count) - m_stats.D(1, uni_gram_count)) /
 12 |         uni_gram_denominator;
 13 |     u += m_stats.unk_prob();  // interpolate
 14 |     assert(u <= 1.0);
 15 |     return u;
 16 | }
 17 | 
 18 | void last::write(uint8_t n, last::state& s) {  // write ngram
 19 |     uint8_t N = m_config.max_order;
 20 |     assert(n >= 2 and n <= N);
 21 | 
 22 |     auto& l = s.range_lengths[n - 1];
 23 |     if (l == 0) return;
 24 |     auto it = s.iterators[n - 1];
 25 |     auto prev_ptr = *(it - 1);  // always one-past the end
 26 | 
 27 |     if (n != 2) {
 28 |         auto left = prev_ptr[N - n];
 29 |         m_index_builder.set_next_word(n - 1, left);
 30 |     }
 31 | 
 32 |     if (n != N) {
 33 |         ++m_pointers[n - 2];
 34 |         auto pointer = m_pointers[n - 2];
 35 |         m_index_builder.set_next_pointer(n - 1, pointer);
 36 |     }
 37 | 
 38 |     float backoff = 0.0;  // backoff numerator
 39 |     // D_n(1) * N_n(1) + D_n(2) * N_n(2) + D_n(3) * N_n(>= 3),
 40 |     // where: N_n(c) = # n-grams with modified count equal to c
 41 |     // N_n(>= 3) = # n-grams with modified count >= 3
 42 |     for (uint64_t k = 1; k <= 5; ++k) {
 43 |         auto& c = m_tmp_stats.r[n - 1][k - 1];
 44 |         backoff += c * m_stats.D(n, k);  // = D(n, 3) for k >= 3
 45 |         c = 0;                           // reset current range counts
 46 |     }
 47 | 
 48 |     auto& offset = s.probs_offsets[n - 1];
 49 |     assert(offset < m_probs[n - 2].size());
 50 | 
 51 |     if (n != N) {
 52 |         ++m_tmp_stats.current_range_id[n - 1];
 53 | 
 54 |         uint64_t denominator = 0;
 55 |         std::for_each(it - l, it, [&](auto ptr) {
 56 |             auto right = ptr[N - 1];
 57 |             if (m_tmp_stats.was_not_seen(n, right)) {
 58 |                 uint64_t count = m_tmp_stats.occs[n - 1][right];
 59 |                 denominator += count;
 60 |             }
 61 |         });
 62 |         assert(denominator > 0);
 63 |         assert(backoff <= denominator);
 64 |         backoff /= denominator;
 65 | 
 66 |         ++m_tmp_stats.current_range_id[n - 1];
 67 | 
 68 |         std::for_each(it - l, it, [&](auto ptr) {
 69 |             auto right = ptr[N - 1];
 70 |             uint64_t count = m_tmp_stats.occs[n - 1][right];
 71 |             assert(count > 0);
 72 |             float prob =
 73 |                 (static_cast<float>(count) - m_stats.D(n, count)) / denominator;
 74 |             prob += backoff * m_probs[n - 2][offset];
 75 | 
 76 |             if (m_tmp_stats.was_not_seen(n, right)) {
 77 |                 auto& pos = m_tmp_data.probs_offsets[n - 1][right];
 78 |                 m_index_builder.set_prob(n, pos, prob);
 79 |                 if (n == N - 1) {
 80 |                     m_index_builder.set_pointer(n, pos + 1, count);
 81 |                 }
 82 |                 ++pos;
 83 |             }
 84 | 
 85 |             assert(prob <= 1.0);
 86 |             m_probs[n - 1].push_back(prob);
 87 |             ++offset;
 88 |         });
 89 | 
 90 |         ++m_tmp_stats.current_range_id[n - 1];
 91 | 
 92 |     } else {  // N-gram case
 93 | 
 94 |         assert(s.N_gram_denominator > 0);
 95 |         assert(backoff <= s.N_gram_denominator);
 96 |         backoff /= s.N_gram_denominator;
 97 | 
 98 |         std::for_each(it - l, it, [&](auto ptr) {
 99 |             uint64_t count = *(ptr.value(N));
100 |             assert(count > 0);
101 |             float prob = (static_cast<float>(count) - m_stats.D(N, count)) /
102 |                          s.N_gram_denominator;
103 |             prob += backoff * m_probs[N - 2][offset];  // interpolate
104 |             assert(prob <= 1.0);
105 | 
106 |             auto right = ptr[N - 1];
107 |             auto& pos = m_tmp_data.probs_offsets[N - 1][right];
108 | 
109 |             m_index_builder.set_prob(N, pos, prob);
110 |             m_index_builder.set_word(N, pos,
111 |                                      ptr[0]);  // for suffix order
112 |             ++pos;
113 |         });
114 | 
115 |         if (it != s.end) s.N_gram_denominator = *(it->value(N));
116 |     }
117 | 
118 |     if (n == 2) {
119 |         auto context = prev_ptr[N - 2];
120 |         float u = unigram_prob(context);
121 |         m_index_builder.set_next_unigram_values(u, backoff);
122 |     } else {
123 |         m_index_builder.set_next_backoff(n - 1, backoff);
124 |     }
125 | 
126 |     if (n != N) s.probs_offsets[n] = 0;  // reset next order's offset
127 | 
128 |     l = 0;
129 | };
130 | 
131 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/merge_utils.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "util_types.hpp"
 4 | 
 5 | #include <vector>
 6 | #include <algorithm>
 7 | 
 8 | namespace tongrams {
 9 | 
10 | template <typename Iterator>
11 | struct cursor {
12 |     cursor(Iterator const& begin, Iterator const& end, uint64_t i)
13 |         : range(begin, end), index(i) {}
14 | 
15 |     iterator_range<Iterator> range;
16 |     uint64_t index;
17 | };
18 | 
19 | template <typename Comparator>
20 | struct cursor_comparator {
21 |     cursor_comparator() {}
22 |     cursor_comparator(uint8_t ngram_order) : m_comparator(ngram_order) {}
23 | 
24 |     template <typename T>
25 |     bool operator()(cursor<T>& l, cursor<T>& r) {
26 |         return m_comparator.compare(l.range.begin.operator*(),
27 |                                     r.range.begin.operator*()) >= 0;
28 |     }
29 | 
30 | private:
31 |     Comparator m_comparator;
32 | };
33 | 
34 | template <typename T, typename Comparator>
35 | struct min_heap {
36 |     min_heap(Comparator comparator) : m_comparator(comparator) {}
37 | 
38 |     void push(T const& t) {
39 |         m_q.push_back(t);
40 |         std::push_heap(m_q.begin(), m_q.end(), m_comparator);
41 |     }
42 | 
43 |     T& top() {
44 |         return m_q.front();
45 |     }
46 | 
47 |     void pop() {
48 |         std::pop_heap(m_q.begin(), m_q.end(), m_comparator);
49 |         m_q.pop_back();
50 |     }
51 | 
52 |     void heapify() {
53 |         sink(0);
54 |     }
55 | 
56 |     void clear() {
57 |         m_q.clear();
58 |     }
59 | 
60 |     bool empty() const {
61 |         return m_q.empty();
62 |     }
63 | 
64 |     inline uint64_t size() const {
65 |         return m_q.size();
66 |     }
67 | 
68 | private:
69 |     std::vector<T> m_q;
70 |     Comparator m_comparator;
71 | 
72 |     void sink(uint64_t pos) {
73 |         assert(pos <= size());
74 |         while (2 * pos + 1 < size()) {
75 |             uint64_t i = 2 * pos + 1;
76 |             if (i + 1 < size() and m_comparator(m_q[i], m_q[i + 1])) ++i;
77 |             if (!m_comparator(m_q[pos], m_q[i])) break;
78 |             std::swap(m_q[pos], m_q[i]);
79 |             pos = i;
80 |         }
81 |     }
82 | };
83 | 
84 | }  // namespace tongrams
85 | 


--------------------------------------------------------------------------------
/include/merging/merging.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "util.hpp"
  4 | #include "constants.hpp"
  5 | #include "stream.hpp"
  6 | #include "merge_utils.hpp"
  7 | #include "merging_writer.hpp"
  8 | 
  9 | namespace tongrams {
 10 | 
 11 | template <typename StreamGenerator>
 12 | struct merging {
 13 |     typedef cursor_comparator<prefix_order_comparator_type>
 14 |         cursor_comparator_type;
 15 | 
 16 |     merging(configuration const& config, tmp::data& tmp_data,
 17 |             tmp::statistics& /*tmp_stats*/, statistics& /*stats*/)
 18 |         : m_config(config)
 19 |         , m_writer(config, tmp_data)
 20 |         , m_comparator(config.max_order)
 21 |         , m_cursors(cursor_comparator_type(config.max_order)) {}
 22 | 
 23 |     typedef typename StreamGenerator::block_type input_block_type;
 24 | 
 25 |     void run() {
 26 |         std::vector<std::string> filenames;
 27 |         {
 28 |             essentials::directory tmp_dir(m_config.tmp_dirname);
 29 |             for (auto const& filename : tmp_dir) {
 30 |                 if (filename.extension == constants::file_extension::counts) {
 31 |                     filenames.push_back(filename.fullpath);
 32 |                 }
 33 |             }
 34 |         }
 35 | 
 36 |         uint8_t N = m_config.max_order;
 37 |         size_t num_files_to_merge = filenames.size();
 38 |         assert(num_files_to_merge > 0);
 39 |         std::cerr << "merging " << num_files_to_merge << " files" << std::endl;
 40 | 
 41 |         uint64_t record_size = ngrams_block::record_size(N);
 42 |         uint64_t min_load_size = m_config.RAM / (2 * num_files_to_merge + 1) /
 43 |                                  record_size * record_size;
 44 |         uint64_t default_load_size =
 45 |             (64 * essentials::MiB) / record_size * record_size;
 46 |         uint64_t load_size = default_load_size;
 47 |         if (min_load_size < default_load_size) {
 48 |             std::cerr << "using min. load size of " << min_load_size
 49 |                       << " because not enough RAM is available" << std::endl;
 50 |             load_size = min_load_size;
 51 |         }
 52 |         assert(load_size % record_size == 0);
 53 | 
 54 |         for (auto const& filename : filenames) {
 55 |             m_stream_generators.emplace_back(N);
 56 |             auto& gen = m_stream_generators.back();
 57 |             gen.open(filename);
 58 |             assert(gen.size() == 0);
 59 |             gen.fetch_next_block(load_size);
 60 |         }
 61 | 
 62 |         auto get_block = [](StreamGenerator& gen) {
 63 |             auto* block = gen.get_block();
 64 |             assert(block->template is_sorted<prefix_order_comparator_type>(
 65 |                 block->begin(), block->end()));
 66 |             return block;
 67 |         };
 68 | 
 69 |         assert(m_cursors.empty());
 70 |         for (uint64_t k = 0; k != m_stream_generators.size(); ++k) {
 71 |             auto& gen = m_stream_generators[k];
 72 |             auto* block = get_block(gen);
 73 |             cursor<typename input_block_type::iterator> c(block->begin(),
 74 |                                                           block->end(), k);
 75 |             m_cursors.push(c);
 76 |         }
 77 | 
 78 |         uint64_t num_ngrams_per_block = load_size / record_size;
 79 |         std::cerr << "num_ngrams_per_block = " << num_ngrams_per_block
 80 |                   << " ngrams" << std::endl;
 81 | 
 82 |         ngrams_block result(N);
 83 |         result.resize_memory(num_ngrams_per_block);
 84 |         result.reserve_index(num_ngrams_per_block);
 85 | 
 86 |         m_writer.start();
 87 | 
 88 |         while (!m_cursors.empty()) {
 89 |             auto& top = m_cursors.top();
 90 |             auto min = *(top.range.begin);
 91 | 
 92 |             if (!result.size()) {
 93 |                 result.push_back(min.data, min.data + N, *(min.value(N)));
 94 |             } else {
 95 |                 auto& back = result.back();
 96 |                 bool equal = equal_to(min.data, back.data, sizeof_ngram(N));
 97 |                 if (!equal) {
 98 |                     if (result.size() == num_ngrams_per_block) {
 99 |                         while (m_writer.size() > 0)
100 |                             ;  // wait for flush
101 |                         m_writer.push(result);
102 | 
103 |                         result.init(N);
104 |                         result.resize_memory(num_ngrams_per_block);
105 |                         result.reserve_index(num_ngrams_per_block);
106 |                         assert(result.empty());
107 |                     }
108 |                     result.push_back(min.data, min.data + N, *(min.value(N)));
109 |                 } else {
110 |                     *(back.value(N)) += *(min.value(N));
111 |                 }
112 |             }
113 | 
114 |             ++(top.range.begin);
115 | 
116 |             if (top.range.begin == top.range.end) {
117 |                 auto& gen = m_stream_generators[top.index];
118 |                 gen.release_block();
119 |                 if (gen.eos()) {
120 |                     assert(gen.empty());
121 |                     gen.close_and_remove();
122 |                     m_cursors.pop();
123 |                 } else {
124 |                     gen.fetch_next_block(load_size);
125 |                     auto* block = get_block(gen);
126 |                     top.range.begin = block->begin();
127 |                     top.range.end = block->end();
128 |                 }
129 |             }
130 | 
131 |             m_cursors.heapify();
132 |         }
133 | 
134 |         m_writer.push(result);
135 |         m_writer.terminate();
136 |     }
137 | 
138 | private:
139 |     configuration const& m_config;
140 |     std::deque<StreamGenerator> m_stream_generators;
141 |     merging_writer m_writer;
142 |     prefix_order_comparator_type m_comparator;
143 | 
144 |     min_heap<cursor<typename input_block_type::iterator>,
145 |              cursor_comparator_type>
146 |         m_cursors;
147 | };
148 | 
149 | }  // namespace tongrams
150 | 


--------------------------------------------------------------------------------
/include/merging/merging_writer.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "configuration.hpp"
  4 | #include "tmp.hpp"
  5 | 
  6 | namespace tongrams {
  7 | 
  8 | struct merging_writer {
  9 |     merging_writer(configuration const& config, tmp::data& tmp_data)
 10 |         : m_num_flushes(0), m_order(config.max_order), m_ngrams(0) {
 11 |         m_buffer.open();
 12 |         m_os.open(config.output_filename.c_str(),
 13 |                   std::ofstream::ate | std::ofstream::app);
 14 | 
 15 |         tmp_data.vocab_builder.build(m_vocab);
 16 | 
 17 |         //                               18446744073709551615\n
 18 |         static std::string empty_line = "                    \n";
 19 |         m_os.write(empty_line.data(), empty_line.size());
 20 | 
 21 |         m_params.path = config.output_filename;
 22 |         m_params.offset = 0;
 23 |         m_params.length = sysconf(_SC_PAGESIZE);
 24 |     }
 25 | 
 26 |     ~merging_writer() {
 27 |         if (!m_buffer.empty()) {
 28 |             std::cerr << "Error: some data still need to be written"
 29 |                       << std::endl;
 30 |             std::terminate();
 31 |         }
 32 |     }
 33 | 
 34 |     void start() {
 35 |         m_thread = std::thread(&merging_writer::run, this);
 36 |     }
 37 | 
 38 |     void terminate() {
 39 |         m_buffer.lock();
 40 |         m_buffer.close();
 41 |         m_buffer.unlock();
 42 |         if (m_thread.joinable()) m_thread.join();
 43 |         assert(!m_buffer.active());
 44 |         while (!m_buffer.empty()) flush();
 45 |         m_os.close();
 46 | 
 47 |         // write number of ngrams at the beginning of file
 48 |         boost::iostreams::mapped_file_sink tmp(m_params);
 49 |         char* data = tmp.data();
 50 |         std::string str = std::to_string(m_ngrams);
 51 |         memcpy(data, str.data(), str.size());
 52 |         tmp.close();
 53 | 
 54 |         std::cerr << "\tmerging_writer thread stats:\n";
 55 |         std::cerr << "\tflushed blocks: " << m_num_flushes << "\n";
 56 |         std::cerr << "\tflushed ngrams: " << m_ngrams << "\n";
 57 |     }
 58 | 
 59 |     void push(ngrams_block& block) {
 60 |         m_buffer.lock();
 61 |         m_buffer.push(block);
 62 |         m_buffer.unlock();
 63 |     }
 64 | 
 65 |     size_t size() {
 66 |         m_buffer.lock();
 67 |         size_t s = m_buffer.size();
 68 |         m_buffer.unlock();
 69 |         return s;
 70 |     }
 71 | 
 72 | private:
 73 |     semi_sync_queue<ngrams_block> m_buffer;
 74 |     std::ofstream m_os;
 75 |     std::thread m_thread;
 76 |     uint64_t m_num_flushes;
 77 |     uint64_t m_order;
 78 |     uint64_t m_ngrams;
 79 |     vocabulary m_vocab;
 80 |     boost::iostreams::mapped_file_params m_params;
 81 | 
 82 |     void run() {
 83 |         while (m_buffer.active()) flush();
 84 |     }
 85 | 
 86 |     void flush() {
 87 |         m_buffer.lock();
 88 |         if (m_buffer.empty()) {
 89 |             m_buffer.unlock();
 90 |             return;
 91 |         }
 92 |         auto& block = m_buffer.pick();
 93 |         m_buffer.unlock();
 94 | 
 95 |         for (auto const ngram : block) {
 96 |             for (uint64_t i = 0; i != m_order; ++i) {
 97 |                 auto br = m_vocab[ngram[i]];
 98 |                 util::write(m_os, br);
 99 |                 if (i != m_order - 1) m_os << " ";
100 |             }
101 |             m_os << "\t" << *ngram.value(m_order) << "\n";
102 |         }
103 | 
104 |         m_ngrams += block.size();
105 |         block.release();
106 | 
107 |         m_buffer.lock();
108 |         m_buffer.pop();
109 |         m_buffer.unlock();
110 |         ++m_num_flushes;
111 |         if (m_num_flushes % 20 == 0) {
112 |             std::cerr << "flushed " << m_num_flushes << " blocks" << std::endl;
113 |         }
114 |     }
115 | };
116 | 
117 | }  // namespace tongrams


--------------------------------------------------------------------------------
/include/ngrams_block.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "util_types.hpp"
  4 | #include "comparators.hpp"
  5 | #include "../external/tongrams/include/utils/util.hpp"
  6 | 
  7 | namespace tongrams {
  8 | 
  9 | struct ngram_pointer {
 10 |     inline word_id operator[](size_t i) const {
 11 |         return data[i];
 12 |     }
 13 | 
 14 |     inline count_type* value(uint8_t order) const {
 15 |         return reinterpret_cast<count_type*>(data + order);
 16 |     }
 17 | 
 18 |     inline bool equal_to(ngram_pointer const& other, size_t begin,
 19 |                          size_t end) const {
 20 |         return memcmp(other.data + begin, this->data + begin,
 21 |                       (end - begin) * sizeof(word_id)) == 0;
 22 |     }
 23 | 
 24 |     void print(uint8_t order) const {
 25 |         for (uint8_t i = 0; i < order; ++i) {
 26 |             std::cerr << data[i] << " ";
 27 |         }
 28 |         std::cerr << "[" << value(order) << "]\n";
 29 |     }
 30 | 
 31 |     word_id* data;
 32 | };
 33 | 
 34 | typedef context_order_comparator<ngram_pointer> context_order_comparator_type;
 35 | typedef prefix_order_comparator<ngram_pointer> prefix_order_comparator_type;
 36 | 
 37 | struct ngrams_block_statistics {
 38 |     word_id max_word_id;
 39 |     uint64_t max_count;
 40 | };
 41 | 
 42 | struct ngrams_allocator {
 43 |     ngrams_allocator() : m_offset(0), m_alignment(0) {}
 44 | 
 45 |     ngrams_allocator(uint8_t order) {
 46 |         init(order);
 47 |     }
 48 | 
 49 |     void init(uint8_t order) {
 50 |         m_offset = 0;
 51 |         m_alignment = sizeof_ngram(order);
 52 |     }
 53 | 
 54 |     void resize(std::vector<uint8_t>& memory, uint64_t num_ngrams) {
 55 |         memory.resize((m_alignment + sizeof(count_type)) * num_ngrams);
 56 |     }
 57 | 
 58 |     template <typename Iterator>
 59 |     void construct(ngram_pointer& ptr, Iterator begin, Iterator end,
 60 |                    count_type count) {
 61 |         uint64_t n = 0;
 62 |         for (; begin != end; ++n, ++begin) ptr.data[n] = *begin;
 63 |         *(ptr.value(n)) = count;
 64 |     }
 65 | 
 66 |     auto allocate(std::vector<uint8_t>& memory) {
 67 |         assert(m_offset < memory.size());
 68 |         ngram_pointer ptr;
 69 |         ptr.data = reinterpret_cast<word_id*>(&memory[m_offset]);
 70 |         m_offset += m_alignment + sizeof(count_type);
 71 |         return ptr;
 72 |     }
 73 | 
 74 |     auto allocate(std::vector<uint8_t>& memory, uint64_t i) {
 75 |         uint64_t offset = i * (m_alignment + sizeof(count_type));
 76 |         assert(offset < memory.size());
 77 |         ngram_pointer ptr;
 78 |         ptr.data = reinterpret_cast<word_id*>(&memory[offset]);
 79 |         return ptr;
 80 |     }
 81 | 
 82 |     uint8_t order() const {
 83 |         return m_alignment / sizeof(word_id);
 84 |     }
 85 | 
 86 |     void swap(ngrams_allocator& other) {
 87 |         std::swap(m_offset, other.m_offset);
 88 |         std::swap(m_alignment, other.m_alignment);
 89 |     }
 90 | 
 91 | private:
 92 |     uint64_t m_offset;
 93 |     uint64_t m_alignment;
 94 | };
 95 | 
 96 | struct ngrams_block {
 97 |     typedef typename std::vector<ngram_pointer>::iterator iterator;
 98 | 
 99 |     ngrams_block() {}
100 | 
101 |     ngrams_block(uint8_t order) {
102 |         init(order);
103 |     }
104 | 
105 |     ngrams_block(ngrams_block&& rhs) {
106 |         *this = std::move(rhs);
107 |     }
108 | 
109 |     void init(uint8_t order) {
110 |         stats = {0, 0};
111 |         m_memory.resize(0);
112 |         m_allocator.init(order);
113 |         m_index.resize(0);
114 |     }
115 | 
116 |     inline ngrams_block& operator=(ngrams_block&& rhs) {
117 |         if (this != &rhs) swap(rhs);
118 |         return *this;
119 |     };
120 | 
121 |     ngrams_block(ngrams_block const&) {
122 |         assert(false);
123 |     }
124 | 
125 |     ngrams_block& operator=(ngrams_block const&) {
126 |         assert(false);
127 |         return *this;
128 |     };
129 | 
130 |     inline static size_t record_size(uint8_t order) {
131 |         return sizeof_ngram(order) + sizeof(count_type);
132 |     }
133 | 
134 |     inline uint64_t record_size() const {
135 |         return record_size(order());
136 |     }
137 | 
138 |     void resize_memory(uint64_t num_ngrams) {
139 |         m_allocator.resize(m_memory, num_ngrams);
140 |     }
141 | 
142 |     void reserve_index(uint64_t num_ngrams) {
143 |         m_index.reserve(num_ngrams);
144 |     }
145 | 
146 |     void resize_index(uint64_t num_ngrams) {
147 |         m_index.resize(num_ngrams);
148 |     }
149 | 
150 |     void release() {
151 |         ngrams_block().swap(*this);
152 |     }
153 | 
154 |     void push_back(ngram_pointer ptr) {
155 |         m_index.push_back(ptr);
156 |     }
157 | 
158 |     template <typename Iterator>
159 |     void push_back(Iterator begin, Iterator end, count_type count) {
160 |         auto ptr = m_allocator.allocate(m_memory);
161 |         m_allocator.construct(ptr, begin, end, count);
162 |         push_back(ptr);
163 |     }
164 | 
165 |     template <typename Iterator>
166 |     void set(uint64_t i, Iterator begin, Iterator end, count_type count) {
167 |         assert(i < size());
168 |         auto ptr = m_allocator.allocate(m_memory, i);
169 |         m_allocator.construct(ptr, begin, end, count);
170 |         m_index[i] = ptr;
171 |     }
172 | 
173 |     inline size_t size() const {
174 |         return m_index.size();
175 |     }
176 | 
177 |     inline bool empty() const {
178 |         return m_index.empty();
179 |     }
180 | 
181 |     inline uint8_t order() const {
182 |         return m_allocator.order();
183 |     }
184 | 
185 |     void write_memory(std::ofstream& os) {
186 |         assert(m_memory.size() > 0);
187 |         std::streamsize num_bytes = size() * record_size();
188 |         os.write(reinterpret_cast<char const*>(m_memory.data()), num_bytes);
189 |     }
190 | 
191 |     char* initialize_memory(size_t num_bytes) {
192 |         m_memory.resize(num_bytes);
193 |         return reinterpret_cast<char*>(m_memory.data());
194 |     }
195 | 
196 |     char* read_bytes(std::ifstream& is, char* dest, size_t num_bytes) {
197 |         is.read(dest, static_cast<std::streamsize>(num_bytes));
198 |         dest += num_bytes;
199 |         return dest;
200 |     }
201 | 
202 |     void materialize_index(uint64_t num_ngrams) {
203 |         m_index.clear();
204 |         m_index.reserve(num_ngrams);
205 |         assert(m_memory.size() > 0);
206 |         for (uint64_t i = 0; i != num_ngrams; ++i) {
207 |             auto ptr = m_allocator.allocate(m_memory);
208 |             push_back(ptr);
209 |         }
210 |         assert(size() == num_ngrams);
211 |     }
212 | 
213 |     inline ngram_pointer operator[](size_t i) {
214 |         assert(i < size());
215 |         return m_index[i];
216 |     }
217 | 
218 |     inline count_type& value(size_t i) {
219 |         assert(i < size());
220 |         return *(m_index[i].value(order()));
221 |     }
222 | 
223 |     inline iterator begin() {
224 |         return m_index.begin();
225 |     }
226 | 
227 |     inline iterator end() {
228 |         return m_index.end();
229 |     }
230 | 
231 |     inline ngram_pointer& front() {
232 |         return m_index.front();
233 |     }
234 | 
235 |     inline ngram_pointer& back() {
236 |         return m_index.back();
237 |     }
238 | 
239 |     size_t num_bytes() const {
240 |         return m_memory.size();
241 |     }
242 | 
243 |     template <typename Comparator, typename Iterator>
244 |     bool is_sorted(Iterator begin, Iterator end) {
245 |         std::cerr << "checking if block is sorted...";
246 |         uint8_t N = order();
247 |         Comparator comparator(N);
248 |         auto it = begin;
249 |         auto prev = *it;
250 |         ++it;
251 |         bool ret = true;
252 |         for (size_t i = 1; it != end; ++i, ++it) {
253 |             auto curr = *it;
254 |             int cmp = comparator.compare(prev, curr);
255 |             if (cmp == 0) {
256 |                 std::cerr << "Error at " << i << "/" << size() << ":\n";
257 |                 prev.print(N);
258 |                 curr.print(N);
259 |                 std::cerr << "Repeated ngrams" << std::endl;
260 |             }
261 |             if (cmp > 0) {
262 |                 std::cerr << "Error at " << i << "/" << size() << ":\n";
263 |                 prev.print(N);
264 |                 curr.print(N);
265 |                 std::cerr << std::endl;
266 |                 ret = false;
267 |             }
268 |             prev = curr;
269 |         }
270 |         if (ret) std::cerr << "OK!" << std::endl;
271 |         return ret;
272 |     }
273 | 
274 |     void swap(ngrams_block& other) {
275 |         std::swap(stats.max_word_id, other.stats.max_word_id);
276 |         std::swap(stats.max_count, other.stats.max_count);
277 |         m_memory.swap(other.m_memory);
278 |         m_allocator.swap(other.m_allocator);
279 |         m_index.swap(other.m_index);
280 |     }
281 | 
282 |     ngrams_block_statistics stats;
283 | 
284 | protected:
285 |     std::vector<uint8_t> m_memory;
286 |     ngrams_allocator m_allocator;
287 |     std::vector<ngram_pointer> m_index;
288 | };
289 | 
290 | struct ngram_cache {
291 |     ngram_cache() : m_empty(true) {}
292 | 
293 |     typedef ngram_pointer pointer;
294 | 
295 |     ngram_cache(uint8_t order) {
296 |         init(order);
297 |     }
298 | 
299 |     void init(uint8_t order) {
300 |         m_data.resize(ngrams_block::record_size(order));
301 |         m_empty = true;
302 |     }
303 | 
304 |     pointer get() {
305 |         pointer ptr;
306 |         ptr.data = reinterpret_cast<word_id*>(m_data.data());
307 |         return ptr;
308 |     }
309 | 
310 |     void store(pointer const& ptr) {
311 |         std::memcpy(m_data.data(), ptr.data, m_data.size());
312 |         m_empty = false;
313 |     }
314 | 
315 |     bool empty() const {
316 |         return m_empty;
317 |     }
318 | 
319 |     void swap(ngram_cache& other) {
320 |         m_data.swap(other.m_data);
321 |         std::swap(m_empty, other.m_empty);
322 |     }
323 | 
324 | private:
325 |     std::vector<uint8_t> m_data;
326 |     bool m_empty;
327 | };
328 | 
329 | }  // namespace tongrams
330 | 


--------------------------------------------------------------------------------
/include/statistics.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <numeric>
  4 | 
  5 | #include "tmp.hpp"
  6 | #include "configuration.hpp"
  7 | #include "util_types.hpp"
  8 | 
  9 | namespace tongrams {
 10 | 
 11 | struct statistics {
 12 |     struct builder {
 13 |         builder(configuration const& config, tmp::data& tmp_data,
 14 |                 tmp::statistics& tmp_stats)
 15 |             : m_config(config)
 16 |             , m_tmp_stats(tmp_stats)
 17 |             , m_tmp_data(tmp_data)
 18 |             , m_ngram_cache(config.max_order)
 19 |             , m_t(config.max_order, std::vector<uint64_t>(4, 0))
 20 |             , m_D(config.max_order, std::vector<float>(4, 0))
 21 |             , m_num_ngrams(config.max_order, 0)
 22 |             , m_total_num_words(0)
 23 |             , m_unk_prob(0.0) {}
 24 | 
 25 |         void init(size_t vocab_size) {
 26 |             assert(vocab_size);
 27 |             m_vocab_size = vocab_size;
 28 |             uint64_t N = m_config.max_order;
 29 |             for (uint64_t n = 1; n < N; ++n) {
 30 |                 m_tmp_stats.resize(n, m_vocab_size);
 31 |             }
 32 |             m_tmp_data.probs_offsets.resize(N, std::vector<uint64_t>(0, 0));
 33 |             for (uint64_t n = 2; n <= N; ++n) {
 34 |                 m_tmp_data.probs_offsets[n - 1].resize(m_vocab_size, 0);
 35 |             }
 36 |         }
 37 | 
 38 |         template <typename Iterator>
 39 |         void compute_left_extensions(Iterator begin, size_t len) {
 40 |             uint64_t N = m_config.max_order;
 41 |             m_num_ngrams[N - 1] += len;
 42 | 
 43 |             auto prev_ptr = *begin;
 44 |             if (not m_ngram_cache.empty()) {
 45 |                 prev_ptr = m_ngram_cache.get();
 46 |             }
 47 | 
 48 |             for (size_t i = 0; i < len; ++i, ++begin) {
 49 |                 auto ptr = *begin;
 50 |                 word_id right = ptr[N - 1];
 51 | 
 52 |                 for (uint64_t n = 1; n < N; ++n) {
 53 |                     bool context_changes =
 54 |                         !ptr.equal_to(prev_ptr, N - n, N - 1);
 55 |                     if (n != 1 and context_changes) {
 56 |                         m_tmp_stats.combine(n);
 57 |                         ++m_num_ngrams[n - 2];  // previous order
 58 |                     }
 59 | 
 60 |                     word_id left = ptr[N - n - 1];
 61 |                     if (m_tmp_stats.update(n, left, right)) {
 62 |                         ++m_tmp_data.probs_offsets[n][right];
 63 |                     }
 64 |                 }
 65 | 
 66 |                 if (!ptr.equal_to(prev_ptr, 0, N - 1)) ++m_num_ngrams[N - 2];
 67 | 
 68 |                 // N-gram case: they do not have modified counts,
 69 |                 // rather their counts are equal to the occurrence in corpus
 70 |                 uint64_t count = *(ptr.value(N));
 71 |                 assert(count > 0);
 72 |                 m_total_num_words += count;
 73 |                 if (count <= 4) ++m_tmp_stats.t[N - 1][count - 1];
 74 |                 prev_ptr = ptr;
 75 |             }
 76 | 
 77 |             m_ngram_cache.store(prev_ptr);
 78 |         }
 79 | 
 80 |         void finalize() {
 81 |             ++m_num_ngrams[m_config.max_order - 2];
 82 |             for (uint64_t n = 2; n < m_config.max_order; ++n) {
 83 |                 ++m_num_ngrams[n - 2];
 84 |                 m_tmp_stats.combine(n);
 85 |                 m_tmp_stats.release(n);
 86 |             }
 87 |             for (uint64_t n = 2; n <= m_config.max_order; ++n) {
 88 |                 for (uint64_t k = 1; k <= 4; ++k) {
 89 |                     m_t[n - 1][k - 1] = m_tmp_stats.t[n - 1][k - 1];
 90 |                 }
 91 |             }
 92 |         }
 93 | 
 94 |         void build(statistics& stats) {
 95 |             uint64_t N = m_config.max_order;
 96 |             stats.num_ngrams(1) = m_vocab_size;
 97 |             for (uint64_t n = 2; n <= N; ++n) {
 98 |                 stats.num_ngrams(n) = m_num_ngrams[n - 1];
 99 |             }
100 | 
101 |             std::cerr << "number of ngrams:\n";
102 |             size_t sum = 0;
103 |             for (uint64_t n = 1; n <= N; ++n) {
104 |                 std::cerr << int(n) << "-grams: " << stats.num_ngrams(n)
105 |                           << "\n";
106 |                 sum += stats.num_ngrams(n);
107 |             }
108 |             std::cerr << "total num. grams: " << sum << std::endl;
109 | 
110 |             // NOTE: smoothing statistics for unigrams must be computed globally
111 |             for (auto k : m_tmp_stats.occs[0]) {
112 |                 assert(k);
113 |                 if (k <= 4) ++m_t[0][k - 1];
114 |             }
115 | 
116 |             for (uint64_t n = 2; n <= N; ++n) {
117 |                 auto& positions = m_tmp_data.probs_offsets[n - 1];
118 |                 // compute prefix sums
119 |                 for (uint64_t id = 0, sum = 0; id < m_vocab_size; ++id) {
120 |                     uint64_t occ = positions[id];
121 |                     positions[id] = sum;
122 |                     sum += occ;
123 |                 }
124 |                 // for (auto x: positions) {
125 |                 //     std::cerr << x << " ";
126 |                 // }
127 |                 // std::cerr << std::endl;
128 |             }
129 | 
130 |             // NOTE: do not compute for small synthetic datasets
131 |             for (uint64_t n = 1; n <= N; ++n) {
132 |                 for (uint64_t k = 1; k <= 4; ++k) {
133 |                     try {
134 |                         D(n, k) = compute_discount(n, k);
135 |                     } catch (std::runtime_error const& e) {
136 |                         e.what();
137 |                         complain(n, k);
138 |                         util::clean_temporaries(m_config.tmp_dirname);
139 |                         std::abort();
140 |                     }
141 |                 }
142 |             }
143 | 
144 |             for (uint64_t k = 1; k <= 3; ++k) {
145 |                 m_unk_prob += t(1, k) * D(1, k);
146 |             }
147 |             m_unk_prob /= stats.num_ngrams(2);  // uni-grams' denominator
148 |             m_unk_prob /= stats.num_ngrams(
149 |                 1);  // interpolate with uniform distribution: 1/|vocabulary|
150 | 
151 |             stats.m_t.swap(m_t);
152 |             stats.m_D.swap(m_D);
153 |             stats.m_total_num_words = m_total_num_words - N + 1;
154 |             stats.m_unk_prob = m_unk_prob;
155 |             std::cerr << "total num. tokens: " << stats.m_total_num_words
156 |                       << std::endl;
157 |         }
158 | 
159 |     private:
160 |         configuration const& m_config;
161 |         tmp::statistics& m_tmp_stats;
162 |         tmp::data& m_tmp_data;
163 |         ngram_cache m_ngram_cache;
164 |         std::vector<std::vector<uint64_t>> m_t;
165 |         std::vector<std::vector<float>> m_D;
166 |         std::vector<uint64_t> m_num_ngrams;
167 |         uint64_t m_total_num_words;  // total numer of words in the text corpus
168 |         size_t m_vocab_size;
169 |         float m_unk_prob;  // prob of <unk> word, which is backoff(empty) /
170 |                            // vocabulary_size
171 | 
172 |         float& D(uint64_t n, uint64_t k) {
173 |             assert(k > 0);
174 |             assert(n >= 1 and n <= m_config.max_order);
175 |             if (k >= 3) return m_D[n - 1].back();
176 |             return m_D[n - 1][k - 1];
177 |         }
178 | 
179 |         inline uint64_t t(uint64_t n, uint64_t k) const {
180 |             assert(n >= 1 and n <= m_config.max_order);
181 |             assert(k > 0 and k <= 4);
182 |             return m_t[n - 1][k - 1];
183 |         }
184 | 
185 |         float compute_discount(uint64_t n, uint64_t k) {
186 |             assert(k > 0 and k <= 4);
187 |             assert(n >= 1 and n <= m_config.max_order);
188 |             if (k <= 3) {
189 |                 float d = (t(n, 1) + 2 * t(n, 2)) * t(n, k);
190 |                 if (d == 0.0) throw std::runtime_error("bad discount");
191 |                 return static_cast<float>(k) -
192 |                        static_cast<float>((k + 1) * t(n, 1) * t(n, k + 1)) / d;
193 |             }
194 |             return compute_discount(n, 3);
195 |         }
196 | 
197 |         void complain(uint64_t n, uint64_t k) {
198 |             auto check = [&](uint64_t n, uint64_t k) {
199 |                 if (!t(n, k)) std::cerr << k << "\n";
200 |             };
201 |             std::cerr << "Error: could not calculate Kneser-Ney discounts for "
202 |                       << int(n) << "-grams with adjusted count " << k << "\n"
203 |                       << "because it was not observed any " << int(n)
204 |                       << "-grams with adjusted count:\n";
205 |             check(n, 1);
206 |             check(n, 2);
207 |             check(n, 3);
208 |             std::cerr << "Is this small or artificial data?" << std::endl;
209 |         }
210 |     };
211 | 
212 |     statistics(uint64_t order)
213 |         : m_num_ngrams(order, 0)
214 |         , m_t(order, std::vector<uint64_t>(4, 0))
215 |         , m_D(order, std::vector<float>(4, 0))
216 |         , m_total_num_words(0)
217 |         , m_unk_prob(0.0) {}
218 | 
219 |     inline float D(uint64_t n, uint64_t k) const {
220 |         assert(k > 0);
221 |         assert(n >= 1 and n <= order());
222 |         if (k >= 3) return m_D[n - 1].back();
223 |         return m_D[n - 1][k - 1];
224 |     }
225 | 
226 |     inline uint64_t t(uint64_t n, uint64_t k) const {
227 |         assert(n >= 1 and n <= order());
228 |         assert(k > 0 and k <= 4);
229 |         return m_t[n - 1][k - 1];
230 |     }
231 | 
232 |     inline uint64_t& num_ngrams(uint64_t n) {
233 |         assert(n >= 1 and n <= order());
234 |         return m_num_ngrams[n - 1];
235 |     }
236 | 
237 |     inline uint64_t total_words() const {
238 |         return m_total_num_words;
239 |     }
240 | 
241 |     uint64_t total_grams() const {
242 |         return std::accumulate(m_num_ngrams.begin(), m_num_ngrams.end(),
243 |                                uint64_t(0));
244 |     }
245 | 
246 |     inline float unk_prob() const {
247 |         return m_unk_prob;
248 |     }
249 | 
250 |     uint64_t order() const {
251 |         return m_num_ngrams.size();
252 |     }
253 | 
254 |     void print() {
255 |         std::cerr << "number of ngrams:\n";
256 |         for (uint64_t n = 1; n <= order(); ++n) {
257 |             std::cerr << n << "-grams: " << num_ngrams(n) << "\n";
258 |         }
259 | 
260 |         std::cerr << "smoothing statistics:\n";
261 |         for (uint64_t n = 1; n <= order(); ++n) {
262 |             uint64_t sum = 0;
263 |             for (uint64_t k = 1; k <= 4; ++k) {
264 |                 std::cerr << "t_" << n << "(" << k << ") = " << t(n, k) << "\n";
265 |                 sum += t(n, k);
266 |             }
267 |             std::cerr << "sum: " << sum << "\n" << std::endl;
268 |         }
269 | 
270 |         std::cerr << "discounts:\n";
271 |         for (uint64_t n = 1; n <= order(); ++n) {
272 |             for (uint64_t k = 1; k <= 3; ++k) {
273 |                 std::cerr << "D_" << n << "(" << k << ") = " << D(n, k) << " ";
274 |             }
275 |             std::cerr << std::endl;
276 |         }
277 |     }
278 | 
279 | private:
280 |     std::vector<uint64_t> m_num_ngrams;
281 |     std::vector<std::vector<uint64_t>> m_t;
282 |     std::vector<std::vector<float>> m_D;
283 |     uint64_t m_total_num_words;
284 |     float m_unk_prob;
285 | };
286 | 
287 | }  // namespace tongrams
288 | 


--------------------------------------------------------------------------------
/include/stream.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <fstream>
  4 | #include <numeric>
  5 | 
  6 | #include "util.hpp"
  7 | #include "ngrams_block.hpp"
  8 | #include "front_coding.hpp"
  9 | 
 10 | #include "../external/tongrams/include/utils/util_types.hpp"
 11 | 
 12 | namespace tongrams::stream {
 13 | 
 14 | typedef ngrams_block uncompressed_block_type;
 15 | typedef fc::ngrams_block<context_order_comparator_type> compressed_block_type;
 16 | 
 17 | template <typename Block>
 18 | struct async_ngrams_file_source {
 19 |     async_ngrams_file_source() : m_file_size(0), m_handle_ptr(nullptr) {}
 20 | 
 21 |     async_ngrams_file_source(std::string const& filename)
 22 |         : m_file_size(0), m_handle_ptr(nullptr) {
 23 |         open(filename);
 24 |     }
 25 | 
 26 |     void open(std::string const& filename) {
 27 |         m_filename = filename;
 28 |         m_is.open(filename.c_str(), std::ifstream::binary);
 29 |         if (not m_is.good()) {
 30 |             throw std::runtime_error(
 31 |                 "Error in opening binary file, it may not exist or be "
 32 |                 "malformed.");
 33 |         }
 34 |         m_is.seekg(0, m_is.end);
 35 |         m_file_size = static_cast<size_t>(m_is.tellg());
 36 |         m_is.seekg(0, m_is.beg);
 37 |     }
 38 | 
 39 |     void close() {
 40 |         util::wait(m_handle_ptr);
 41 |         if (m_is.is_open()) m_is.close();
 42 |     }
 43 | 
 44 |     void close_and_remove() {
 45 |         close();
 46 |         std::remove(m_filename.c_str());
 47 |     }
 48 | 
 49 |     size_t size() const {
 50 |         return m_buffer.size();
 51 |     }
 52 | 
 53 |     bool empty() const {
 54 |         return m_buffer.empty();
 55 |     }
 56 | 
 57 |     Block* get_block() {
 58 |         if (empty()) util::wait(m_handle_ptr);
 59 |         assert(size());
 60 |         return &m_buffer.front();
 61 |     }
 62 | 
 63 |     void release_block() {
 64 |         m_buffer.front().release();
 65 |         m_buffer.pop_front();
 66 |     }
 67 | 
 68 | protected:
 69 |     std::string m_filename;
 70 |     std::ifstream m_is;
 71 |     size_t m_file_size;
 72 |     std::deque<Block> m_buffer;
 73 |     std::unique_ptr<std::thread> m_handle_ptr;
 74 | };
 75 | 
 76 | struct uncompressed_stream_generator
 77 |     : async_ngrams_file_source<uncompressed_block_type> {
 78 |     typedef uncompressed_block_type block_type;
 79 | 
 80 |     uncompressed_stream_generator() {}
 81 | 
 82 |     uncompressed_stream_generator(uint8_t ngram_order)
 83 |         : m_read_bytes(0), m_N(ngram_order), m_eos(false), m_I_time(0.0) {}
 84 | 
 85 |     void open(std::string const& filename) {
 86 |         async_ngrams_file_source::open(filename);
 87 |     }
 88 | 
 89 |     void async_fetch_next_block(size_t num_bytes) {
 90 |         util::wait(m_handle_ptr);
 91 |         m_handle_ptr =
 92 |             util::async_call(uncompressed_stream_generator::fetch, num_bytes);
 93 |     }
 94 | 
 95 |     void fetch_next_block(size_t num_bytes) {
 96 |         fetch(num_bytes);
 97 |     }
 98 | 
 99 |     double I_time() const {
100 |         return m_I_time;
101 |     }
102 | 
103 |     bool eos() const {
104 |         return m_eos;
105 |     }
106 | 
107 | private:
108 |     size_t m_read_bytes;
109 |     uint8_t m_N;
110 |     bool m_eos;
111 |     double m_I_time;
112 | 
113 |     std::function<void(size_t)> fetch = [&](size_t bytes) {
114 |         if (eos()) return;
115 |         auto s = clock_type::now();
116 |         block_type block(m_N);
117 |         if (m_read_bytes + bytes >= m_file_size) {
118 |             bytes = m_file_size - m_read_bytes;
119 |             m_eos = true;
120 |         }
121 |         m_read_bytes += bytes;
122 |         assert(bytes % block.record_size() == 0);
123 |         uint64_t num_ngrams = bytes / block.record_size();
124 |         char* begin = block.initialize_memory(bytes);
125 |         block.read_bytes(m_is, begin, bytes);
126 |         block.materialize_index(num_ngrams);
127 |         m_buffer.push_back(std::move(block));
128 |         auto e = clock_type::now();
129 |         std::chrono::duration<double> elapsed = e - s;
130 |         m_I_time += elapsed.count();
131 |     };
132 | };
133 | 
134 | struct compressed_stream_generator
135 |     : async_ngrams_file_source<compressed_block_type> {
136 |     typedef compressed_block_type block_type;
137 | 
138 |     compressed_stream_generator() {}
139 | 
140 |     compressed_stream_generator(uint8_t ngram_order)
141 |         : m_read_bytes(0)
142 |         , m_N(ngram_order)
143 |         , m_w(0)
144 |         , m_v(0)
145 |         , m_eos(false)
146 |         , m_I_time(0.0) {}
147 | 
148 |     void open(std::string const& filename) {
149 |         async_ngrams_file_source::open(filename);
150 |         essentials::load_pod(m_is, m_w);
151 |         essentials::load_pod(m_is, m_v);
152 |         m_read_bytes = sizeof(m_w) + sizeof(m_v);
153 |     }
154 | 
155 |     void async_fetch_next_block(size_t /*num_bytes*/) {
156 |         util::wait(m_handle_ptr);
157 |         m_handle_ptr = util::async_call(compressed_stream_generator::fetch);
158 |     }
159 | 
160 |     void fetch_next_block(size_t /*num_bytes*/) {
161 |         fetch();
162 |     }
163 | 
164 |     double I_time() const {
165 |         return m_I_time;
166 |     }
167 | 
168 |     bool eos() const {
169 |         return m_eos;
170 |     }
171 | 
172 | private:
173 |     size_t m_read_bytes;
174 |     uint8_t m_N;
175 |     uint8_t m_w;
176 |     uint8_t m_v;
177 |     bool m_eos;
178 |     double m_I_time;
179 | 
180 |     std::function<void(void)> fetch = [&]() {
181 |         if (eos()) return;
182 |         auto s = clock_type::now();
183 |         size_t size = 0;
184 |         essentials::load_pod(m_is, size);
185 |         m_read_bytes += sizeof(size);
186 |         assert(size > 0);
187 |         block_type block(m_N, size, m_w, m_v);
188 |         size_t bytes = fc::BLOCK_BYTES;
189 |         if (m_read_bytes + bytes >= m_file_size) {
190 |             bytes = m_file_size - m_read_bytes;
191 |             m_eos = true;
192 |         }
193 |         m_read_bytes += bytes;
194 |         block.read(m_is, bytes);
195 |         m_buffer.push_back(std::move(block));
196 |         auto e = clock_type::now();
197 |         std::chrono::duration<double> elapsed = e - s;
198 |         m_I_time += elapsed.count();
199 |     };
200 | };
201 | 
202 | struct writer {
203 |     writer(uint8_t order) : m_order(order) {}
204 | 
205 |     template <typename Iterator>
206 |     void write_block(std::ofstream& os, Iterator begin, Iterator end, size_t,
207 |                      ngrams_block_statistics const&) {
208 |         std::streamsize record_size = ngrams_block::record_size(m_order);
209 |         for (auto it = begin; it != end; ++it) {
210 |             auto ptr = *it;
211 |             os.write(reinterpret_cast<char const*>(ptr.data), record_size);
212 |         }
213 |     }
214 | 
215 | private:
216 |     uint8_t m_order;
217 | };
218 | 
219 | template <typename T = uint16_t>
220 | struct floats_vec {
221 |     typedef T value_type;
222 |     typedef typename std::vector<T>::iterator iterator;
223 | 
224 |     floats_vec(size_t n) : m_floats(n) {
225 |         m_reint.uint_value = 0;
226 |     }
227 | 
228 |     void clear() {
229 |         m_floats.clear();
230 |     }
231 | 
232 |     void reserve(size_t n) {
233 |         m_floats.reserve(n);
234 |     }
235 | 
236 |     void resize(size_t n) {
237 |         m_floats.resize(n);
238 |     }
239 | 
240 |     void push_back(float x) {
241 |         m_reint.uint_value = 0;
242 |         m_reint.float_value = x;
243 |         m_floats.push_back(m_reint.uint_value);
244 |     }
245 | 
246 |     inline float operator[](size_t i) {
247 |         assert(i < m_floats.size());
248 |         m_reint.uint_value = m_floats[i];
249 |         return m_reint.float_value;
250 |     }
251 | 
252 |     size_t size() const {
253 |         return m_floats.size();
254 |     }
255 | 
256 |     auto* data() {
257 |         return m_floats.data();
258 |     }
259 | 
260 |     void swap(floats_vec<T>& other) {
261 |         m_floats.swap(other.m_floats);
262 |         std::swap(m_reint, other.m_reint);
263 |     }
264 | 
265 |     iterator begin() {
266 |         return m_floats.begin();
267 |     }
268 | 
269 |     iterator end() {
270 |         return m_floats.end();
271 |     }
272 | 
273 | private:
274 |     bits::reinterpret<T> m_reint;
275 |     std::vector<T> m_floats;
276 | };
277 | 
278 | }  // namespace tongrams::stream
279 | 


--------------------------------------------------------------------------------
/include/tmp.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "ngrams_block.hpp"
  4 | #include "vocabulary.hpp"
  5 | 
  6 | #include <vector>
  7 | 
  8 | namespace tongrams {
  9 | namespace tmp {
 10 | 
 11 | struct statistics {
 12 |     static const word_id invalid_word_id = word_id(-1);
 13 |     static const range_id invalid_range_id = range_id(-1);
 14 | 
 15 |     struct word_statistic {
 16 |         range_id id;   // current range id to which the word belongs to
 17 |         word_id left;  // last seen word to the left of the word
 18 |     };
 19 | 
 20 |     statistics(uint64_t order)
 21 |         : t(order, std::vector<uint64_t>(5, 0))
 22 |         , r(order, std::vector<uint64_t>(5, 0))
 23 |         , current_range_id(order, 0)
 24 | 
 25 |         // order - 1 because modified counts for N-grams are the raw occurrence
 26 |         // counts in text
 27 |         , occs(order - 1,
 28 |                std::vector<occurrence>(
 29 |                    0, 0))  // num. of distinct words appearing
 30 |                            // to the left of the word (modified count)
 31 |         , stats(order - 1, std::vector<word_statistic>(
 32 |                                0, {invalid_range_id, invalid_word_id})) {}
 33 | 
 34 |     void release(uint64_t n) {
 35 |         assert(n > 0);
 36 |         stats[n - 1].resize(0, {invalid_range_id, invalid_word_id});
 37 |     }
 38 | 
 39 |     void resize(uint64_t n, size_t vocab_size) {
 40 |         assert(n > 0);
 41 |         occs[n - 1].resize(vocab_size, 0);
 42 |         stats[n - 1].resize(vocab_size, {invalid_range_id, invalid_word_id});
 43 |     }
 44 | 
 45 |     void clear() {
 46 |         for (uint64_t n = 0; n < t.size(); ++n) {
 47 |             for (uint64_t k = 0; k < 5; ++k) {
 48 |                 r[n][k] = 0;
 49 |                 t[n][k] = 0;
 50 |             }
 51 |         }
 52 |     }
 53 | 
 54 |     bool was_not_seen(uint64_t n, word_id right) {
 55 |         auto& stat = stats[n - 1][right];
 56 |         if (stat.id != current_range_id[n - 1]) {  // range changes
 57 |             stat.id = current_range_id[n - 1];
 58 |             return true;
 59 |         }
 60 |         return false;
 61 |     }
 62 | 
 63 |     bool update(uint64_t n, word_id left, word_id right) {
 64 |         assert(n > 0 and n < t.size());
 65 |         auto& stat = stats[n - 1][right];
 66 |         auto& occ = occs[n - 1][right];
 67 | 
 68 |         if (n != 1) {  // do not reset occurrence for uni-grams
 69 |             if (stat.id != current_range_id[n - 1]) {  // range changes
 70 |                 // update range id if different from the current one
 71 |                 // and reset number of occurrences
 72 |                 stat.id = current_range_id[n - 1];
 73 |                 occ = 0;
 74 |                 stat.left = invalid_word_id;
 75 |             }
 76 |         }
 77 | 
 78 |         if (stat.left != left) {
 79 |             stat.left = left;
 80 |             ++occ;
 81 |             assert(occ > 0);
 82 |             if (occ == 1) {
 83 |                 ++r[n - 1][0];
 84 |             } else if (occ > 1 and occ <= 5) {
 85 |                 ++r[n - 1][occ - 1];
 86 |                 --r[n - 1][occ - 2];
 87 |             }
 88 |             return true;
 89 |         }
 90 | 
 91 |         return false;
 92 |     }
 93 | 
 94 |     void combine(uint64_t n) {
 95 |         assert(n > 0);
 96 |         ++current_range_id[n - 1];
 97 |         for (uint64_t k = 0; k < 5; ++k) {
 98 |             uint64_t& c = r[n - 1][k];
 99 |             t[n - 1][k] += c;
100 |             c = 0;
101 |         }
102 |     }
103 | 
104 |     // void print_stats() {
105 |     //     std::cerr << "modified counts for unigrams" << std::endl;
106 |     //     for (auto x : occs[0]) {
107 |     //         std::cerr << x << std::endl;
108 |     //     }
109 |     //     for (uint64_t n = 1; n <= t.size(); ++n) {
110 |     //         for (uint64_t k = 1; k <= 5; ++k) {
111 |     //             std::cerr << "r_" << int(n) << "(" << k << ") = "
112 |     //                       << r[n - 1][k - 1] << std::endl;
113 |     //             std::cerr << "t_" << int(n) << "(" << k
114 |     //                       << ") = " << t[n - 1][k - 1] << std::endl;
115 |     //         }
116 |     //     }
117 |     // }
118 | 
119 |     std::vector<std::vector<uint64_t>>
120 |         t;  // number of n-grams, for n = 1,...,4, having modified count equal
121 |             // to 1, 2, 3, 4 and 4+ globally (i.e., all ranges)
122 |     std::vector<std::vector<uint64_t>>
123 |         r;  // number of n-grams, for n = 1,...,4, having modified count equal
124 |             // to 1, 2, 3, 4 and 4+ in a range
125 |     std::vector<range_id>
126 |         current_range_id;  // keep track of current range id to know when we
127 |                            // switch to the next range
128 |     std::vector<std::vector<occurrence>> occs;
129 |     std::vector<std::vector<word_statistic>> stats;
130 | };
131 | 
132 | struct data {
133 |     data() : vocab_builder(0) {
134 |         word_ids.set_empty_key(constants::invalid_hash);
135 |         assert(vocab_builder.size() == 0);
136 |     }
137 | 
138 |     words_map word_ids;  // map from unigrams' hashes to word ids
139 | 
140 |     vocabulary::builder vocab_builder;
141 | 
142 |     /*
143 |         Offsets at which we will write the computed probabilities
144 |         in the trie levels: these are equivalent to the counts that
145 |         counting sort would compute: we need them for 1 < n <= N.
146 |     */
147 |     std::vector<std::vector<uint64_t>> probs_offsets;
148 | 
149 |     /*
150 |         Block partitions' offsets.
151 |         Each block corresponds to a partition of the total N-grams file.
152 |     */
153 |     std::vector<std::vector<uint64_t>> blocks_offsets;
154 | };
155 | 
156 | }  // namespace tmp
157 | }  // namespace tongrams
158 | 


--------------------------------------------------------------------------------
/include/util.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "../external/tongrams/include/utils/iterators.hpp"
 4 | #include "../external/tongrams/include/utils/util_types.hpp"
 5 | 
 6 | #include <boost/iostreams/device/mapped_file.hpp>
 7 | #include <boost/filesystem.hpp>
 8 | 
 9 | #include <sys/mman.h>  // for POSIX_MADV_SEQUENTIAL and POSIX_MADV_RANDOM
10 | #include <thread>
11 | #include <fstream>
12 | 
13 | namespace tongrams::util {
14 | 
15 | void write(std::ofstream& os, byte_range br) {
16 |     os.write(reinterpret_cast<const char*>(br.first),
17 |              (br.second - br.first) * sizeof(char));
18 | }
19 | 
20 | size_t file_size(const char* filename) {
21 |     boost::filesystem::path filepath(filename);
22 |     return boost::filesystem::file_size(filepath);
23 | }
24 | 
25 | bool exists(const char* filename) {
26 |     boost::filesystem::path filepath(filename);
27 |     return boost::filesystem::exists(filepath);
28 | }
29 | 
30 | template <typename File>
31 | void check_file(File const& file) {
32 |     if (not file.is_open()) {
33 |         throw std::runtime_error(
34 |             "Error in opening file: it may not exist or be malformed.");
35 |     }
36 | }
37 | 
38 | template <typename Address>
39 | void optimize_access(Address addr, size_t len, int MODE) {
40 |     auto ret = posix_madvise((void*)addr, len, MODE);
41 |     if (ret) {
42 |         std::cerr << "Error in calling madvice: " << errno << std::endl;
43 |     }
44 | }
45 | 
46 | #define optimize_sequential_access(addr, len) \
47 |     optimize_access(addr, len, POSIX_MADV_SEQUENTIAL)
48 | #define optimize_random_access(addr, len) \
49 |     optimize_access(addr, len, POSIX_MADV_RANDOM)
50 | 
51 | template <typename File>
52 | uint8_t const* open_file_partition(File& file, std::string const& filename,
53 |                                    size_t partition_size,
54 |                                    size_t offset  // in bytes
55 | ) {
56 |     file.open(filename.c_str(), partition_size, offset);
57 |     util::check_file(file);
58 |     assert(file.size() == partition_size);
59 |     return reinterpret_cast<uint8_t const*>(file.data());
60 | }
61 | 
62 | void clean_temporaries(std::string const& tmp_dirname) {
63 |     boost::filesystem::remove_all(boost::filesystem::path(tmp_dirname.c_str()));
64 | }
65 | 
66 | template <typename Funct, typename... Args>
67 | auto async_call(Funct& f, Args&&... args) {
68 |     return std::make_unique<std::thread>(f, args...);
69 | }
70 | 
71 | void wait(std::unique_ptr<std::thread>& handle_ptr) {
72 |     if (handle_ptr and handle_ptr->joinable()) handle_ptr->join();
73 | }
74 | 
75 | }  // namespace tongrams::util
76 | 


--------------------------------------------------------------------------------
/include/util_types.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <deque>
  4 | #include <mutex>
  5 | #include <thread>
  6 | #include <chrono>
  7 | #include <vector>
  8 | #include <sparsehash/dense_hash_map>
  9 | 
 10 | #define BOOST_THREAD_VERSION 4
 11 | #define BOOST_THREAD_PROVIDES_EXECUTORS
 12 | 
 13 | #include <boost/config.hpp>
 14 | #include <boost/thread/executors/basic_thread_pool.hpp>
 15 | #include <boost/thread/experimental/parallel/v2/task_region.hpp>
 16 | 
 17 | #include "../external/tongrams/include/utils/util_types.hpp"
 18 | 
 19 | namespace tongrams {
 20 | 
 21 | typedef uint32_t ngram_id;
 22 | typedef uint32_t word_id;
 23 | typedef uint32_t range_id;
 24 | typedef uint32_t occurrence;
 25 | typedef uint64_t count_type;
 26 | typedef uint64_t iterator;
 27 | typedef std::vector<word_id> ngram_type;
 28 | typedef google::dense_hash_map<uint64_t, word_id> words_map;
 29 | typedef std::chrono::high_resolution_clock clock_type;
 30 | 
 31 | uint64_t sizeof_ngram(uint8_t order) {
 32 |     return sizeof(word_id) * order;
 33 | }
 34 | 
 35 | bool equal_to(word_id const* x, word_id const* y, size_t n) {
 36 |     return memcmp(x, y, n) == 0;
 37 | }
 38 | 
 39 | typedef boost::executors::basic_thread_pool executor_type;
 40 | typedef boost::experimental::parallel::v2::task_region_handle_gen<executor_type>
 41 |     task_region_handle;
 42 | using boost::experimental::parallel::v2::task_region;
 43 | 
 44 | struct parallel_executor {
 45 |     parallel_executor(
 46 |         size_t num_threads = std::thread::hardware_concurrency()) {
 47 |         executor.reset(new executor_type(num_threads));
 48 |     }
 49 |     std::unique_ptr<executor_type> executor;
 50 | };
 51 | 
 52 | template <typename Iterator>
 53 | struct iterator_range {
 54 |     iterator_range() {}
 55 |     iterator_range(Iterator const& begin, Iterator const& end)
 56 |         : begin(begin), end(end) {}
 57 | 
 58 |     Iterator begin;
 59 |     Iterator end;
 60 | };
 61 | 
 62 | template <typename T>
 63 | struct adaptor {
 64 |     byte_range operator()(T const& x) const {
 65 |         const uint8_t* buf = reinterpret_cast<const uint8_t*>(&x);
 66 |         return {buf, buf + sizeof(T)};
 67 |     }
 68 | };
 69 | 
 70 | struct filename_generator {
 71 |     filename_generator(std::string const& dir_name, std::string const& prefix,
 72 |                        std::string const& extension, int seed = -1)
 73 |         : m_seed(seed)
 74 |         , m_prefix(dir_name + "/.tmp." + prefix)
 75 |         , m_extension(extension) {
 76 |         next();
 77 |     }
 78 | 
 79 |     auto const& operator()() {
 80 |         return m_cur_filename;
 81 |     }
 82 | 
 83 |     auto const& prx() {
 84 |         return m_prefix;
 85 |     }
 86 | 
 87 |     auto const& ext() {
 88 |         return m_extension;
 89 |     }
 90 | 
 91 |     auto seed() const {
 92 |         return m_seed;
 93 |     }
 94 | 
 95 |     void next() {
 96 |         ++m_seed;
 97 |         m_cur_filename = prx() + std::to_string(m_seed) + "." + ext();
 98 |     }
 99 | 
100 | private:
101 |     int m_seed;
102 |     std::string m_prefix;
103 |     std::string m_extension;
104 |     std::string m_cur_filename;
105 | };
106 | 
107 | template <typename T>
108 | struct semi_sync_queue {
109 |     semi_sync_queue() {
110 |         open();
111 |     }
112 | 
113 |     void close() {
114 |         m_open = false;
115 |     }
116 | 
117 |     void open() {
118 |         m_open = true;
119 |     }
120 | 
121 |     void lock() {
122 |         m_mutex.lock();
123 |     }
124 | 
125 |     void unlock() {
126 |         m_mutex.unlock();
127 |     }
128 | 
129 |     void push(T& val) {
130 |         m_buffer.push_back(std::move(val));
131 |     }
132 | 
133 |     T& pick() {
134 |         return m_buffer.front();
135 |     }
136 | 
137 |     void pop() {
138 |         m_buffer.pop_front();
139 |     }
140 | 
141 |     bool active() const {
142 |         return m_open;
143 |     }
144 | 
145 |     bool empty() const {
146 |         return m_buffer.empty();
147 |     }
148 | 
149 |     size_t size() const {
150 |         return m_buffer.size();
151 |     }
152 | 
153 |     auto begin() {
154 |         return m_buffer.begin();
155 |     }
156 | 
157 |     auto end() {
158 |         return m_buffer.end();
159 |     }
160 | 
161 |     void release() {
162 |         std::deque<T>().swap(m_buffer);
163 |     }
164 | 
165 | private:
166 |     std::mutex m_mutex;
167 |     std::deque<T> m_buffer;
168 |     bool m_open;
169 | };
170 | 
171 | }  // namespace tongrams
172 | 


--------------------------------------------------------------------------------
/include/vocabulary.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "constants.hpp"
 4 | #include "util_types.hpp"
 5 | 
 6 | #include "../external/tongrams/include/utils/iterators.hpp"
 7 | #include "../external/tongrams/include/utils/pools.hpp"
 8 | 
 9 | namespace tongrams {
10 | 
11 | struct vocabulary {
12 |     struct builder {
13 |         builder() {}
14 | 
15 |         builder(size_t vocab_size, size_t bytes = 0)
16 |             : m_vocab_size(vocab_size) {
17 |             m_unigram_strings.reserve(bytes);
18 |             m_offsets.reserve(vocab_size + 1);
19 |             m_offsets.push_back(0);
20 |         }
21 | 
22 |         void reserve(size_t bytes) {
23 |             m_unigram_strings.reserve(bytes);
24 |         }
25 | 
26 |         void push_empty() {
27 |             m_offsets.push_back(m_unigram_strings.bytes());
28 |         }
29 | 
30 |         void push_back(byte_range br) {
31 |             m_unigram_strings.append(br);
32 |             m_offsets.push_back(m_unigram_strings.bytes());
33 |         }
34 | 
35 |         void load(std::string const& vocab_filename) {
36 |             text_lines it(vocab_filename.c_str());
37 |             for (uint64_t i = 0; i != m_vocab_size; ++i) {
38 |                 auto unigram = it.next_word();
39 |                 if (bytes::equal_bytes(unigram,
40 |                                        constants::empty_token_byte_range)) {
41 |                     push_empty();
42 |                 } else {
43 |                     push_back(unigram);
44 |                 }
45 |             }
46 |         }
47 | 
48 |         void swap(builder& other) {
49 |             std::swap(m_vocab_size, other.m_vocab_size);
50 |             m_unigram_strings.swap(other.m_unigram_strings);
51 |             m_offsets.swap(other.m_offsets);
52 |         }
53 | 
54 |         void build(vocabulary& vocab) {
55 |             vocab.m_unigram_strings.swap(m_unigram_strings);
56 |             vocab.m_unigram_strings.shrink_to_fit();
57 |             vocab.m_base_addr = vocab.m_unigram_strings.base_addr();
58 |             vocab.m_offsets.swap(m_offsets);
59 |             builder().swap(*this);
60 |         }
61 | 
62 |         size_t size() const {
63 |             return m_offsets.size() - 1;
64 |         }
65 | 
66 |     private:
67 |         size_t m_vocab_size;
68 |         strings_pool m_unigram_strings;
69 |         std::vector<size_t> m_offsets;
70 |     };
71 | 
72 |     vocabulary() {}
73 | 
74 |     byte_range operator[](word_id id) const {
75 |         assert(id < m_offsets.size() - 1);
76 |         uint64_t begin = m_offsets[id];
77 |         uint64_t end = m_offsets[id + 1];
78 |         assert(end >= begin);
79 |         if (LIKELY(begin != end)) {
80 |             return m_unigram_strings.get_bytes(m_base_addr, begin, end);
81 |         }
82 |         return constants::empty_token_byte_range;
83 |     }
84 | 
85 | private:
86 |     uint8_t const* m_base_addr;
87 |     strings_pool m_unigram_strings;
88 |     std::vector<size_t> m_offsets;
89 | };
90 | 
91 | }  // namespace tongrams
92 | 


--------------------------------------------------------------------------------
/src/count.cpp:
--------------------------------------------------------------------------------
  1 | #include <chrono>
  2 | #include <iostream>
  3 | 
  4 | #include "../external/tongrams/external/cmd_line_parser/include/parser.hpp"
  5 | 
  6 | #include "configuration.hpp"
  7 | #include "counter.hpp"
  8 | 
  9 | int main(int argc, char** argv) {
 10 |     using namespace tongrams;
 11 | 
 12 |     configuration config;
 13 |     cmd_line_parser::parser parser(argc, argv);
 14 |     parser.add("text_filename", "Input text filename.");
 15 |     parser.add("order", "Language model order. It must be > 2 and <= " +
 16 |                             std::to_string(global::max_order) + ".");
 17 |     parser.add("ram",
 18 |                "Amount to RAM in GiB. Default is " +
 19 |                    std::to_string(static_cast<uint64_t>(
 20 |                        static_cast<double>(config.RAM) / essentials::GiB)) +
 21 |                    " GiB.",
 22 |                "--ram", false);
 23 |     parser.add("tmp_dir",
 24 |                "Temporary directory used for counting. Default is directory '" +
 25 |                    constants::default_tmp_dirname + "'.",
 26 |                "--tmp", false);
 27 |     parser.add("num_threads",
 28 |                "Number of threads. Default is " +
 29 |                    std::to_string(config.num_threads) + " on this machine.",
 30 |                "--thr", false);
 31 |     parser.add("compress_blocks",
 32 |                "Compress temporary files during estimation. Default is " +
 33 |                    (config.compress_blocks ? std::string("true")
 34 |                                            : std::string("false")) +
 35 |                    ".",
 36 |                "--compress_blocks", true);
 37 |     parser.add("out",
 38 |                "Output filename. Default is '" +
 39 |                    constants::default_output_filename + "'.",
 40 |                "--out", false);
 41 |     if (!parser.parse()) return 1;
 42 | 
 43 |     config.text_filename = parser.get<std::string>("text_filename");
 44 |     if (!util::exists(config.text_filename.c_str())) {
 45 |         std::cerr << "Error: corpus file does not exist" << std::endl;
 46 |         return 1;
 47 |     }
 48 | 
 49 |     config.text_size = util::file_size(config.text_filename.c_str());
 50 |     std::cerr << "reading from '" << config.text_filename << "' ("
 51 |               << config.text_size << " bytes)" << std::endl;
 52 |     config.max_order = parser.get<uint64_t>("order");
 53 |     if (config.max_order <= 2 or config.max_order > global::max_order) {
 54 |         std::cerr << "invalid language model order" << std::endl;
 55 |         return 1;
 56 |     }
 57 | 
 58 |     size_t available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
 59 | 
 60 |     if (parser.parsed("ram")) {
 61 |         uint64_t ram =
 62 |             static_cast<uint64_t>(parser.get<double>("ram") * essentials::GiB);
 63 |         if (ram > available_ram) {
 64 |             std::cerr << "Warning: this machine has "
 65 |                       << available_ram / essentials::GiB << " GiB of RAM."
 66 |                       << std::endl;
 67 |             std::cerr << "Thus, using defalt amount of "
 68 |                       << config.RAM / essentials::GiB << " GiB" << std::endl;
 69 |         } else {
 70 |             config.RAM = ram;
 71 |         }
 72 |     }
 73 |     if (parser.parsed("tmp_dir")) {
 74 |         config.tmp_dirname = parser.get<std::string>("tmp_dir");
 75 |     }
 76 |     if (parser.parsed("num_threads")) {
 77 |         config.num_threads = parser.get<uint64_t>("num_threads");
 78 |         if (config.num_threads == 0) {
 79 |             std::cerr << "number of threads must be > 0" << std::endl;
 80 |             return 1;
 81 |         }
 82 |     }
 83 |     if (parser.parsed("compress_blocks")) {
 84 |         config.compress_blocks = parser.get<bool>("compress_blocks");
 85 |     }
 86 |     if (parser.parsed("out")) {
 87 |         config.output_filename = parser.get<std::string>("out");
 88 |     }
 89 | 
 90 |     config.vocab_tmp_subdirname = config.tmp_dirname + "/vocab";
 91 |     bool ok = essentials::create_directory(config.tmp_dirname) and
 92 |               essentials::create_directory(config.vocab_tmp_subdirname);
 93 |     if (not ok) return 1;
 94 | 
 95 |     std::cerr << "counting with " << config.RAM << "/" << available_ram
 96 |               << " bytes of RAM"
 97 |               << " (" << config.RAM * 100.0 / available_ram << "\%)\n";
 98 | 
 99 |     std::ios_base::sync_with_stdio(false);
100 |     std::cin.tie(NULL);
101 | 
102 |     counter c(config);
103 |     c.run();
104 |     c.print_stats();
105 | 
106 |     return 0;
107 | }
108 | 


--------------------------------------------------------------------------------
/src/estimate.cpp:
--------------------------------------------------------------------------------
  1 | #include <chrono>
  2 | #include <iostream>
  3 | 
  4 | #include "../external/tongrams/external/cmd_line_parser/include/parser.hpp"
  5 | 
  6 | #include "configuration.hpp"
  7 | #include "estimation.hpp"
  8 | 
  9 | int main(int argc, char** argv) {
 10 |     using namespace tongrams;
 11 | 
 12 |     configuration config;
 13 |     cmd_line_parser::parser parser(argc, argv);
 14 |     parser.add("text_filename", "Input text filename.");
 15 |     parser.add("order", "Language model order. It must be > 2 and <= " +
 16 |                             std::to_string(global::max_order) + ".");
 17 |     parser.add("ram",
 18 |                "Amount to RAM dedicated to estimation in GiB. Default is " +
 19 |                    std::to_string(static_cast<uint64_t>(
 20 |                        static_cast<double>(config.RAM) / essentials::GiB)) +
 21 |                    " GiB.",
 22 |                "--ram", false);
 23 |     parser.add(
 24 |         "tmp_dir",
 25 |         "Temporary directory used for estimation. Default is directory '" +
 26 |             constants::default_tmp_dirname + "'.",
 27 |         "--tmp", false);
 28 |     parser.add("num_threads",
 29 |                "Number of threads. Default is " +
 30 |                    std::to_string(config.num_threads) + " on this machine.",
 31 |                "--thr", false);
 32 |     parser.add("compress_blocks",
 33 |                "Compress temporary files during estimation. Default is " +
 34 |                    (config.compress_blocks ? std::string("true")
 35 |                                            : std::string("false")) +
 36 |                    ".",
 37 |                "--compress_blocks", true);
 38 |     // parser.add("p",
 39 |     //            "Probability quantization bits.",
 40 |     //            "--p", false);
 41 |     // parser.add("b",
 42 |     //            "Backoff quantization bits.",
 43 |     //            "--b", false);
 44 |     parser.add("out",
 45 |                "Output filename. Default is '" +
 46 |                    constants::default_output_filename + "'.",
 47 |                "--out", false);
 48 |     if (!parser.parse()) return 1;
 49 | 
 50 |     config.text_filename = parser.get<std::string>("text_filename");
 51 |     if (!util::exists(config.text_filename.c_str())) {
 52 |         std::cerr << "Error: corpus file does not exist" << std::endl;
 53 |         return 1;
 54 |     }
 55 | 
 56 |     config.text_size = util::file_size(config.text_filename.c_str());
 57 |     std::cerr << "reading from '" << config.text_filename << "' ("
 58 |               << config.text_size << " bytes)" << std::endl;
 59 |     config.max_order = parser.get<uint64_t>("order");
 60 |     if (config.max_order <= 2 or config.max_order > global::max_order) {
 61 |         std::cerr << "invalid language model order" << std::endl;
 62 |         return 1;
 63 |     }
 64 | 
 65 |     size_t available_ram = sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
 66 | 
 67 |     if (parser.parsed("ram")) {
 68 |         uint64_t ram =
 69 |             static_cast<uint64_t>(parser.get<double>("ram") * essentials::GiB);
 70 |         if (ram > available_ram) {
 71 |             std::cerr << "Warning: this machine has "
 72 |                       << available_ram / essentials::GiB << " GiB of RAM."
 73 |                       << std::endl;
 74 |             std::cerr << "Thus, using defalt amount of "
 75 |                       << config.RAM / essentials::GiB << " GiB" << std::endl;
 76 |         } else {
 77 |             config.RAM = ram;
 78 |         }
 79 |     }
 80 |     if (parser.parsed("tmp_dir")) {
 81 |         config.tmp_dirname = parser.get<std::string>("tmp_dir");
 82 |     }
 83 |     if (parser.parsed("num_threads")) {
 84 |         config.num_threads = parser.get<uint64_t>("num_threads");
 85 |         if (config.num_threads == 0) {
 86 |             std::cerr << "number of threads must be > 0" << std::endl;
 87 |             return 1;
 88 |         }
 89 |     }
 90 |     if (parser.parsed("compress_blocks")) {
 91 |         config.compress_blocks = parser.get<bool>("compress_blocks");
 92 |     }
 93 |     if (parser.parsed("out")) {
 94 |         config.output_filename = parser.get<std::string>("out");
 95 |     }
 96 | 
 97 |     config.vocab_tmp_subdirname = config.tmp_dirname + "/vocab";
 98 |     bool ok = essentials::create_directory(config.tmp_dirname) and
 99 |               essentials::create_directory(config.vocab_tmp_subdirname);
100 |     if (not ok) return 1;
101 | 
102 |     std::cerr << "estimating with " << config.RAM << "/" << available_ram
103 |               << " bytes of RAM"
104 |               << " (" << config.RAM * 100.0 / available_ram << "\%)\n";
105 | 
106 |     std::ios_base::sync_with_stdio(false);
107 |     std::cin.tie(NULL);
108 | 
109 |     estimation e(config);
110 |     e.run();
111 |     e.print_stats();
112 | 
113 |     return 0;
114 | }
115 | 


--------------------------------------------------------------------------------
/test_data/1Billion.1M.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jermp/tongrams_estimation/d63e781983d6774f68f21f7ccb6396d2761b2131/test_data/1Billion.1M.gz


--------------------------------------------------------------------------------