├── README.md ├── cn_spelling.py ├── data ├── SimilarPronunciation.txt ├── SimilarShape.txt ├── bcmi_data │ ├── dev_input.txt │ └── input.txt ├── cncorpus │ ├── CorpusCharacterlist.xls │ ├── CorpusWordPOSlist.xls │ ├── CorpusWordlist.xls │ ├── 现代汉语常用字表.xls │ ├── 现代汉语通用字表.xls │ └── 通用规范汉字表.xls ├── common.pkl ├── sighan │ └── processed │ │ ├── clp14-C1-training.txt │ │ ├── clp14csc_C1_training.pkl │ │ ├── sighan15-A2-Training.txt │ │ └── sighan15_A2_training.pkl ├── simp.pickle ├── simp_simplified.pickle ├── simp_sm.pickle ├── sims.pickle ├── xingjinzi.txt ├── xjz.pickle └── xjz.pkl ├── feed_kenlm.py ├── kenlm ├── .gitignore ├── BUILDING ├── CMakeLists.txt ├── COPYING ├── COPYING.3 ├── COPYING.LESSER.3 ├── Doxyfile ├── GIT_REVISION ├── LICENSE ├── MANIFEST.in ├── README.md ├── clean_query_only.sh ├── cmake │ ├── KenLMFunctions.cmake │ └── modules │ │ └── FindEigen3.cmake ├── compile_query_only.sh ├── include │ ├── lm │ │ ├── bhiksha.hh │ │ ├── binary_format.hh │ │ ├── blank.hh │ │ ├── builder │ │ │ ├── adjust_counts.hh │ │ │ ├── corpus_count.hh │ │ │ ├── discount.hh │ │ │ ├── hash_gamma.hh │ │ │ ├── header_info.hh │ │ │ ├── initial_probabilities.hh │ │ │ ├── interpolate.hh │ │ │ ├── joint_order.hh │ │ │ ├── ngram.hh │ │ │ ├── ngram_stream.hh │ │ │ ├── output.hh │ │ │ ├── pipeline.hh │ │ │ ├── print.hh │ │ │ └── sort.hh │ │ ├── config.hh │ │ ├── enumerate_vocab.hh │ │ ├── facade.hh │ │ ├── filter │ │ │ ├── arpa_io.hh │ │ │ ├── count_io.hh │ │ │ ├── format.hh │ │ │ ├── phrase.hh │ │ │ ├── thread.hh │ │ │ ├── vocab.hh │ │ │ └── wrapper.hh │ │ ├── interpolate │ │ │ └── arpa_to_stream.hh │ │ ├── left.hh │ │ ├── lm_exception.hh │ │ ├── max_order.hh │ │ ├── model.hh │ │ ├── model_type.hh │ │ ├── neural │ │ │ └── wordvecs.hh │ │ ├── ngram_query.hh │ │ ├── partial.hh │ │ ├── quantize.hh │ │ ├── read_arpa.hh │ │ ├── return.hh │ │ ├── search_hashed.hh │ │ ├── search_trie.hh │ │ ├── sizes.hh │ │ ├── state.hh │ │ ├── trie.hh │ │ ├── trie_sort.hh │ │ ├── value.hh │ │ ├── value_build.hh │ │ ├── virtual_interface.hh │ │ ├── vocab.hh │ │ ├── weights.hh │ │ ├── word_index.hh │ │ └── wrappers │ │ │ └── nplm.hh │ └── util │ │ ├── bit_packing.hh │ │ ├── ersatz_progress.hh │ │ ├── exception.hh │ │ ├── fake_ofstream.hh │ │ ├── file.hh │ │ ├── file_piece.hh │ │ ├── fixed_array.hh │ │ ├── getopt.hh │ │ ├── have.hh │ │ ├── joint_sort.hh │ │ ├── mmap.hh │ │ ├── multi_intersection.hh │ │ ├── murmur_hash.hh │ │ ├── parallel_read.hh │ │ ├── pcqueue.hh │ │ ├── pool.hh │ │ ├── probing_hash_table.hh │ │ ├── proxy_iterator.hh │ │ ├── read_compressed.hh │ │ ├── scoped.hh │ │ ├── sized_iterator.hh │ │ ├── sorted_uniform.hh │ │ ├── stream │ │ ├── block.hh │ │ ├── chain.hh │ │ ├── config.hh │ │ ├── io.hh │ │ ├── line_input.hh │ │ ├── multi_progress.hh │ │ ├── multi_stream.hh │ │ ├── sort.hh │ │ ├── stream.hh │ │ └── timer.hh │ │ ├── string_piece.hh │ │ ├── string_piece_hash.hh │ │ ├── thread_pool.hh │ │ ├── tokenize_piece.hh │ │ ├── unistd.hh │ │ └── usage.hh ├── lm │ ├── CMakeLists.txt │ ├── bhiksha.cc │ ├── bhiksha.hh │ ├── binary_format.cc │ ├── binary_format.hh │ ├── blank.hh │ ├── build_binary_main.cc │ ├── builder │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── TODO │ │ ├── adjust_counts.cc │ │ ├── adjust_counts.hh │ │ ├── adjust_counts_test.cc │ │ ├── combine_counts.hh │ │ ├── corpus_count.cc │ │ ├── corpus_count.hh │ │ ├── corpus_count_test.cc │ │ ├── count_ngrams_main.cc │ │ ├── debug_print.hh │ │ ├── discount.hh │ │ ├── dump_counts_main.cc │ │ ├── hash_gamma.hh │ │ ├── header_info.hh │ │ ├── initial_probabilities.cc │ 
│ ├── initial_probabilities.hh │ │ ├── interpolate.cc │ │ ├── interpolate.hh │ │ ├── lmplz_main.cc │ │ ├── output.cc │ │ ├── output.hh │ │ ├── payload.hh │ │ ├── pipeline.cc │ │ └── pipeline.hh │ ├── common │ │ ├── CMakeLists.txt │ │ ├── compare.hh │ │ ├── joint_order.hh │ │ ├── model_buffer.cc │ │ ├── model_buffer.hh │ │ ├── model_buffer_test.cc │ │ ├── ngram.hh │ │ ├── ngram_stream.hh │ │ ├── print.cc │ │ ├── print.hh │ │ ├── renumber.cc │ │ ├── renumber.hh │ │ ├── size_option.cc │ │ ├── size_option.hh │ │ ├── special.hh │ │ └── test_data │ │ │ ├── generate.sh │ │ │ ├── toy0.1 │ │ │ ├── toy0.2 │ │ │ ├── toy0.3 │ │ │ ├── toy0.arpa │ │ │ ├── toy0.kenlm_intermediate │ │ │ ├── toy0.vocab │ │ │ ├── toy1.1 │ │ │ ├── toy1.2 │ │ │ ├── toy1.3 │ │ │ ├── toy1.arpa │ │ │ ├── toy1.kenlm_intermediate │ │ │ └── toy1.vocab │ ├── config.cc │ ├── config.hh │ ├── enumerate_vocab.hh │ ├── facade.hh │ ├── filter │ │ ├── CMakeLists.txt │ │ ├── arpa_io.cc │ │ ├── arpa_io.hh │ │ ├── count_io.hh │ │ ├── filter_main.cc │ │ ├── format.hh │ │ ├── phrase.cc │ │ ├── phrase.hh │ │ ├── phrase_table_vocab_main.cc │ │ ├── thread.hh │ │ ├── vocab.cc │ │ ├── vocab.hh │ │ └── wrapper.hh │ ├── fragment_main.cc │ ├── interpolate │ │ ├── CMakeLists.txt │ │ ├── backoff_matrix.hh │ │ ├── backoff_reunification.cc │ │ ├── backoff_reunification.hh │ │ ├── backoff_reunification_test.cc │ │ ├── bounded_sequence_encoding.cc │ │ ├── bounded_sequence_encoding.hh │ │ ├── bounded_sequence_encoding_test.cc │ │ ├── interpolate_info.hh │ │ ├── interpolate_main.cc │ │ ├── merge_probabilities.cc │ │ ├── merge_probabilities.hh │ │ ├── merge_test │ │ │ ├── test1 │ │ │ ├── test2 │ │ │ ├── test3 │ │ │ ├── test_bad_order │ │ │ └── test_no_unk │ │ ├── merge_vocab.cc │ │ ├── merge_vocab.hh │ │ ├── merge_vocab_test.cc │ │ ├── normalize.cc │ │ ├── normalize.hh │ │ ├── normalize_test.cc │ │ ├── pipeline.cc │ │ ├── pipeline.hh │ │ ├── split_worker.cc │ │ ├── split_worker.hh │ │ ├── streaming_example_main.cc │ │ ├── tune_derivatives.cc │ │ ├── tune_derivatives.hh │ │ ├── tune_derivatives_test.cc │ │ ├── tune_instances.cc │ │ ├── tune_instances.hh │ │ ├── tune_instances_test.cc │ │ ├── tune_matrix.hh │ │ ├── tune_weights.cc │ │ ├── tune_weights.hh │ │ ├── universal_vocab.cc │ │ └── universal_vocab.hh │ ├── kenlm_benchmark_main.cc │ ├── left.hh │ ├── left_test.cc │ ├── lm_exception.cc │ ├── lm_exception.hh │ ├── max_order.hh │ ├── model.cc │ ├── model.hh │ ├── model_test.cc │ ├── model_type.hh │ ├── ngram_query.hh │ ├── partial.hh │ ├── partial_test.cc │ ├── quantize.cc │ ├── quantize.hh │ ├── query_main.cc │ ├── read_arpa.cc │ ├── read_arpa.hh │ ├── return.hh │ ├── search_hashed.cc │ ├── search_hashed.hh │ ├── search_trie.cc │ ├── search_trie.hh │ ├── sizes.cc │ ├── sizes.hh │ ├── state.hh │ ├── test.arpa │ ├── test_nounk.arpa │ ├── trie.cc │ ├── trie.hh │ ├── trie_sort.cc │ ├── trie_sort.hh │ ├── value.hh │ ├── value_build.cc │ ├── value_build.hh │ ├── virtual_interface.cc │ ├── virtual_interface.hh │ ├── vocab.cc │ ├── vocab.hh │ ├── weights.hh │ ├── word_index.hh │ └── wrappers │ │ ├── README │ │ ├── nplm.cc │ │ └── nplm.hh ├── python │ ├── _kenlm.pxd │ ├── example.py │ ├── kenlm.cpp │ └── kenlm.pyx ├── setup.py ├── util │ ├── CMakeLists.txt │ ├── bit_packing.cc │ ├── bit_packing.hh │ ├── bit_packing_test.cc │ ├── cat_compressed_main.cc │ ├── double-conversion │ │ ├── CMakeLists.txt │ │ ├── LICENSE │ │ ├── bignum-dtoa.cc │ │ ├── bignum-dtoa.h │ │ ├── bignum.cc │ │ ├── bignum.h │ │ ├── cached-powers.cc │ │ ├── cached-powers.h │ │ ├── diy-fp.cc │ │ ├── 
diy-fp.h │ │ ├── double-conversion.cc │ │ ├── double-conversion.h │ │ ├── fast-dtoa.cc │ │ ├── fast-dtoa.h │ │ ├── fixed-dtoa.cc │ │ ├── fixed-dtoa.h │ │ ├── ieee.h │ │ ├── strtod.cc │ │ ├── strtod.h │ │ └── utils.h │ ├── ersatz_progress.cc │ ├── ersatz_progress.hh │ ├── exception.cc │ ├── exception.hh │ ├── fake_ostream.hh │ ├── file.cc │ ├── file.hh │ ├── file_piece.cc │ ├── file_piece.hh │ ├── file_piece_test.cc │ ├── file_stream.hh │ ├── fixed_array.hh │ ├── float_to_string.cc │ ├── float_to_string.hh │ ├── getopt.c │ ├── getopt.hh │ ├── have.hh │ ├── integer_to_string.cc │ ├── integer_to_string.hh │ ├── integer_to_string_test.cc │ ├── joint_sort.hh │ ├── joint_sort_test.cc │ ├── mmap.cc │ ├── mmap.hh │ ├── multi_intersection.hh │ ├── multi_intersection_test.cc │ ├── murmur_hash.cc │ ├── murmur_hash.hh │ ├── parallel_read.cc │ ├── parallel_read.hh │ ├── pcqueue.hh │ ├── pcqueue_test.cc │ ├── pool.cc │ ├── pool.hh │ ├── probing_hash_table.hh │ ├── probing_hash_table_benchmark_main.cc │ ├── probing_hash_table_test.cc │ ├── proxy_iterator.hh │ ├── read_compressed.cc │ ├── read_compressed.hh │ ├── read_compressed_test.cc │ ├── scoped.cc │ ├── scoped.hh │ ├── sized_iterator.hh │ ├── sized_iterator_test.cc │ ├── sorted_uniform.hh │ ├── sorted_uniform_test.cc │ ├── spaces.cc │ ├── spaces.hh │ ├── stream │ │ ├── CMakeLists.txt │ │ ├── block.hh │ │ ├── chain.cc │ │ ├── chain.hh │ │ ├── config.hh │ │ ├── count_records.cc │ │ ├── count_records.hh │ │ ├── io.cc │ │ ├── io.hh │ │ ├── io_test.cc │ │ ├── line_input.cc │ │ ├── line_input.hh │ │ ├── multi_progress.cc │ │ ├── multi_progress.hh │ │ ├── multi_stream.hh │ │ ├── rewindable_stream.cc │ │ ├── rewindable_stream.hh │ │ ├── rewindable_stream_test.cc │ │ ├── sort.hh │ │ ├── sort_test.cc │ │ ├── stream.hh │ │ ├── stream_test.cc │ │ └── typed_stream.hh │ ├── string_piece.cc │ ├── string_piece.hh │ ├── string_piece_hash.hh │ ├── string_stream.hh │ ├── string_stream_test.cc │ ├── thread_pool.hh │ ├── tokenize_piece.hh │ ├── tokenize_piece_test.cc │ ├── usage.cc │ └── usage.hh └── windows │ ├── build_binary.vcxproj │ ├── kenlm.sln │ ├── kenlm.vcxproj │ ├── lmplz.vcxproj │ └── ngram_query.vcxproj ├── kenmodels ├── zhwiki_bigram.arpa ├── zhwiki_bigram.klm ├── zhwiki_trigram.arpa └── zhwiki_trigram.klm ├── langconv.py ├── train_kenlm.sh └── zh_wiki.py /data/cncorpus/CorpusCharacterlist.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/CorpusCharacterlist.xls -------------------------------------------------------------------------------- /data/cncorpus/CorpusWordPOSlist.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/CorpusWordPOSlist.xls -------------------------------------------------------------------------------- /data/cncorpus/CorpusWordlist.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/CorpusWordlist.xls -------------------------------------------------------------------------------- /data/cncorpus/现代汉语常用字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/现代汉语常用字表.xls 
-------------------------------------------------------------------------------- /data/cncorpus/现代汉语通用字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/现代汉语通用字表.xls -------------------------------------------------------------------------------- /data/cncorpus/通用规范汉字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/通用规范汉字表.xls -------------------------------------------------------------------------------- /data/common.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/common.pkl -------------------------------------------------------------------------------- /data/sighan/processed/clp14csc_C1_training.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/sighan/processed/clp14csc_C1_training.pkl -------------------------------------------------------------------------------- /data/sighan/processed/sighan15_A2_training.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/sighan/processed/sighan15_A2_training.pkl -------------------------------------------------------------------------------- /data/simp.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/simp.pickle -------------------------------------------------------------------------------- /data/simp_simplified.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/simp_simplified.pickle -------------------------------------------------------------------------------- /data/simp_sm.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/simp_sm.pickle -------------------------------------------------------------------------------- /data/sims.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/sims.pickle -------------------------------------------------------------------------------- /data/xjz.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/xjz.pickle -------------------------------------------------------------------------------- /data/xjz.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/xjz.pkl -------------------------------------------------------------------------------- /feed_kenlm.py: 
--------------------------------------------------------------------------------
1 | import codecs
2 | import mmap
3 | 
4 | fpath = "./data/wikipedia/cn_wiki.txt"
5 | with codecs.open(fpath, 'r', encoding='utf-8') as f:
6 |     text = f.readlines()
7 | 
8 | for line in text[:10]:
9 |     print(' '.join(line.strip()), end=' ')
10 |     # print(' '.join(line.strip().split(' / ')), end=' ')
--------------------------------------------------------------------------------
/kenlm/.gitignore:
--------------------------------------------------------------------------------
1 | util/file_piece.cc.gz
2 | *.swp
3 | *.o
4 | doc/
5 | build/
6 | ._*
7 | windows/Win32
8 | windows/x64
9 | windows/*.user
10 | windows/*.sdf
11 | windows/*.opensdf
12 | windows/*.suo
13 | CMakeFiles
14 | cmake_install.cmake
15 | CMakeCache.txt
16 | CTestTestfile.cmake
17 | DartConfiguration.tcl
18 | Makefile
--------------------------------------------------------------------------------
/kenlm/BUILDING:
--------------------------------------------------------------------------------
1 | KenLM has switched to cmake
2 | cmake .
3 | make -j 4
4 | But they recommend building out of tree
5 | mkdir -p build && cd build
6 | cmake ..
7 | make -j 4
8 | 
9 | If you only want the query code and do not care about compression (.gz, .bz2, and .xz):
10 | ./compile_query_only.sh
11 | 
12 | Windows:
13 | The windows directory has visual studio files. Note that you need to compile
14 | the kenlm project before build_binary and ngram_query projects.
--------------------------------------------------------------------------------
/kenlm/GIT_REVISION:
--------------------------------------------------------------------------------
1 | cdd794598ea15dc23a7daaf7a8cf89423c97f7e6
--------------------------------------------------------------------------------
/kenlm/LICENSE:
--------------------------------------------------------------------------------
1 | Most of the code here is licensed under the LGPL. There are exceptions that
2 | have their own licenses, listed below. See comments in those files for more
3 | details.
4 | 
5 | util/getopt.* is getopt for Windows
6 | util/murmur_hash.cc
7 | util/string_piece.hh and util/string_piece.cc
8 | util/double-conversion/LICENSE covers util/double-conversion except the build files
9 | util/file.cc contains a modified implementation of mkstemp under the LGPL
10 | util/integer_to_string.* is BSD
11 | 
12 | For the rest:
13 | 
14 | KenLM is free software: you can redistribute it and/or modify
15 | it under the terms of the GNU Lesser General Public License as published
16 | by the Free Software Foundation, either version 2.1 of the License, or
17 | (at your option) any later version.
18 | 
19 | KenLM is distributed in the hope that it will be useful,
20 | but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | GNU Lesser General Public License for more details.
23 | 
24 | You should have received a copy of the GNU Lesser General Public License 2.1
25 | along with KenLM code. If not, see <http://www.gnu.org/licenses/lgpl-2.1.html>.
26 | -------------------------------------------------------------------------------- /kenlm/MANIFEST.in: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | include setup.py 3 | include lm/*.cc 4 | include lm/*.hh 5 | include python/*.cpp 6 | include util/*.cc 7 | include util/*.hh 8 | include util/double-conversion/*.cc 9 | include util/double-conversion/*.h 10 | -------------------------------------------------------------------------------- /kenlm/clean_query_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf {lm,util,util/double-conversion}/*.o bin/{query,build_binary} 3 | -------------------------------------------------------------------------------- /kenlm/compile_query_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #This is just an example compilation. You should integrate these files into your build system. Boost jam is provided and preferred. 3 | 4 | echo You must use ./bjam if you want language model estimation, filtering, or support for compressed files \(.gz, .bz2, .xz\) 1>&2 5 | 6 | rm {lm,util}/*.o 2>/dev/null 7 | set -e 8 | 9 | CXX=${CXX:-g++} 10 | 11 | CXXFLAGS+=" -I. -O3 -DNDEBUG -DKENLM_MAX_ORDER=6" 12 | 13 | #If this fails for you, consider using bjam. 14 | if [ ${#NPLM} != 0 ]; then 15 | CXXFLAGS+=" -DHAVE_NPLM -lneuralLM -L$NPLM/src -I$NPLM/src -lboost_thread-mt -fopenmp" 16 | ADDED_PATHS="lm/wrappers/*.cc" 17 | fi 18 | echo 'Compiling with '$CXX $CXXFLAGS 19 | 20 | #Grab all cc files in these directories except those ending in test.cc or main.cc 21 | objects="" 22 | for i in util/double-conversion/*.cc util/*.cc lm/*.cc $ADDED_PATHS; do 23 | if [ "${i%test.cc}" == "$i" ] && [ "${i%main.cc}" == "$i" ]; then 24 | $CXX $CXXFLAGS -c $i -o ${i%.cc}.o 25 | objects="$objects ${i%.cc}.o" 26 | fi 27 | done 28 | 29 | mkdir -p bin 30 | if [ "$(uname)" != Darwin ]; then 31 | CXXFLAGS="$CXXFLAGS -lrt" 32 | fi 33 | $CXX lm/build_binary_main.cc $objects -o bin/build_binary $CXXFLAGS $LDFLAGS 34 | $CXX lm/query_main.cc $objects -o bin/query $CXXFLAGS $LDFLAGS 35 | -------------------------------------------------------------------------------- /kenlm/include/lm/blank.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BLANK_H 2 | #define LM_BLANK_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace lm { 10 | namespace ngram { 11 | 12 | /* Suppose "foo bar" appears with zero backoff but there is no trigram 13 | * beginning with these words. Then, when scoring "foo bar", the model could 14 | * return out_state containing "bar" or even null context if "bar" also has no 15 | * backoff and is never followed by another word. Then the backoff is set to 16 | * kNoExtensionBackoff. If the n-gram might be extended, then out_state must 17 | * contain the full n-gram, in which case kExtensionBackoff is set. In any 18 | * case, if an n-gram has non-zero backoff, the full state is returned so 19 | * backoff can be properly charged. 20 | * These differ only in sign bit because the backoff is in fact zero in either 21 | * case. 
22 |  */
23 | const float kNoExtensionBackoff = -0.0;
24 | const float kExtensionBackoff = 0.0;
25 | const uint64_t kNoExtensionQuant = 0;
26 | const uint64_t kExtensionQuant = 1;
27 | 
28 | inline void SetExtension(float &backoff) {
29 |   if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
30 | }
31 | 
32 | // This compiles down nicely.
33 | inline bool HasExtension(const float &backoff) {
34 |   typedef union { float f; uint32_t i; } UnionValue;
35 |   UnionValue compare, interpret;
36 |   compare.f = kNoExtensionBackoff;
37 |   interpret.f = backoff;
38 |   return compare.i != interpret.i;
39 | }
40 | 
41 | } // namespace ngram
42 | } // namespace lm
43 | #endif // LM_BLANK_H
44 | 
--------------------------------------------------------------------------------
/kenlm/include/lm/builder/adjust_counts.hh:
--------------------------------------------------------------------------------
1 | #ifndef LM_BUILDER_ADJUST_COUNTS_H
2 | #define LM_BUILDER_ADJUST_COUNTS_H
3 | 
4 | #include "lm/builder/discount.hh"
5 | #include "lm/lm_exception.hh"
6 | #include "util/exception.hh"
7 | 
8 | #include
9 | 
10 | #include
11 | 
12 | namespace util { namespace stream { class ChainPositions; } }
13 | 
14 | namespace lm {
15 | namespace builder {
16 | 
17 | class BadDiscountException : public util::Exception {
18 |   public:
19 |     BadDiscountException() throw();
20 |     ~BadDiscountException() throw();
21 | };
22 | 
23 | struct DiscountConfig {
24 |   // Overrides discounts for orders [1,overwrite.size()].
25 |   std::vector<Discount> overwrite;
26 |   // If discounting fails for an order, copy them from here.
27 |   Discount fallback;
28 |   // What to do when discounts are out of range or would trigger division by
29 |   // zero. If it does something other than THROW_UP, use fallback.
30 |   WarningAction bad_action;
31 | };
32 | 
33 | /* Compute adjusted counts.
34 |  * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
35 |  * Output: [1,N]-grams with adjusted counts.
36 |  * [1,N)-grams are in suffix order
37 |  * N-grams are in undefined order (they're going to be sorted anyway).
38 |  */
39 | class AdjustCounts {
40 |   public:
41 |     // counts: output
42 |     // counts_pruned: output
43 |     // discounts: mostly output. If the input already has entries, they will be kept.
44 |     // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
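// An editorial illustration (not part of the original header) of what
// "adjusted" means above, per the lmplz algorithm: for every order below N,
// an n-gram's adjusted count is the number of distinct words that extend it
// to the left, not its raw frequency. If "San Francisco" is the only bigram
// ending in "Francisco", the adjusted unigram count of "Francisco" is 1 no
// matter how often it occurs; raw counts are kept only at the highest order N.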
45 |     AdjustCounts(
46 |         const std::vector<uint64_t> &prune_thresholds,
47 |         std::vector<uint64_t> &counts,
48 |         std::vector<uint64_t> &counts_pruned,
49 |         const std::vector<bool> &prune_words,
50 |         const DiscountConfig &discount_config,
51 |         std::vector<Discount> &discounts)
52 |       : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
53 |         prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
54 |     {}
55 | 
56 |     void Run(const util::stream::ChainPositions &positions);
57 | 
58 |   private:
59 |     const std::vector<uint64_t> &prune_thresholds_;
60 |     std::vector<uint64_t> &counts_;
61 |     std::vector<uint64_t> &counts_pruned_;
62 |     const std::vector<bool> &prune_words_;
63 | 
64 |     DiscountConfig discount_config_;
65 |     std::vector<Discount> &discounts_;
66 | };
67 | 
68 | } // namespace builder
69 | } // namespace lm
70 | 
71 | #endif // LM_BUILDER_ADJUST_COUNTS_H
72 | 
73 | 
--------------------------------------------------------------------------------
/kenlm/include/lm/builder/corpus_count.hh:
--------------------------------------------------------------------------------
1 | #ifndef LM_BUILDER_CORPUS_COUNT_H
2 | #define LM_BUILDER_CORPUS_COUNT_H
3 | 
4 | #include "lm/lm_exception.hh"
5 | #include "lm/word_index.hh"
6 | #include "util/scoped.hh"
7 | 
8 | #include
9 | #include
10 | #include
11 | #include
12 | 
13 | namespace util {
14 | class FilePiece;
15 | namespace stream {
16 | class ChainPosition;
17 | } // namespace stream
18 | } // namespace util
19 | 
20 | namespace lm {
21 | namespace builder {
22 | 
23 | class CorpusCount {
24 |   public:
25 |     // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size
26 |     static float DedupeMultiplier(std::size_t order);
27 | 
28 |     // How much memory vocabulary will use based on estimated size of the vocab.
29 |     static std::size_t VocabUsage(std::size_t vocab_estimate);
30 | 
31 |     // token_count: out.
32 |     // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
33 | CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); 34 | 35 | void Run(const util::stream::ChainPosition &position); 36 | 37 | private: 38 | util::FilePiece &from_; 39 | int vocab_write_; 40 | uint64_t &token_count_; 41 | WordIndex &type_count_; 42 | std::vector& prune_words_; 43 | const std::string& prune_vocab_filename_; 44 | 45 | std::size_t dedupe_mem_size_; 46 | util::scoped_malloc dedupe_mem_; 47 | 48 | WarningAction disallowed_symbol_action_; 49 | }; 50 | 51 | } // namespace builder 52 | } // namespace lm 53 | #endif // LM_BUILDER_CORPUS_COUNT_H 54 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/discount.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_DISCOUNT_H 2 | #define LM_BUILDER_DISCOUNT_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace lm { 9 | namespace builder { 10 | 11 | struct Discount { 12 | float amount[4]; 13 | 14 | float Get(uint64_t count) const { 15 | return amount[std::min(count, 3)]; 16 | } 17 | 18 | float Apply(uint64_t count) const { 19 | return static_cast(count) - Get(count); 20 | } 21 | }; 22 | 23 | } // namespace builder 24 | } // namespace lm 25 | 26 | #endif // LM_BUILDER_DISCOUNT_H 27 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/hash_gamma.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HASH_GAMMA__ 2 | #define LM_BUILDER_HASH_GAMMA__ 3 | 4 | #include 5 | 6 | namespace lm { namespace builder { 7 | 8 | #pragma pack(push) 9 | #pragma pack(4) 10 | 11 | struct HashGamma { 12 | uint64_t hash_value; 13 | float gamma; 14 | }; 15 | 16 | #pragma pack(pop) 17 | 18 | }} // namespaces 19 | #endif // LM_BUILDER_HASH_GAMMA__ 20 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/header_info.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HEADER_INFO_H 2 | #define LM_BUILDER_HEADER_INFO_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Some configuration info that is used to add 9 | // comments to the beginning of an ARPA file 10 | struct HeaderInfo { 11 | std::string input_file; 12 | uint64_t token_count; 13 | std::vector counts_pruned; 14 | 15 | HeaderInfo() {} 16 | 17 | HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector &counts_pruned_in) 18 | : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} 19 | 20 | // TODO: Add smoothing type 21 | // TODO: More info if multiple models were interpolated 22 | }; 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/initial_probabilities.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INITIAL_PROBABILITIES_H 2 | #define LM_BUILDER_INITIAL_PROBABILITIES_H 3 | 4 | #include "lm/builder/discount.hh" 5 | #include "util/stream/config.hh" 6 | 7 | #include 8 | 9 | namespace util { namespace stream { class Chains; } } 10 | 11 | namespace lm { 12 | namespace builder { 13 | 14 | struct InitialProbabilitiesConfig { 15 | // These should be small buffers to keep the adder from getting too 
far ahead 16 | util::stream::ChainConfig adder_in; 17 | util::stream::ChainConfig adder_out; 18 | // SRILM doesn't normally interpolate unigrams. 19 | bool interpolate_unigrams; 20 | }; 21 | 22 | /* Compute initial (uninterpolated) probabilities 23 | * primary: the normal chain of n-grams. Incoming is context sorted adjusted 24 | * counts. Outgoing has uninterpolated probabilities for use by Interpolate. 25 | * second_in: a second copy of the primary input. Discard the output. 26 | * gamma_out: Computed gamma values are output on these chains in suffix order. 27 | * The values are bare floats and should be buffered for interpolation to 28 | * use. 29 | */ 30 | void InitialProbabilities( 31 | const InitialProbabilitiesConfig &config, 32 | const std::vector &discounts, 33 | util::stream::Chains &primary, 34 | util::stream::Chains &second_in, 35 | util::stream::Chains &gamma_out, 36 | const std::vector &prune_thresholds, 37 | bool prune_vocab); 38 | 39 | } // namespace builder 40 | } // namespace lm 41 | 42 | #endif // LM_BUILDER_INITIAL_PROBABILITIES_H 43 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/interpolate.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INTERPOLATE_H 2 | #define LM_BUILDER_INTERPOLATE_H 3 | 4 | #include "util/stream/multi_stream.hh" 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace lm { namespace builder { 11 | 12 | /* Interpolate step. 13 | * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from 14 | * InitialProbabilities. 15 | * Output: suffix sorted n-grams with complete probability 16 | */ 17 | class Interpolate { 18 | public: 19 | // Normally vocab_size is the unigram count-1 (since p() = 0) but might 20 | // be larger when the user specifies a consistent vocabulary size. 21 | explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector &prune_thresholds, bool prune_vocab, bool output_q_); 22 | 23 | void Run(const util::stream::ChainPositions &positions); 24 | 25 | private: 26 | float uniform_prob_; 27 | util::stream::ChainPositions backoffs_; 28 | const std::vector prune_thresholds_; 29 | bool prune_vocab_; 30 | bool output_q_; 31 | }; 32 | 33 | }} // namespaces 34 | #endif // LM_BUILDER_INTERPOLATE_H 35 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/joint_order.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_JOINT_ORDER_H 2 | #define LM_BUILDER_JOINT_ORDER_H 3 | 4 | #include "lm/builder/ngram_stream.hh" 5 | #include "lm/lm_exception.hh" 6 | 7 | #ifdef DEBUG 8 | #include "util/fixed_array.hh" 9 | #include 10 | #endif 11 | 12 | #include 13 | 14 | namespace lm { namespace builder { 15 | 16 | template void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { 17 | // Allow matching to reference streams[-1]. 18 | NGramStreams streams_with_dummy; 19 | streams_with_dummy.InitWithDummy(positions); 20 | NGramStream *streams = streams_with_dummy.begin() + 1; 21 | 22 | unsigned int order; 23 | for (order = 0; order < positions.size() && streams[order]; ++order) {} 24 | assert(order); // should always have . 25 | 26 | // Debugging only: call comparison function to sanity check order. 
27 | #ifdef DEBUG 28 | util::FixedArray less_compare(order); 29 | for (unsigned i = 0; i < order; ++i) 30 | less_compare.push_back(i + 1); 31 | #endif // DEBUG 32 | 33 | unsigned int current = 0; 34 | while (true) { 35 | // Does the context match the lower one? 36 | if (!memcmp(streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { 37 | callback.Enter(current, *streams[current]); 38 | // Transition to looking for extensions. 39 | if (++current < order) continue; 40 | } 41 | #ifdef DEBUG 42 | // match_check[current - 1] matches current-grams 43 | // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). 44 | else if (!less_compare[current - 1](streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { 45 | std::cerr << "Stream out of order detected" << std::endl; 46 | abort(); 47 | } 48 | #endif // DEBUG 49 | // No extension left. 50 | while(true) { 51 | assert(current > 0); 52 | --current; 53 | callback.Exit(current, *streams[current]); 54 | 55 | if (++streams[current]) break; 56 | 57 | UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); 58 | 59 | order = current; 60 | if (!order) return; 61 | } 62 | } 63 | } 64 | 65 | }} // namespaces 66 | 67 | #endif // LM_BUILDER_JOINT_ORDER_H 68 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/ngram_stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_NGRAM_STREAM_H 2 | #define LM_BUILDER_NGRAM_STREAM_H 3 | 4 | #include "lm/builder/ngram.hh" 5 | #include "util/stream/chain.hh" 6 | #include "util/stream/multi_stream.hh" 7 | #include "util/stream/stream.hh" 8 | 9 | #include 10 | 11 | namespace lm { namespace builder { 12 | 13 | class NGramStream { 14 | public: 15 | NGramStream() : gram_(NULL, 0) {} 16 | 17 | NGramStream(const util::stream::ChainPosition &position) : gram_(NULL, 0) { 18 | Init(position); 19 | } 20 | 21 | void Init(const util::stream::ChainPosition &position) { 22 | stream_.Init(position); 23 | gram_ = NGram(stream_.Get(), NGram::OrderFromSize(position.GetChain().EntrySize())); 24 | } 25 | 26 | NGram &operator*() { return gram_; } 27 | const NGram &operator*() const { return gram_; } 28 | 29 | NGram *operator->() { return &gram_; } 30 | const NGram *operator->() const { return &gram_; } 31 | 32 | void *Get() { return stream_.Get(); } 33 | const void *Get() const { return stream_.Get(); } 34 | 35 | operator bool() const { return stream_; } 36 | bool operator!() const { return !stream_; } 37 | void Poison() { stream_.Poison(); } 38 | 39 | NGramStream &operator++() { 40 | ++stream_; 41 | gram_.ReBase(stream_.Get()); 42 | return *this; 43 | } 44 | 45 | private: 46 | NGram gram_; 47 | util::stream::Stream stream_; 48 | }; 49 | 50 | inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream &str) { 51 | str.Init(chain.Add()); 52 | return chain; 53 | } 54 | 55 | typedef util::stream::GenericStreams NGramStreams; 56 | 57 | }} // namespaces 58 | #endif // LM_BUILDER_NGRAM_STREAM_H 59 | -------------------------------------------------------------------------------- /kenlm/include/lm/enumerate_vocab.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_ENUMERATE_VOCAB_H 2 | #define LM_ENUMERATE_VOCAB_H 3 | 4 | #include 
"lm/word_index.hh" 5 | #include "util/string_piece.hh" 6 | 7 | namespace lm { 8 | 9 | /* If you need the actual strings in the vocabulary, inherit from this class 10 | * and implement Add. Then put a pointer in Config.enumerate_vocab; it does 11 | * not take ownership. Add is called once per vocab word. index starts at 0 12 | * and increases by 1 each time. This is only used by the Model constructor; 13 | * the pointer is not retained by the class. 14 | */ 15 | class EnumerateVocab { 16 | public: 17 | virtual ~EnumerateVocab() {} 18 | 19 | virtual void Add(WordIndex index, const StringPiece &str) = 0; 20 | 21 | protected: 22 | EnumerateVocab() {} 23 | }; 24 | 25 | } // namespace lm 26 | 27 | #endif // LM_ENUMERATE_VOCAB_H 28 | 29 | -------------------------------------------------------------------------------- /kenlm/include/lm/filter/wrapper.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_FILTER_WRAPPER_H 2 | #define LM_FILTER_WRAPPER_H 3 | 4 | #include "util/string_piece.hh" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace lm { 11 | 12 | // Provide a single-output filter with the same interface as a 13 | // multiple-output filter so clients code against one interface. 14 | template class BinaryFilter { 15 | public: 16 | // Binary modes are just references (and a set) and it makes the API cleaner to copy them. 17 | explicit BinaryFilter(Binary binary) : binary_(binary) {} 18 | 19 | template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { 20 | if (binary_.PassNGram(begin, end)) 21 | output.AddNGram(line); 22 | } 23 | 24 | template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { 25 | AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); 26 | } 27 | 28 | void Flush() const {} 29 | 30 | private: 31 | Binary binary_; 32 | }; 33 | 34 | // Wrap another filter to pay attention only to context words 35 | template class ContextFilter { 36 | public: 37 | typedef FilterT Filter; 38 | 39 | explicit ContextFilter(Filter &backend) : backend_(backend) {} 40 | 41 | template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { 42 | // Find beginning of string or last space. 43 | const char *last_space; 44 | for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {} 45 | backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output); 46 | } 47 | 48 | void Flush() const {} 49 | 50 | private: 51 | Filter backend_; 52 | }; 53 | 54 | } // namespace lm 55 | 56 | #endif // LM_FILTER_WRAPPER_H 57 | -------------------------------------------------------------------------------- /kenlm/include/lm/interpolate/arpa_to_stream.hh: -------------------------------------------------------------------------------- 1 | #include "lm/read_arpa.hh" 2 | #include "util/file_piece.hh" 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace util { namespace stream { class ChainPositions; } } 9 | 10 | namespace lm { 11 | 12 | namespace ngram { 13 | template class GrowableVocab; 14 | class WriteUniqueWords; 15 | } // namespace ngram 16 | 17 | namespace interpolate { 18 | 19 | class ARPAToStream { 20 | public: 21 | // Takes ownership of fd. 
22 | explicit ARPAToStream(int fd, ngram::GrowableVocab &vocab); 23 | 24 | std::size_t Order() const { return counts_.size(); } 25 | 26 | const std::vector &Counts() const { return counts_; } 27 | 28 | void Run(const util::stream::ChainPositions &positions); 29 | 30 | private: 31 | util::FilePiece in_; 32 | 33 | std::vector counts_; 34 | 35 | ngram::GrowableVocab &vocab_; 36 | }; 37 | 38 | }} // namespaces 39 | -------------------------------------------------------------------------------- /kenlm/include/lm/lm_exception.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_LM_EXCEPTION_H 2 | #define LM_LM_EXCEPTION_H 3 | 4 | // Named to avoid conflict with util/exception.hh. 5 | 6 | #include "util/exception.hh" 7 | #include "util/string_piece.hh" 8 | 9 | #include 10 | #include 11 | 12 | namespace lm { 13 | 14 | typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction; 15 | 16 | class ConfigException : public util::Exception { 17 | public: 18 | ConfigException() throw(); 19 | ~ConfigException() throw(); 20 | }; 21 | 22 | class LoadException : public util::Exception { 23 | public: 24 | virtual ~LoadException() throw(); 25 | 26 | protected: 27 | LoadException() throw(); 28 | }; 29 | 30 | class FormatLoadException : public LoadException { 31 | public: 32 | FormatLoadException() throw(); 33 | ~FormatLoadException() throw(); 34 | }; 35 | 36 | class VocabLoadException : public LoadException { 37 | public: 38 | virtual ~VocabLoadException() throw(); 39 | VocabLoadException() throw(); 40 | }; 41 | 42 | class SpecialWordMissingException : public VocabLoadException { 43 | public: 44 | explicit SpecialWordMissingException() throw(); 45 | ~SpecialWordMissingException() throw(); 46 | }; 47 | 48 | } // namespace lm 49 | 50 | #endif // LM_LM_EXCEPTION 51 | -------------------------------------------------------------------------------- /kenlm/include/lm/max_order.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_MAX_ORDER_H 2 | #define LM_MAX_ORDER_H 3 | /* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. 4 | * If not, this is the default maximum order. 5 | * Having this limit means that State can be 6 | * (kMaxOrder - 1) * sizeof(float) bytes instead of 7 | * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead 8 | */ 9 | #ifndef KENLM_ORDER_MESSAGE 10 | #define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." 11 | #endif 12 | 13 | #endif // LM_MAX_ORDER_H 14 | -------------------------------------------------------------------------------- /kenlm/include/lm/model_type.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_MODEL_TYPE_H 2 | #define LM_MODEL_TYPE_H 3 | 4 | namespace lm { 5 | namespace ngram { 6 | 7 | /* Not the best numbering system, but it grew this way for historical reasons 8 | * and I want to preserve existing binary files. */ 9 | typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; 10 | 11 | // Historical names. 
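// (Editorial note, not in the original header: the aliases just below keep
//  those historical names, while kQuantAdd and kArrayAdd capture the enum's
//  layout so variants can be derived arithmetically, e.g.
//  TRIE + kQuantAdd == QUANT_TRIE (2 + 1 == 3) and
//  TRIE + kQuantAdd + kArrayAdd == QUANT_ARRAY_TRIE (2 + 1 + 2 == 5).)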
12 | const ModelType HASH_PROBING = PROBING; 13 | const ModelType TRIE_SORTED = TRIE; 14 | const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; 15 | const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE; 16 | const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE; 17 | 18 | const static ModelType kQuantAdd = static_cast(QUANT_TRIE - TRIE); 19 | const static ModelType kArrayAdd = static_cast(ARRAY_TRIE - TRIE); 20 | 21 | } // namespace ngram 22 | } // namespace lm 23 | #endif // LM_MODEL_TYPE_H 24 | -------------------------------------------------------------------------------- /kenlm/include/lm/neural/wordvecs.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_NEURAL_WORDVECS_H 2 | #define LM_NEURAL_WORDVECS_H 3 | 4 | #include "util/scoped.hh" 5 | #include "lm/vocab.hh" 6 | 7 | #include 8 | 9 | namespace util { class FilePiece; } 10 | 11 | namespace lm { 12 | namespace neural { 13 | 14 | class WordVecs { 15 | public: 16 | // Columns of the matrix are word vectors. The column index is the word. 17 | typedef Eigen::Matrix Storage; 18 | 19 | /* The file should begin with a line stating the number of word vectors and 20 | * the length of the vectors. Then it's followed by lines containing a 21 | * word followed by floating-point values. 22 | */ 23 | explicit WordVecs(util::FilePiece &in); 24 | 25 | const Storage &Vectors() const { return vecs_; } 26 | 27 | WordIndex Index(StringPiece str) const { return vocab_.Index(str); } 28 | 29 | private: 30 | util::scoped_malloc vocab_backing_; 31 | ngram::ProbingVocabulary vocab_; 32 | 33 | Storage vecs_; 34 | }; 35 | 36 | }} // namespaces 37 | 38 | #endif // LM_NEURAL_WORDVECS_H 39 | -------------------------------------------------------------------------------- /kenlm/include/lm/return.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_RETURN_H 2 | #define LM_RETURN_H 3 | 4 | #include 5 | 6 | namespace lm { 7 | /* Structure returned by scoring routines. */ 8 | struct FullScoreReturn { 9 | // log10 probability 10 | float prob; 11 | 12 | /* The length of n-gram matched. Do not use this for recombination. 13 | * Consider a model containing only the following n-grams: 14 | * -1 foo 15 | * -3.14 bar 16 | * -2.718 baz -5 17 | * -6 foo bar 18 | * 19 | * If you score ``bar'' then ngram_length is 1 and recombination state is the 20 | * empty string because bar has zero backoff and does not extend to the 21 | * right. 22 | * If you score ``foo'' then ngram_length is 1 and recombination state is 23 | * ``foo''. 24 | * 25 | * Ideally, keep output states around and compare them. Failing that, 26 | * get out_state.ValidLength() and use that length for recombination. 27 | */ 28 | unsigned char ngram_length; 29 | 30 | /* Left extension information. If independent_left is set, then prob is 31 | * independent of words to the left (up to additional backoff). Otherwise, 32 | * extend_left indicates how to efficiently extend further to the left. 33 | */ 34 | bool independent_left; 35 | uint64_t extend_left; // Defined only if independent_left 36 | 37 | // Rest cost for extension to the left. 
38 | float rest; 39 | }; 40 | 41 | } // namespace lm 42 | #endif // LM_RETURN_H 43 | -------------------------------------------------------------------------------- /kenlm/include/lm/sizes.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_SIZES_H 2 | #define LM_SIZES_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace lm { namespace ngram { 9 | 10 | struct Config; 11 | 12 | void ShowSizes(const std::vector &counts, const lm::ngram::Config &config); 13 | void ShowSizes(const std::vector &counts); 14 | void ShowSizes(const char *file, const lm::ngram::Config &config); 15 | 16 | }} // namespaces 17 | #endif // LM_SIZES_H 18 | -------------------------------------------------------------------------------- /kenlm/include/lm/weights.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_WEIGHTS_H 2 | #define LM_WEIGHTS_H 3 | 4 | // Weights for n-grams. Probability and possibly a backoff. 5 | 6 | namespace lm { 7 | struct Prob { 8 | float prob; 9 | }; 10 | // No inheritance so this will be a POD. 11 | struct ProbBackoff { 12 | float prob; 13 | float backoff; 14 | }; 15 | struct RestWeights { 16 | float prob; 17 | float backoff; 18 | float rest; 19 | }; 20 | 21 | } // namespace lm 22 | #endif // LM_WEIGHTS_H 23 | -------------------------------------------------------------------------------- /kenlm/include/lm/word_index.hh: -------------------------------------------------------------------------------- 1 | // Separate header because this is used often. 2 | #ifndef LM_WORD_INDEX_H 3 | #define LM_WORD_INDEX_H 4 | 5 | #include 6 | 7 | namespace lm { 8 | typedef unsigned int WordIndex; 9 | const WordIndex kMaxWordIndex = UINT_MAX; 10 | } // namespace lm 11 | 12 | typedef lm::WordIndex LMWordIndex; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /kenlm/include/lm/wrappers/nplm.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_WRAPPERS_NPLM_H 2 | #define LM_WRAPPERS_NPLM_H 3 | 4 | #include "lm/facade.hh" 5 | #include "lm/max_order.hh" 6 | #include "util/string_piece.hh" 7 | 8 | #include 9 | #include 10 | 11 | /* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang 12 | * and Victoria Fossum." 13 | * http://nlg.isi.edu/software/nplm/ 14 | */ 15 | 16 | namespace nplm { 17 | class vocabulary; 18 | class neuralLM; 19 | } // namespace nplm 20 | 21 | namespace lm { 22 | namespace np { 23 | 24 | class Vocabulary : public base::Vocabulary { 25 | public: 26 | Vocabulary(const nplm::vocabulary &vocab); 27 | 28 | ~Vocabulary(); 29 | 30 | WordIndex Index(const std::string &str) const; 31 | 32 | // TODO: lobby them to support StringPiece 33 | WordIndex Index(const StringPiece &str) const { 34 | return Index(std::string(str.data(), str.size())); 35 | } 36 | 37 | lm::WordIndex NullWord() const { return null_word_; } 38 | 39 | private: 40 | const nplm::vocabulary &vocab_; 41 | 42 | const lm::WordIndex null_word_; 43 | }; 44 | 45 | // Sorry for imposing my limitations on your code. 46 | #define NPLM_MAX_ORDER 7 47 | 48 | struct State { 49 | WordIndex words[NPLM_MAX_ORDER - 1]; 50 | }; 51 | 52 | class Model : public lm::base::ModelFacade { 53 | private: 54 | typedef lm::base::ModelFacade P; 55 | 56 | public: 57 | // Does this look like an NPLM? 
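// (Editorial sketch, assuming only the declarations below: a caller can
//  probe the format before committing to a model type, e.g.
//    if (lm::np::Model::Recognize(path)) { lm::np::Model model(path); }
//  and fall back to an ARPA/KenLM loader otherwise.)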
58 | static bool Recognize(const std::string &file); 59 | 60 | explicit Model(const std::string &file, std::size_t cache_size = 1 << 20); 61 | 62 | ~Model(); 63 | 64 | FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const; 65 | 66 | FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; 67 | 68 | private: 69 | boost::scoped_ptr base_instance_; 70 | 71 | mutable boost::thread_specific_ptr backend_; 72 | 73 | Vocabulary vocab_; 74 | 75 | lm::WordIndex null_word_; 76 | 77 | const std::size_t cache_size_; 78 | }; 79 | 80 | } // namespace np 81 | } // namespace lm 82 | 83 | #endif // LM_WRAPPERS_NPLM_H 84 | -------------------------------------------------------------------------------- /kenlm/include/util/ersatz_progress.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_ERSATZ_PROGRESS_H 2 | #define UTIL_ERSATZ_PROGRESS_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | // Ersatz version of boost::progress so core language model doesn't depend on 10 | // boost. Also adds option to print nothing. 11 | 12 | namespace util { 13 | 14 | extern const char kProgressBanner[]; 15 | 16 | class ErsatzProgress { 17 | public: 18 | // No output. 19 | ErsatzProgress(); 20 | 21 | // Null means no output. The null value is useful for passing along the ostream pointer from another caller. 22 | explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); 23 | 24 | ~ErsatzProgress(); 25 | 26 | ErsatzProgress &operator++() { 27 | if (++current_ >= next_) Milestone(); 28 | return *this; 29 | } 30 | 31 | ErsatzProgress &operator+=(uint64_t amount) { 32 | if ((current_ += amount) >= next_) Milestone(); 33 | return *this; 34 | } 35 | 36 | void Set(uint64_t to) { 37 | if ((current_ = to) >= next_) Milestone(); 38 | } 39 | 40 | void Finished() { 41 | Set(complete_); 42 | } 43 | 44 | private: 45 | void Milestone(); 46 | 47 | uint64_t current_, next_, complete_; 48 | unsigned char stones_written_; 49 | std::ostream *out_; 50 | 51 | // noncopyable 52 | ErsatzProgress(const ErsatzProgress &other); 53 | ErsatzProgress &operator=(const ErsatzProgress &other); 54 | }; 55 | 56 | } // namespace util 57 | 58 | #endif // UTIL_ERSATZ_PROGRESS_H 59 | -------------------------------------------------------------------------------- /kenlm/include/util/getopt.hh: -------------------------------------------------------------------------------- 1 | /* 2 | POSIX getopt for Windows 3 | 4 | AT&T Public License 5 | 6 | Code given out at the 1985 UNIFORUM conference in Dallas. 7 | */ 8 | 9 | #ifdef __GNUC__ 10 | #include 11 | #endif 12 | #ifndef __GNUC__ 13 | 14 | #ifndef UTIL_GETOPT_H 15 | #define UTIL_GETOPT_H 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | extern int opterr; 22 | extern int optind; 23 | extern int optopt; 24 | extern char *optarg; 25 | extern int getopt(int argc, char **argv, char *opts); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif /* UTIL_GETOPT_H */ 32 | #endif /* __GNUC__ */ 33 | 34 | -------------------------------------------------------------------------------- /kenlm/include/util/have.hh: -------------------------------------------------------------------------------- 1 | /* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. 
*/ 2 | #ifndef UTIL_HAVE_H 3 | #define UTIL_HAVE_H 4 | 5 | #ifdef HAVE_CONFIG_H 6 | #include "config.h" 7 | #endif 8 | 9 | #ifndef HAVE_ICU 10 | //#define HAVE_ICU 11 | #endif 12 | 13 | #endif // UTIL_HAVE_H 14 | -------------------------------------------------------------------------------- /kenlm/include/util/murmur_hash.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_MURMUR_HASH_H 2 | #define UTIL_MURMUR_HASH_H 3 | #include 4 | #include 5 | 6 | namespace util { 7 | 8 | // 64-bit machine version 9 | uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); 10 | // 32-bit machine version (not the same function as above) 11 | uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); 12 | // Use the version for this arch. Because the values differ across 13 | // architectures, really only use it for in-memory structures. 14 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); 15 | 16 | } // namespace util 17 | 18 | #endif // UTIL_MURMUR_HASH_H 19 | -------------------------------------------------------------------------------- /kenlm/include/util/parallel_read.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_PARALLEL_READ__ 2 | #define UTIL_PARALLEL_READ__ 3 | 4 | /* Read pieces of a file in parallel. This has a very specific use case: 5 | * reading files from Lustre is CPU bound so multiple threads actually 6 | * increases throughput. Speed matters when an LM takes a terabyte. 7 | */ 8 | 9 | #include 10 | #include 11 | 12 | namespace util { 13 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset); 14 | } // namespace util 15 | 16 | #endif // UTIL_PARALLEL_READ__ 17 | -------------------------------------------------------------------------------- /kenlm/include/util/pool.hh: -------------------------------------------------------------------------------- 1 | // Very simple pool. It can only allocate memory. And all of the memory it 2 | // allocates must be freed at the same time. 
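// A minimal usage sketch (editorial, based only on the interface declared
// below): allocation is a cheap pointer bump and nothing is freed
// individually.
//
//   util::Pool pool;
//   char *buf = static_cast<char *>(pool.Allocate(1024));
//   float *f = static_cast<float *>(pool.Allocate(4 * sizeof(float)));
//   // ... use buf and f; there is no per-allocation free ...
//   pool.FreeAll();  // the only way memory is returned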
3 | 
4 | #ifndef UTIL_POOL_H
5 | #define UTIL_POOL_H
6 | 
7 | #include
8 | 
9 | #include
10 | 
11 | namespace util {
12 | 
13 | class Pool {
14 |   public:
15 |     Pool();
16 | 
17 |     ~Pool();
18 | 
19 |     void *Allocate(std::size_t size) {
20 |       void *ret = current_;
21 |       current_ += size;
22 |       if (current_ < current_end_) {
23 |         return ret;
24 |       } else {
25 |         return More(size);
26 |       }
27 |     }
28 | 
29 |     void FreeAll();
30 | 
31 |   private:
32 |     void *More(std::size_t size);
33 | 
34 |     std::vector<void *> free_list_;
35 | 
36 |     uint8_t *current_, *current_end_;
37 | 
38 |     // no copying
39 |     Pool(const Pool &);
40 |     Pool &operator=(const Pool &);
41 | };
42 | 
43 | } // namespace util
44 | 
45 | #endif // UTIL_POOL_H
46 | 
--------------------------------------------------------------------------------
/kenlm/include/util/read_compressed.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_READ_COMPRESSED_H
2 | #define UTIL_READ_COMPRESSED_H
3 | 
4 | #include "util/exception.hh"
5 | #include "util/scoped.hh"
6 | 
7 | #include
8 | 
9 | #include
10 | 
11 | namespace util {
12 | 
13 | class CompressedException : public Exception {
14 |   public:
15 |     CompressedException() throw();
16 |     virtual ~CompressedException() throw();
17 | };
18 | 
19 | class GZException : public CompressedException {
20 |   public:
21 |     GZException() throw();
22 |     ~GZException() throw();
23 | };
24 | 
25 | class BZException : public CompressedException {
26 |   public:
27 |     BZException() throw();
28 |     ~BZException() throw();
29 | };
30 | 
31 | class XZException : public CompressedException {
32 |   public:
33 |     XZException() throw();
34 |     ~XZException() throw();
35 | };
36 | 
37 | class ReadBase;
38 | 
39 | class ReadCompressed {
40 |   public:
41 |     static const std::size_t kMagicSize = 6;
42 |     // Must have at least kMagicSize bytes.
43 |     static bool DetectCompressedMagic(const void *from);
44 | 
45 |     // Takes ownership of fd.
46 |     explicit ReadCompressed(int fd);
47 | 
48 |     // Try to avoid using this. Use the fd instead.
49 |     // There is no decompression support for istreams.
50 |     explicit ReadCompressed(std::istream &in);
51 | 
52 |     // Must call Reset later.
53 |     ReadCompressed();
54 | 
55 |     ~ReadCompressed();
56 | 
57 |     // Takes ownership of fd.
58 |     void Reset(int fd);
59 | 
60 |     // Same advice as the constructor.
61 |     void Reset(std::istream &in);
62 | 
63 |     std::size_t Read(void *to, std::size_t amount);
64 | 
65 |     // Repeatedly call read to fill a buffer unless EOF is hit.
66 |     // Return number of bytes read.
67 |     std::size_t ReadOrEOF(void *const to, std::size_t amount);
68 | 
69 |     uint64_t RawAmount() const { return raw_amount_; }
70 | 
71 |   private:
72 |     friend class ReadBase;
73 | 
74 |     scoped_ptr<ReadBase> internal_;
75 | 
76 |     uint64_t raw_amount_;
77 | 
78 |     // No copying.
79 |     ReadCompressed(const ReadCompressed &);
80 |     void operator=(const ReadCompressed &);
81 | };
82 | 
83 | } // namespace util
84 | 
85 | #endif // UTIL_READ_COMPRESSED_H
86 | 
--------------------------------------------------------------------------------
/kenlm/include/util/stream/config.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_CONFIG_H
2 | #define UTIL_STREAM_CONFIG_H
3 | 
4 | #include
5 | #include
6 | 
7 | namespace util { namespace stream {
8 | 
9 | /**
10 |  * Represents how a chain should be configured.
11 |  */
12 | struct ChainConfig {
13 | 
14 |   /** Constructs a configuration with underspecified (or default) parameters.
*/ 15 | ChainConfig() {} 16 | 17 | /** 18 | * Constructs a chain configuration object. 19 | * 20 | * @param [in] in_entry_size Number of bytes in each record. 21 | * @param [in] in_block_count Number of blocks in the chain. 22 | * @param [in] in_total_memory Total number of bytes available to the chain. 23 | * This value will be divided amongst the blocks in the chain. 24 | */ 25 | ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory) 26 | : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {} 27 | 28 | /** 29 | * Number of bytes in each record. 30 | */ 31 | std::size_t entry_size; 32 | 33 | /** 34 | * Number of blocks in the chain. 35 | */ 36 | std::size_t block_count; 37 | 38 | /** 39 | * Total number of bytes available to the chain. 40 | * This value will be divided amongst the blocks in the chain. 41 | * Chain's constructor will make this a multiple of entry_size. 42 | */ 43 | std::size_t total_memory; 44 | }; 45 | 46 | 47 | /** 48 | * Represents how a sorter should be configured. 49 | */ 50 | struct SortConfig { 51 | 52 | /** Filename prefix where temporary files should be placed. */ 53 | std::string temp_prefix; 54 | 55 | /** Size of each input/output buffer. */ 56 | std::size_t buffer_size; 57 | 58 | /** Total memory to use when running alone. */ 59 | std::size_t total_memory; 60 | }; 61 | 62 | }} // namespaces 63 | #endif // UTIL_STREAM_CONFIG_H 64 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/io.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_IO_H 2 | #define UTIL_STREAM_IO_H 3 | 4 | #include "util/exception.hh" 5 | #include "util/file.hh" 6 | 7 | namespace util { 8 | namespace stream { 9 | 10 | class ChainPosition; 11 | 12 | class ReadSizeException : public util::Exception { 13 | public: 14 | ReadSizeException() throw(); 15 | ~ReadSizeException() throw(); 16 | }; 17 | 18 | class Read { 19 | public: 20 | explicit Read(int fd) : file_(fd) {} 21 | void Run(const ChainPosition &position); 22 | private: 23 | int file_; 24 | }; 25 | 26 | // Like read but uses pread so that the file can be accessed from multiple threads. 27 | class PRead { 28 | public: 29 | explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {} 30 | void Run(const ChainPosition &position); 31 | private: 32 | int file_; 33 | bool own_; 34 | }; 35 | 36 | class Write { 37 | public: 38 | explicit Write(int fd) : file_(fd) {} 39 | void Run(const ChainPosition &position); 40 | private: 41 | int file_; 42 | }; 43 | 44 | // It's a common case that stuff is written and then recycled. So rather than 45 | // spawn another thread to Recycle, this combines the two roles. 46 | class WriteAndRecycle { 47 | public: 48 | explicit WriteAndRecycle(int fd) : file_(fd) {} 49 | void Run(const ChainPosition &position); 50 | private: 51 | int file_; 52 | }; 53 | 54 | class PWriteAndRecycle { 55 | public: 56 | explicit PWriteAndRecycle(int fd) : file_(fd) {} 57 | void Run(const ChainPosition &position); 58 | private: 59 | int file_; 60 | }; 61 | 62 | 63 | // Reuse the same file over and over again to buffer output. 
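A sketch of the intended round trip through the FileBuffer class that follows; the two chains and the util::MakeTemp temporary file are assumptions borrowed from elsewhere in kenlm, so treat this as illustrative rather than canonical:

```cpp
util::stream::FileBuffer buffer(util::MakeTemp("kenlm_tmp"));
// Writing: Sink() seeks the descriptor back to offset 0 and returns a worker
// that writes out and recycles each block as it arrives.
producing_chain >> buffer.Sink();
// Reading it back later: Source() returns a PRead over the same descriptor.
consuming_chain >> buffer.Source();
```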
64 | class FileBuffer { 65 | public: 66 | explicit FileBuffer(int fd) : file_(fd) {} 67 | 68 | PWriteAndRecycle Sink() const { 69 | util::SeekOrThrow(file_.get(), 0); 70 | return PWriteAndRecycle(file_.get()); 71 | } 72 | 73 | PRead Source() const { 74 | return PRead(file_.get()); 75 | } 76 | 77 | uint64_t Size() const { 78 | return SizeOrThrow(file_.get()); 79 | } 80 | 81 | private: 82 | scoped_fd file_; 83 | }; 84 | 85 | } // namespace stream 86 | } // namespace util 87 | #endif // UTIL_STREAM_IO_H 88 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/line_input.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_LINE_INPUT_H 2 | #define UTIL_STREAM_LINE_INPUT_H 3 | namespace util {namespace stream { 4 | 5 | class ChainPosition; 6 | 7 | /* Worker that reads input into blocks, ensuring that blocks contain whole 8 | * lines. Assumes that the maximum size of a line is less than the block size. 9 | */ 10 | class LineInput { 11 | public: 12 | // Takes ownership upon thread execution. 13 | explicit LineInput(int fd); 14 | 15 | void Run(const ChainPosition &position); 16 | 17 | private: 18 | int fd_; 19 | }; 20 | 21 | }} // namespaces 22 | #endif // UTIL_STREAM_LINE_INPUT_H 23 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/multi_progress.hh: -------------------------------------------------------------------------------- 1 | /* Progress bar suitable for chains of workers */ 2 | #ifndef UTIL_STREAM_MULTI_PROGRESS_H 3 | #define UTIL_STREAM_MULTI_PROGRESS_H 4 | 5 | #include <boost/thread/mutex.hpp> 6 | 7 | #include <cstddef> 8 | 9 | #include <stdint.h> 10 | 11 | namespace util { namespace stream { 12 | 13 | class WorkerProgress; 14 | 15 | class MultiProgress { 16 | public: 17 | static const unsigned char kWidth = 100; 18 | 19 | MultiProgress(); 20 | 21 | ~MultiProgress(); 22 | 23 | // Turns on showing (requires SetTarget too). 24 | void Activate(); 25 | 26 | void SetTarget(uint64_t complete); 27 | 28 | WorkerProgress Add(); 29 | 30 | void Finished(); 31 | 32 | private: 33 | friend class WorkerProgress; 34 | void Milestone(WorkerProgress &worker); 35 | 36 | bool active_; 37 | 38 | uint64_t complete_; 39 | 40 | boost::mutex mutex_; 41 | 42 | // \0 at the end. 43 | char display_[kWidth + 1]; 44 | 45 | std::size_t character_handout_; 46 | 47 | MultiProgress(const MultiProgress &); 48 | MultiProgress &operator=(const MultiProgress &); 49 | }; 50 | 51 | class WorkerProgress { 52 | public: 53 | // Default constructor; initialize with operator= later. 54 | WorkerProgress() : parent_(NULL) {} 55 | 56 | // Not threadsafe for the same worker by default. 57 | WorkerProgress &operator++() { 58 | if (++current_ >= next_) { 59 | parent_->Milestone(*this); 60 | } 61 | return *this; 62 | } 63 | 64 | WorkerProgress &operator+=(uint64_t amount) { 65 | current_ += amount; 66 | if (current_ >= next_) { 67 | parent_->Milestone(*this); 68 | } 69 | return *this; 70 | } 71 | 72 | private: 73 | friend class MultiProgress; 74 | WorkerProgress(uint64_t next, MultiProgress &parent, char character) 75 | : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {} 76 | 77 | uint64_t current_, next_; 78 | 79 | MultiProgress *parent_; 80 | 81 | // Previous milestone reached. 82 | unsigned char stone_; 83 | 84 | // Character to display in bar.
85 | char character_; 86 | }; 87 | 88 | }} // namespaces 89 | 90 | #endif // UTIL_STREAM_MULTI_PROGRESS_H 91 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_STREAM_H 2 | #define UTIL_STREAM_STREAM_H 3 | 4 | #include "util/stream/chain.hh" 5 | 6 | #include <boost/noncopyable.hpp> 7 | 8 | #include <assert.h> 9 | #include <stdint.h> 10 | 11 | namespace util { 12 | namespace stream { 13 | 14 | class Stream : boost::noncopyable { 15 | public: 16 | Stream() : current_(NULL), end_(NULL) {} 17 | 18 | void Init(const ChainPosition &position) { 19 | entry_size_ = position.GetChain().EntrySize(); 20 | block_size_ = position.GetChain().BlockSize(); 21 | block_it_.Init(position); 22 | StartBlock(); 23 | } 24 | 25 | explicit Stream(const ChainPosition &position) { 26 | Init(position); 27 | } 28 | 29 | operator bool() const { return current_ != NULL; } 30 | bool operator!() const { return current_ == NULL; } 31 | 32 | const void *Get() const { return current_; } 33 | void *Get() { return current_; } 34 | 35 | void Poison() { 36 | block_it_->SetValidSize(current_ - static_cast<uint8_t*>(block_it_->Get())); 37 | ++block_it_; 38 | block_it_.Poison(); 39 | } 40 | 41 | Stream &operator++() { 42 | assert(*this); 43 | assert(current_ < end_); 44 | current_ += entry_size_; 45 | if (current_ == end_) { 46 | ++block_it_; 47 | StartBlock(); 48 | } 49 | return *this; 50 | } 51 | 52 | private: 53 | void StartBlock() { 54 | for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {} 55 | current_ = static_cast<uint8_t*>(block_it_->Get()); 56 | end_ = current_ + block_it_->ValidSize(); 57 | } 58 | 59 | // The following are pointers to raw memory 60 | // current_ is the current record 61 | // end_ is the end of the block (so we know when to move to the next block) 62 | uint8_t *current_, *end_; 63 | 64 | std::size_t entry_size_; 65 | std::size_t block_size_; 66 | 67 | Link block_it_; 68 | }; 69 | 70 | inline Chain &operator>>(Chain &chain, Stream &stream) { 71 | stream.Init(chain.Add()); 72 | return chain; 73 | } 74 | 75 | } // namespace stream 76 | } // namespace util 77 | #endif // UTIL_STREAM_STREAM_H 78 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/timer.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_TIMER_H 2 | #define UTIL_STREAM_TIMER_H 3 | 4 | // Sorry Jon, this was adding library dependencies in Moses and people complained. 5 | 6 | /*#include <boost/version.hpp> 7 | 8 | #if BOOST_VERSION >= 104800 9 | #include <boost/timer/timer.hpp> 10 | #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) 11 | #else 12 | //#warning Using Boost older than 1.48.
Timing information will not be available.*/ 13 | #define UTIL_TIMER(str) 14 | //#endif 15 | 16 | #endif // UTIL_STREAM_TIMER_H 17 | -------------------------------------------------------------------------------- /kenlm/include/util/string_piece_hash.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STRING_PIECE_HASH_H 2 | #define UTIL_STRING_PIECE_HASH_H 3 | 4 | #include "util/string_piece.hh" 5 | 6 | #include <boost/functional/hash.hpp> 7 | #include <boost/version.hpp> 8 | 9 | inline size_t hash_value(const StringPiece &str) { 10 | return boost::hash_range(str.data(), str.data() + str.length()); 11 | } 12 | 13 | /* Support for lookup of StringPiece in boost::unordered_map */ 14 | struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> { 15 | size_t operator()(const StringPiece &str) const { 16 | return hash_value(str); 17 | } 18 | }; 19 | 20 | struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const StringPiece &, bool> { 21 | bool operator()(const StringPiece &first, const StringPiece &second) const { 22 | return first == second; 23 | } 24 | }; 25 | template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { 26 | #if BOOST_VERSION < 104200 27 | std::string temp(key.data(), key.size()); 28 | return t.find(temp); 29 | #else 30 | return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); 31 | #endif 32 | } 33 | 34 | template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) { 35 | #if BOOST_VERSION < 104200 36 | std::string temp(key.data(), key.size()); 37 | return t.find(temp); 38 | #else 39 | return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); 40 | #endif 41 | } 42 | 43 | #endif // UTIL_STRING_PIECE_HASH_H 44 | -------------------------------------------------------------------------------- /kenlm/include/util/unistd.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_UNISTD_H 2 | #define UTIL_UNISTD_H 3 | 4 | #if defined(_WIN32) || defined(_WIN64) 5 | 6 | // Windows doesn't define <unistd.h> 7 | // 8 | // So we define what we need here instead: 9 | // 10 | #define STDIN_FILENO 0 11 | #define STDOUT_FILENO 1 12 | 13 | 14 | #else // Huzzah for POSIX! 15 | 16 | #include <unistd.h> 17 | 18 | #endif 19 | 20 | 21 | 22 | #endif // UTIL_UNISTD_H 23 | -------------------------------------------------------------------------------- /kenlm/include/util/usage.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_USAGE_H 2 | #define UTIL_USAGE_H 3 | #include <cstddef> 4 | #include <iosfwd> 5 | #include <string> 6 | 7 | #include <stdint.h> 8 | 9 | namespace util { 10 | // Time in seconds since process started. Zero on unsupported platforms. 11 | double WallTime(); 12 | 13 | void PrintUsage(std::ostream &to); 14 | 15 | // Determine how much physical memory there is. Return 0 on failure. 16 | uint64_t GuessPhysicalMemory(); 17 | 18 | // Parse a size like unix sort. Sadly, this means the default multiplier is K.
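Illustrative values for the parser declared below, assuming the sort(1) conventions the comment refers to; the percentage form scaling GuessPhysicalMemory() is an assumption based on how lmplz passes defaults like "80%":

```cpp
// util::ParseSize("10k")  == 10 * 1024ULL
// util::ParseSize("1G")   == 1ULL << 30
// util::ParseSize("500")  == 500 * 1024ULL   // bare numbers default to K
// util::ParseSize("80%")  // 80% of physical memory, as in GNU sort -S
```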
19 | uint64_t ParseSize(const std::string &arg); 20 | } // namespace util 21 | #endif // UTIL_USAGE_H 22 | -------------------------------------------------------------------------------- /kenlm/lm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Explicitly list the source files for this subdirectory 2 | # 3 | # If you add any source files to this subdirectory 4 | # that should be included in the kenlm library, 5 | # (this excludes any unit test files) 6 | # you should add them to the following list: 7 | set(KENLM_LM_SOURCE 8 | bhiksha.cc 9 | binary_format.cc 10 | config.cc 11 | lm_exception.cc 12 | model.cc 13 | quantize.cc 14 | read_arpa.cc 15 | search_hashed.cc 16 | search_trie.cc 17 | sizes.cc 18 | trie.cc 19 | trie_sort.cc 20 | value_build.cc 21 | virtual_interface.cc 22 | vocab.cc 23 | ) 24 | 25 | 26 | # Group these objects together for later use. 27 | # 28 | # Given add_library(foo OBJECT ${my_foo_sources}), 29 | # refer to these objects as $<TARGET_OBJECTS:foo> 30 | # 31 | add_subdirectory(common) 32 | 33 | if (NOT MSVC) 34 | set(THREADS pthread) 35 | endif() 36 | 37 | add_library(kenlm ${KENLM_LM_SOURCE} ${KENLM_LM_COMMON_SOURCE}) 38 | target_link_libraries(kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 39 | 40 | set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order") 41 | target_compile_definitions(kenlm PUBLIC -DKENLM_MAX_ORDER=${KENLM_MAX_ORDER}) 42 | 43 | # This directory has children that need to be processed 44 | add_subdirectory(builder) 45 | add_subdirectory(filter) 46 | add_subdirectory(interpolate) 47 | 48 | # Explicitly list the executable files to be compiled 49 | set(EXE_LIST 50 | query 51 | fragment 52 | build_binary 53 | kenlm_benchmark 54 | ) 55 | 56 | set(LM_LIBS kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 57 | 58 | AddExes(EXES ${EXE_LIST} 59 | LIBRARIES ${LM_LIBS}) 60 | 61 | if(BUILD_TESTING) 62 | 63 | set(KENLM_BOOST_TESTS_LIST left_test partial_test) 64 | AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} 65 | LIBRARIES ${LM_LIBS} 66 | TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa) 67 | 68 | # model_test requires an extra command line parameter 69 | KenLMAddTest(TEST model_test 70 | LIBRARIES ${LM_LIBS} 71 | TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa 72 | ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa) 73 | endif() 74 | -------------------------------------------------------------------------------- /kenlm/lm/blank.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BLANK_H 2 | #define LM_BLANK_H 3 | 4 | #include <limits.h> 5 | #include <math.h> 6 | #include <stdint.h> 7 | 8 | namespace lm { 9 | namespace ngram { 10 | 11 | /* Suppose "foo bar" appears with zero backoff but there is no trigram 12 | * beginning with these words. Then, when scoring "foo bar", the model could 13 | * return out_state containing "bar" or even null context if "bar" also has no 14 | * backoff and is never followed by another word. Then the backoff is set to 15 | * kNoExtensionBackoff. If the n-gram might be extended, then out_state must 16 | * contain the full n-gram, in which case kExtensionBackoff is set. In any 17 | * case, if an n-gram has non-zero backoff, the full state is returned so 18 | * backoff can be properly charged. 19 | * These differ only in sign bit because the backoff is in fact zero in either 20 | * case.
21 | */ 22 | const float kNoExtensionBackoff = -0.0; 23 | const float kExtensionBackoff = 0.0; 24 | const uint64_t kNoExtensionQuant = 0; 25 | const uint64_t kExtensionQuant = 1; 26 | 27 | inline void SetExtension(float &backoff) { 28 | if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; 29 | } 30 | 31 | // This compiles down nicely. 32 | inline bool HasExtension(const float &backoff) { 33 | typedef union { float f; uint32_t i; } UnionValue; 34 | UnionValue compare, interpret; 35 | compare.f = kNoExtensionBackoff; 36 | interpret.f = backoff; 37 | return compare.i != interpret.i; 38 | } 39 | 40 | } // namespace ngram 41 | } // namespace lm 42 | #endif // LM_BLANK_H 43 | -------------------------------------------------------------------------------- /kenlm/lm/builder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This CMake file was created by Lane Schwartz 2 | 3 | # Explicitly list the source files for this subdirectory 4 | # 5 | # If you add any source files to this subdirectory 6 | # that should be included in the kenlm library, 7 | # (this excludes any unit test files) 8 | # you should add them to the following list: 9 | # 10 | # In order to set correct paths to these files 11 | # in case this variable is referenced by CMake files in the parent directory, 12 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 13 | # 14 | set(KENLM_BUILDER_SOURCE 15 | ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc 16 | ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc 17 | ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc 18 | ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc 19 | ${CMAKE_CURRENT_SOURCE_DIR}/output.cc 20 | ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc 21 | ) 22 | 23 | 24 | # Group these objects together for later use. 25 | # 26 | # Given add_library(foo OBJECT ${my_foo_sources}), 27 | # refer to these objects as $<TARGET_OBJECTS:foo> 28 | # 29 | add_library(kenlm_builder ${KENLM_BUILDER_SOURCE}) 30 | 31 | if (NOT MSVC) 32 | set(THREADS pthread) 33 | endif() 34 | 35 | AddExes(EXES lmplz 36 | LIBRARIES kenlm_builder kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 37 | AddExes(EXES count_ngrams 38 | LIBRARIES kenlm_builder kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 39 | 40 | if(BUILD_TESTING) 41 | 42 | # Explicitly list the Boost test files to be compiled 43 | set(KENLM_BOOST_TESTS_LIST 44 | adjust_counts_test 45 | corpus_count_test 46 | ) 47 | 48 | AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} 49 | LIBRARIES kenlm_builder kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 50 | endif() 51 | -------------------------------------------------------------------------------- /kenlm/lm/builder/README.md: -------------------------------------------------------------------------------- 1 | Dependencies 2 | ============ 3 | 4 | Boost >= 1.42.0 is required. 5 | 6 | For Ubuntu, 7 | ```bash 8 | sudo apt-get install libboost1.48-all-dev 9 | ``` 10 | 11 | Alternatively, you can download, compile, and install it yourself: 12 | 13 | ```bash 14 | wget http://sourceforge.net/projects/boost/files/boost/1.52.0/boost_1_52_0.tar.gz/download -O boost_1_52_0.tar.gz 15 | tar -xvzf boost_1_52_0.tar.gz 16 | cd boost_1_52_0 17 | ./bootstrap.sh 18 | ./b2 19 | sudo ./b2 install 20 | ``` 21 | 22 | Local install options (in a user-space prefix directory) are also possible. See http://www.boost.org/doc/libs/1_52_0/doc/html/bbv2/installation.html.
23 | 24 | 25 | Building 26 | ======== 27 | 28 | ```bash 29 | bjam 30 | ``` 31 | Your distribution might package bjam and boost-build separately from Boost. Both are required. 32 | 33 | Usage 34 | ===== 35 | 36 | Run 37 | ```bash 38 | $ bin/lmplz 39 | ``` 40 | to see command line arguments. 41 | 42 | Running 43 | ======= 44 | 45 | ```bash 46 | bin/lmplz -o 5 <text >text.arpa 47 | ``` 48 | -------------------------------------------------------------------------------- /kenlm/lm/builder/TODO: -------------------------------------------------------------------------------- 1 | More tests! 2 | Sharding. 3 | Some way to manage all the crazy config options. 4 | Option to build the binary file directly. 5 | Interpolation of different orders. 6 | -------------------------------------------------------------------------------- /kenlm/lm/builder/adjust_counts.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_ADJUST_COUNTS_H 2 | #define LM_BUILDER_ADJUST_COUNTS_H 3 | 4 | #include "lm/builder/discount.hh" 5 | #include "lm/lm_exception.hh" 6 | #include "util/exception.hh" 7 | 8 | #include <vector> 9 | 10 | #include <stdint.h> 11 | 12 | namespace util { namespace stream { class ChainPositions; } } 13 | 14 | namespace lm { 15 | namespace builder { 16 | 17 | class BadDiscountException : public util::Exception { 18 | public: 19 | BadDiscountException() throw(); 20 | ~BadDiscountException() throw(); 21 | }; 22 | 23 | struct DiscountConfig { 24 | // Overrides discounts for orders [1,discount_override.size()]. 25 | std::vector<Discount> overwrite; 26 | // If discounting fails for an order, copy them from here. 27 | Discount fallback; 28 | // What to do when discounts are out of range or would trigger division by 29 | // zero. If it does something other than THROW_UP, use fallback_discount. 30 | WarningAction bad_action; 31 | }; 32 | 33 | /* Compute adjusted counts. 34 | * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. 35 | * Output: [1,N]-grams with adjusted counts. 36 | * [1,N)-grams are in suffix order 37 | * N-grams are in undefined order (they're going to be sorted anyway). 38 | */ 39 | class AdjustCounts { 40 | public: 41 | // counts: output 42 | // counts_pruned: output 43 | // discounts: mostly output. If the input already has entries, they will be kept. 44 | // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
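Stepping back to the DiscountConfig above, a hypothetical setup for concreteness; the numeric values are invented, and the WarningAction enum comes from lm/lm_exception.hh:

```cpp
lm::builder::DiscountConfig config;
config.overwrite.resize(1);            // explicit discounts for order 1 only
config.overwrite[0].amount[0] = 0.0f;  // slot for adjusted count 0 (unused)
config.overwrite[0].amount[1] = 0.5f;  // subtracted from adjusted count 1
config.overwrite[0].amount[2] = 1.0f;  // subtracted from adjusted count 2
config.overwrite[0].amount[3] = 1.5f;  // subtracted from counts 3 and above
config.fallback = config.overwrite[0]; // used where estimation fails
config.bad_action = lm::COMPLAIN;      // warn instead of throwing
```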
45 | AdjustCounts( 46 | const std::vector<uint64_t> &prune_thresholds, 47 | std::vector<uint64_t> &counts, 48 | std::vector<uint64_t> &counts_pruned, 49 | const std::vector<bool> &prune_words, 50 | const DiscountConfig &discount_config, 51 | std::vector<Discount> &discounts) 52 | : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), 53 | prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts) 54 | {} 55 | 56 | void Run(const util::stream::ChainPositions &positions); 57 | 58 | private: 59 | const std::vector<uint64_t> &prune_thresholds_; 60 | std::vector<uint64_t> &counts_; 61 | std::vector<uint64_t> &counts_pruned_; 62 | const std::vector<bool> &prune_words_; 63 | 64 | DiscountConfig discount_config_; 65 | std::vector<Discount> &discounts_; 66 | }; 67 | 68 | } // namespace builder 69 | } // namespace lm 70 | 71 | #endif // LM_BUILDER_ADJUST_COUNTS_H 72 | 73 | -------------------------------------------------------------------------------- /kenlm/lm/builder/combine_counts.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_COMBINE_COUNTS_H 2 | #define LM_BUILDER_COMBINE_COUNTS_H 3 | 4 | #include "lm/builder/payload.hh" 5 | #include "lm/common/ngram.hh" 6 | #include "lm/common/compare.hh" 7 | #include "lm/word_index.hh" 8 | #include "util/stream/sort.hh" 9 | 10 | #include <functional> 11 | #include <string.h> 12 | 13 | namespace lm { 14 | namespace builder { 15 | 16 | // Sum counts for the same n-gram. 17 | struct CombineCounts { 18 | bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { 19 | NGram<BuildingPayload> first(first_void, compare.Order()); 20 | // There isn't a const version of NGram. 21 | NGram<BuildingPayload> second(const_cast<void *>(second_void), compare.Order()); 22 | if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; 23 | first.Value().count += second.Value().count; 24 | return true; 25 | } 26 | }; 27 | 28 | } // namespace builder 29 | } // namespace lm 30 | 31 | #endif // LM_BUILDER_COMBINE_COUNTS_H 32 | -------------------------------------------------------------------------------- /kenlm/lm/builder/corpus_count.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_CORPUS_COUNT_H 2 | #define LM_BUILDER_CORPUS_COUNT_H 3 | 4 | #include "lm/lm_exception.hh" 5 | #include "lm/word_index.hh" 6 | #include "util/scoped.hh" 7 | 8 | #include <cstddef> 9 | #include <string> 10 | #include <stdint.h> 11 | #include <vector> 12 | 13 | namespace util { 14 | class FilePiece; 15 | namespace stream { 16 | class ChainPosition; 17 | } // namespace stream 18 | } // namespace util 19 | 20 | namespace lm { 21 | namespace builder { 22 | 23 | class CorpusCount { 24 | public: 25 | // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size 26 | static float DedupeMultiplier(std::size_t order); 27 | 28 | // How much memory vocabulary will use based on estimated size of the vocab. 29 | static std::size_t VocabUsage(std::size_t vocab_estimate); 30 | 31 | // token_count: out. 32 | // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
33 | CorpusCount(util::FilePiece &from, int vocab_write, bool dynamic_vocab, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); 34 | 35 | void Run(const util::stream::ChainPosition &position); 36 | 37 | private: 38 | template <class Vocab> void RunWithVocab(const util::stream::ChainPosition &position, Vocab &vocab); 39 | 40 | util::FilePiece &from_; 41 | int vocab_write_; 42 | bool dynamic_vocab_; 43 | uint64_t &token_count_; 44 | WordIndex &type_count_; 45 | std::vector<bool>& prune_words_; 46 | const std::string& prune_vocab_filename_; 47 | 48 | std::size_t dedupe_mem_size_; 49 | util::scoped_malloc dedupe_mem_; 50 | 51 | WarningAction disallowed_symbol_action_; 52 | }; 53 | 54 | } // namespace builder 55 | } // namespace lm 56 | #endif // LM_BUILDER_CORPUS_COUNT_H 57 | -------------------------------------------------------------------------------- /kenlm/lm/builder/discount.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_DISCOUNT_H 2 | #define LM_BUILDER_DISCOUNT_H 3 | 4 | #include <algorithm> 5 | 6 | #include <stdint.h> 7 | 8 | namespace lm { 9 | namespace builder { 10 | 11 | struct Discount { 12 | float amount[4]; 13 | 14 | float Get(uint64_t count) const { 15 | return amount[std::min<uint64_t>(count, 3)]; 16 | } 17 | 18 | float Apply(uint64_t count) const { 19 | return static_cast<float>(count) - Get(count); 20 | } 21 | }; 22 | 23 | } // namespace builder 24 | } // namespace lm 25 | 26 | #endif // LM_BUILDER_DISCOUNT_H 27 | -------------------------------------------------------------------------------- /kenlm/lm/builder/dump_counts_main.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/print.hh" 2 | #include "lm/word_index.hh" 3 | #include "util/file.hh" 4 | #include "util/read_compressed.hh" 5 | 6 | #include <boost/lexical_cast.hpp> 7 | 8 | #include <iostream> 9 | #include <vector> 10 | 11 | int main(int argc, char *argv[]) { 12 | if (argc != 4) { 13 | std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" 14 | "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" 15 | "counts. Each record has order many vocabulary ids.\n" 16 | "The vocabulary file contains the words delimited by NULL in order of id.\n" 17 | "The vocabulary file may not be compressed because it is mmapped but the counts\n" 18 | "file can be compressed.\n"; 19 | return 1; 20 | } 21 | util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); 22 | util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); 23 | lm::VocabReconstitute vocab(vocab_file.get()); 24 | unsigned int order = boost::lexical_cast<unsigned int>(argv[3]); 25 | std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t)); 26 | while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { 27 | UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); 28 | const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin()); 29 | for (const lm::WordIndex *i = words; i != words + order; ++i) { 30 | UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); 31 | std::cout << vocab.Lookup(*i) << ' '; 32 | } 33 | // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
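// (Each record is `order` 4-byte vocabulary ids followed by one 8-byte count,
// so the count sits immediately after words + order.)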
34 | std::cout << *reinterpret_cast<const uint64_t *>(words + order) << '\n'; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /kenlm/lm/builder/hash_gamma.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HASH_GAMMA__ 2 | #define LM_BUILDER_HASH_GAMMA__ 3 | 4 | #include <stdint.h> 5 | 6 | namespace lm { namespace builder { 7 | 8 | #pragma pack(push) 9 | #pragma pack(4) 10 | 11 | struct HashGamma { 12 | uint64_t hash_value; 13 | float gamma; 14 | }; 15 | 16 | #pragma pack(pop) 17 | 18 | }} // namespaces 19 | #endif // LM_BUILDER_HASH_GAMMA__ 20 | -------------------------------------------------------------------------------- /kenlm/lm/builder/header_info.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HEADER_INFO_H 2 | #define LM_BUILDER_HEADER_INFO_H 3 | 4 | #include <string> 5 | #include <vector> 6 | #include <stdint.h> 7 | 8 | namespace lm { namespace builder { 9 | 10 | // Some configuration info that is used to add 11 | // comments to the beginning of an ARPA file 12 | struct HeaderInfo { 13 | std::string input_file; 14 | uint64_t token_count; 15 | std::vector<uint64_t> counts_pruned; 16 | 17 | HeaderInfo() {} 18 | 19 | HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector<uint64_t> &counts_pruned_in) 20 | : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} 21 | 22 | // TODO: Add smoothing type 23 | // TODO: More info if multiple models were interpolated 24 | }; 25 | 26 | }} // namespaces 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /kenlm/lm/builder/initial_probabilities.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INITIAL_PROBABILITIES_H 2 | #define LM_BUILDER_INITIAL_PROBABILITIES_H 3 | 4 | #include "lm/builder/discount.hh" 5 | #include "lm/word_index.hh" 6 | #include "util/stream/config.hh" 7 | 8 | #include <vector> 9 | 10 | namespace util { namespace stream { class Chains; } } 11 | 12 | namespace lm { 13 | class SpecialVocab; 14 | namespace builder { 15 | 16 | struct InitialProbabilitiesConfig { 17 | // These should be small buffers to keep the adder from getting too far ahead 18 | util::stream::ChainConfig adder_in; 19 | util::stream::ChainConfig adder_out; 20 | // SRILM doesn't normally interpolate unigrams. 21 | bool interpolate_unigrams; 22 | }; 23 | 24 | /* Compute initial (uninterpolated) probabilities 25 | * primary: the normal chain of n-grams. Incoming is context sorted adjusted 26 | * counts. Outgoing has uninterpolated probabilities for use by Interpolate. 27 | * second_in: a second copy of the primary input. Discard the output. 28 | * gamma_out: Computed gamma values are output on these chains in suffix order. 29 | * The values are bare floats and should be buffered for interpolation to 30 | * use.
31 | */ 32 | void InitialProbabilities( 33 | const InitialProbabilitiesConfig &config, 34 | const std::vector<Discount> &discounts, 35 | util::stream::Chains &primary, 36 | util::stream::Chains &second_in, 37 | util::stream::Chains &gamma_out, 38 | const std::vector<uint64_t> &prune_thresholds, 39 | bool prune_vocab, 40 | const SpecialVocab &vocab); 41 | 42 | } // namespace builder 43 | } // namespace lm 44 | 45 | #endif // LM_BUILDER_INITIAL_PROBABILITIES_H 46 | -------------------------------------------------------------------------------- /kenlm/lm/builder/interpolate.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INTERPOLATE_H 2 | #define LM_BUILDER_INTERPOLATE_H 3 | 4 | #include "lm/common/special.hh" 5 | #include "lm/word_index.hh" 6 | #include "util/stream/multi_stream.hh" 7 | 8 | #include <vector> 9 | 10 | #include <stdint.h> 11 | 12 | namespace lm { namespace builder { 13 | 14 | /* Interpolate step. 15 | * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from 16 | * InitialProbabilities. 17 | * Output: suffix sorted n-grams with complete probability 18 | */ 19 | class Interpolate { 20 | public: 21 | // Normally vocab_size is the unigram count-1 (since p(<unk>) = 0) but might 22 | // be larger when the user specifies a consistent vocabulary size. 23 | explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab, bool output_q, const SpecialVocab &specials); 24 | 25 | void Run(const util::stream::ChainPositions &positions); 26 | 27 | private: 28 | float uniform_prob_; 29 | util::stream::ChainPositions backoffs_; 30 | const std::vector<uint64_t> prune_thresholds_; 31 | bool prune_vocab_; 32 | bool output_q_; 33 | const SpecialVocab specials_; 34 | }; 35 | 36 | }} // namespaces 37 | #endif // LM_BUILDER_INTERPOLATE_H 38 | -------------------------------------------------------------------------------- /kenlm/lm/builder/output.cc: -------------------------------------------------------------------------------- 1 | #include "lm/builder/output.hh" 2 | 3 | #include "lm/common/model_buffer.hh" 4 | #include "lm/common/print.hh" 5 | #include "util/file_stream.hh" 6 | #include "util/stream/multi_stream.hh" 7 | 8 | #include <iostream> 9 | 10 | namespace lm { namespace builder { 11 | 12 | OutputHook::~OutputHook() {} 13 | 14 | Output::Output(StringPiece file_base, bool keep_buffer, bool output_q) 15 | : buffer_(file_base, keep_buffer, output_q) {} 16 | 17 | void Output::SinkProbs(util::stream::Chains &chains) { 18 | Apply(PROB_PARALLEL_HOOK, chains); 19 | if (!buffer_.Keep() && !Have(PROB_SEQUENTIAL_HOOK)) { 20 | chains >> util::stream::kRecycle; 21 | chains.Wait(true); 22 | return; 23 | } 24 | buffer_.Sink(chains, header_.counts_pruned); 25 | chains >> util::stream::kRecycle; 26 | chains.Wait(false); 27 | if (Have(PROB_SEQUENTIAL_HOOK)) { 28 | std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl; 29 | buffer_.Source(chains); 30 | Apply(PROB_SEQUENTIAL_HOOK, chains); 31 | chains >> util::stream::kRecycle; 32 | chains.Wait(true); 33 | } 34 | } 35 | 36 | void Output::Apply(HookType hook_type, util::stream::Chains &chains) { 37 | for (boost::ptr_vector<OutputHook>::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) { 38 | entry->Sink(header_, VocabFile(), chains); 39 | } 40 | } 41 | 42 | void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) { 43 | if (verbose_header_) { 44 | util::FileStream out(file_.get(), 50); 45 | out << "# Input file: "
<< info.input_file << '\n'; 46 | out << "# Token count: " << info.token_count << '\n'; 47 | out << "# Smoothing: Modified Kneser-Ney" << '\n'; 48 | } 49 | chains >> PrintARPA(vocab_file, file_.get(), info.counts_pruned); 50 | } 51 | 52 | }} // namespaces 53 | -------------------------------------------------------------------------------- /kenlm/lm/builder/payload.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_PAYLOAD_H 2 | #define LM_BUILDER_PAYLOAD_H 3 | 4 | #include "lm/weights.hh" 5 | #include "lm/word_index.hh" 6 | #include <stdint.h> 7 | 8 | namespace lm { namespace builder { 9 | 10 | struct Uninterpolated { 11 | float prob; // Uninterpolated probability. 12 | float gamma; // Interpolation weight for lower order. 13 | }; 14 | 15 | union BuildingPayload { 16 | uint64_t count; 17 | Uninterpolated uninterp; 18 | ProbBackoff complete; 19 | 20 | /*mjd**********************************************************************/ 21 | bool IsMarked() const { 22 | return count >> (sizeof(count) * 8 - 1); 23 | } 24 | 25 | void Mark() { 26 | count |= (1ULL << (sizeof(count) * 8 - 1)); 27 | } 28 | 29 | void Unmark() { 30 | count &= ~(1ULL << (sizeof(count) * 8 - 1)); 31 | } 32 | 33 | uint64_t UnmarkedCount() const { 34 | return count & ~(1ULL << (sizeof(count) * 8 - 1)); 35 | } 36 | 37 | uint64_t CutoffCount() const { 38 | return IsMarked() ? 0 : UnmarkedCount(); 39 | } 40 | /*mjd**********************************************************************/ 41 | }; 42 | 43 | const WordIndex kBOS = 1; 44 | const WordIndex kEOS = 2; 45 | 46 | }} // namespaces 47 | 48 | #endif // LM_BUILDER_PAYLOAD_H 49 | -------------------------------------------------------------------------------- /kenlm/lm/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This CMake file was created by Lane Schwartz 2 | 3 | # Explicitly list the source files for this subdirectory 4 | # 5 | # If you add any source files to this subdirectory 6 | # that should be included in the kenlm library, 7 | # (this excludes any unit test files) 8 | # you should add them to the following list: 9 | # 10 | # In order to set correct paths to these files 11 | # in case this variable is referenced by CMake files in the parent directory, 12 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 13 | # 14 | set(KENLM_LM_COMMON_SOURCE 15 | ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc 16 | ${CMAKE_CURRENT_SOURCE_DIR}/print.cc 17 | ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc 18 | ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc 19 | PARENT_SCOPE) 20 | 21 | if(BUILD_TESTING) 22 | KenLMAddTest(TEST model_buffer_test 23 | LIBRARIES kenlm 24 | TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test_data) 25 | endif() 26 | -------------------------------------------------------------------------------- /kenlm/lm/common/joint_order.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_JOINT_ORDER_H 2 | #define LM_COMMON_JOINT_ORDER_H 3 | 4 | #include "lm/common/ngram_stream.hh" 5 | #include "lm/lm_exception.hh" 6 | 7 | #ifdef DEBUG 8 | #include "util/fixed_array.hh" 9 | #include <iostream> 10 | #endif 11 | 12 | #include <string.h> 13 | 14 | namespace lm { 15 | 16 | template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { 17 | // Allow matching to reference streams[-1]. 18 | util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1); 19 | // A bogus stream for [-1].
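// (When current == 0, the memcmp below compares zero bytes against this
// dummy, which trivially matches, so every unigram enters the callback.)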
20 | streams_with_dummy.push_back(); 21 | for (std::size_t i = 0; i < positions.size(); ++i) { 22 | streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1)); 23 | } 24 | ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1; 25 | 26 | std::size_t order; 27 | for (order = 0; order < positions.size() && streams[order]; ++order) {} 28 | assert(order); // should always have <unk>. 29 | 30 | // Debugging only: call comparison function to sanity check order. 31 | #ifdef DEBUG 32 | util::FixedArray<Compare> less_compare(order); 33 | for (unsigned i = 0; i < order; ++i) 34 | less_compare.push_back(i + 1); 35 | #endif // DEBUG 36 | 37 | std::size_t current = 0; 38 | while (true) { 39 | // Does the context match the lower one? 40 | if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { 41 | callback.Enter(current, streams[current].Get()); 42 | // Transition to looking for extensions. 43 | if (++current < order) continue; 44 | } 45 | #ifdef DEBUG 46 | // match_check[current - 1] matches current-grams 47 | // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). 48 | else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { 49 | std::cerr << "Stream out of order detected" << std::endl; 50 | abort(); 51 | } 52 | #endif // DEBUG 53 | // No extension left. 54 | while(true) { 55 | assert(current > 0); 56 | --current; 57 | callback.Exit(current, streams[current].Get()); 58 | 59 | if (++streams[current]) break; 60 | 61 | UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); 62 | 63 | order = current; 64 | if (!order) return; 65 | } 66 | } 67 | } 68 | 69 | } // namespaces 70 | 71 | #endif // LM_COMMON_JOINT_ORDER_H 72 | -------------------------------------------------------------------------------- /kenlm/lm/common/model_buffer.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_MODEL_BUFFER_H 2 | #define LM_COMMON_MODEL_BUFFER_H 3 | 4 | /* Format with separate files in suffix order. Each file contains 5 | * n-grams of the same order. 6 | */ 7 | #include "lm/word_index.hh" 8 | #include "util/file.hh" 9 | #include "util/fixed_array.hh" 10 | #include "util/string_piece.hh" 11 | 12 | #include <string> 13 | #include <vector> 14 | 15 | namespace util { namespace stream { 16 | class Chains; 17 | class Chain; 18 | }} // namespaces 19 | 20 | namespace lm { 21 | 22 | namespace ngram { class State; } 23 | 24 | class ModelBuffer { 25 | public: 26 | // Construct for writing. Must call VocabFile() and fill it with null-delimited vocab words. 27 | ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q); 28 | 29 | // Load from file. 30 | explicit ModelBuffer(StringPiece file_base); 31 | 32 | // Must call VocabFile and populate before calling this function. 33 | void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts); 34 | 35 | // Read files and write to the given chains. If fewer chains are provided, 36 | // only do the lower orders. 37 | void Source(util::stream::Chains &chains); 38 | 39 | void Source(std::size_t order_minus_1, util::stream::Chain &chain); 40 | 41 | // The order of the n-gram model that is associated with the model buffer. 42 | std::size_t Order() const { return counts_.size(); } 43 | // Requires Sink or load from file.
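// (The assert below fires if neither has happened yet.)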
44 | const std::vector<uint64_t> &Counts() const { 45 | assert(!counts_.empty()); 46 | return counts_; 47 | } 48 | 49 | int VocabFile() const { return vocab_file_.get(); } 50 | 51 | int RawFile(std::size_t order_minus_1) const { 52 | return files_[order_minus_1].get(); 53 | } 54 | 55 | bool Keep() const { return keep_buffer_; } 56 | 57 | // Slowly execute a language model query with binary search. 58 | // This is used by interpolation to gather tuning probabilities rather than 59 | // scanning the files. 60 | float SlowQuery(const ngram::State &context, WordIndex word, ngram::State &out) const; 61 | 62 | private: 63 | const std::string file_base_; 64 | const bool keep_buffer_; 65 | bool output_q_; 66 | std::vector<uint64_t> counts_; 67 | 68 | util::scoped_fd vocab_file_; 69 | util::FixedArray<util::scoped_fd> files_; 70 | }; 71 | 72 | } // namespace lm 73 | 74 | #endif // LM_COMMON_MODEL_BUFFER_H 75 | -------------------------------------------------------------------------------- /kenlm/lm/common/model_buffer_test.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/model_buffer.hh" 2 | #include "lm/model.hh" 3 | #include "lm/state.hh" 4 | 5 | #define BOOST_TEST_MODULE ModelBufferTest 6 | #include <boost/test/unit_test.hpp> 7 | 8 | namespace lm { namespace { 9 | 10 | BOOST_AUTO_TEST_CASE(Query) { 11 | std::string dir("test_data/"); 12 | if (boost::unit_test::framework::master_test_suite().argc == 2) { 13 | dir = boost::unit_test::framework::master_test_suite().argv[1]; 14 | } 15 | ngram::Model ref((dir + "/toy0.arpa").c_str()); 16 | ModelBuffer test(dir + "/toy0"); 17 | ngram::State ref_state, test_state; 18 | WordIndex a = ref.GetVocabulary().Index("a"); 19 | BOOST_CHECK_CLOSE( 20 | ref.FullScore(ref.BeginSentenceState(), a, ref_state).prob, 21 | test.SlowQuery(ref.BeginSentenceState(), a, test_state), 22 | 0.001); 23 | BOOST_CHECK_EQUAL((unsigned)ref_state.length, (unsigned)test_state.length); 24 | BOOST_CHECK_EQUAL(ref_state.words[0], test_state.words[0]); 25 | BOOST_CHECK_EQUAL(ref_state.backoff[0], test_state.backoff[0]); 26 | BOOST_CHECK(ref_state == test_state); 27 | 28 | ngram::State ref_state2, test_state2; 29 | WordIndex b = ref.GetVocabulary().Index("b"); 30 | BOOST_CHECK_CLOSE( 31 | ref.FullScore(ref_state, b, ref_state2).prob, 32 | test.SlowQuery(test_state, b, test_state2), 33 | 0.001); 34 | BOOST_CHECK(ref_state2 == test_state2); 35 | BOOST_CHECK_EQUAL(ref_state2.backoff[0], test_state2.backoff[0]); 36 | 37 | BOOST_CHECK_CLOSE( 38 | ref.FullScore(ref_state2, 0, ref_state).prob, 39 | test.SlowQuery(test_state2, 0, test_state), 40 | 0.001); 41 | // The reference does state minimization but this doesn't.
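// (So only the probability is compared for this final <unk> query; the out
// states may legitimately differ.)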
42 | } 43 | 44 | }} // namespaces 45 | -------------------------------------------------------------------------------- /kenlm/lm/common/ngram.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_NGRAM_H 2 | #define LM_COMMON_NGRAM_H 3 | 4 | #include "lm/weights.hh" 5 | #include "lm/word_index.hh" 6 | 7 | #include <cassert> 8 | #include <cstddef> 9 | #include <cstring> 10 | #include <stdint.h> 11 | 12 | namespace lm { 13 | 14 | class NGramHeader { 15 | public: 16 | NGramHeader(void *begin, std::size_t order) 17 | : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {} 18 | 19 | NGramHeader() : begin_(NULL), end_(NULL) {} 20 | 21 | const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); } 22 | uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); } 23 | 24 | void ReBase(void *to) { 25 | std::size_t difference = end_ - begin_; 26 | begin_ = reinterpret_cast<WordIndex*>(to); 27 | end_ = begin_ + difference; 28 | } 29 | 30 | // These are for the vocab index. 31 | // Lower-case in deference to STL. 32 | const WordIndex *begin() const { return begin_; } 33 | WordIndex *begin() { return begin_; } 34 | const WordIndex *end() const { return end_; } 35 | WordIndex *end() { return end_; } 36 | 37 | std::size_t size() const { return end_ - begin_; } 38 | std::size_t Order() const { return end_ - begin_; } 39 | 40 | private: 41 | WordIndex *begin_, *end_; 42 | }; 43 | 44 | template <class PayloadT> class NGram : public NGramHeader { 45 | public: 46 | typedef PayloadT Payload; 47 | 48 | NGram() : NGramHeader(NULL, 0) {} 49 | 50 | NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {} 51 | 52 | // Would do operator++ but that can get confusing for a stream. 53 | void NextInMemory() { 54 | ReBase(&Value() + 1); 55 | } 56 | 57 | static std::size_t TotalSize(std::size_t order) { 58 | return order * sizeof(WordIndex) + sizeof(Payload); 59 | } 60 | std::size_t TotalSize() const { 61 | // Compiler should optimize this. 62 | return TotalSize(Order()); 63 | } 64 | 65 | static std::size_t OrderFromSize(std::size_t size) { 66 | std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); 67 | assert(size == TotalSize(ret)); 68 | return ret; 69 | } 70 | 71 | const Payload &Value() const { return *reinterpret_cast<const Payload *>(end()); } 72 | Payload &Value() { return *reinterpret_cast<Payload *>(end()); } 73 | }; 74 | 75 | } // namespace lm 76 | 77 | #endif // LM_COMMON_NGRAM_H 78 | -------------------------------------------------------------------------------- /kenlm/lm/common/ngram_stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_NGRAM_STREAM_H 2 | #define LM_BUILDER_NGRAM_STREAM_H 3 | 4 | #include "lm/common/ngram.hh" 5 | #include "util/stream/chain.hh" 6 | #include "util/stream/multi_stream.hh" 7 | #include "util/stream/stream.hh" 8 | 9 | #include <cstddef> 10 | 11 | namespace lm { 12 | 13 | template <class Proxy> class ProxyStream { 14 | public: 15 | // Make an invalid stream.
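// (It converts to false; assign a valid stream to it before use.)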
16 | ProxyStream() {} 17 | 18 | explicit ProxyStream(const util::stream::ChainPosition &position, const Proxy &proxy = Proxy()) 19 | : proxy_(proxy), stream_(position) { 20 | proxy_.ReBase(stream_.Get()); 21 | } 22 | 23 | Proxy &operator*() { return proxy_; } 24 | const Proxy &operator*() const { return proxy_; } 25 | 26 | Proxy *operator->() { return &proxy_; } 27 | const Proxy *operator->() const { return &proxy_; } 28 | 29 | void *Get() { return stream_.Get(); } 30 | const void *Get() const { return stream_.Get(); } 31 | 32 | operator bool() const { return stream_; } 33 | bool operator!() const { return !stream_; } 34 | void Poison() { stream_.Poison(); } 35 | 36 | ProxyStream &operator++() { 37 | ++stream_; 38 | proxy_.ReBase(stream_.Get()); 39 | return *this; 40 | } 41 | 42 | private: 43 | Proxy proxy_; 44 | util::stream::Stream stream_; 45 | }; 46 | 47 | template <class Payload> class NGramStream : public ProxyStream<NGram<Payload> > { 48 | public: 49 | // Make an invalid stream. 50 | NGramStream() {} 51 | 52 | explicit NGramStream(const util::stream::ChainPosition &position) : 53 | ProxyStream<NGram<Payload> >(position, NGram<Payload>(NULL, NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()))) {} 54 | }; 55 | 56 | template <class Payload> class NGramStreams : public util::stream::GenericStreams<NGramStream<Payload> > { 57 | private: 58 | typedef util::stream::GenericStreams<NGramStream<Payload> > P; 59 | public: 60 | NGramStreams() : P() {} 61 | NGramStreams(const util::stream::ChainPositions &positions) : P(positions) {} 62 | }; 63 | 64 | } // namespace 65 | #endif // LM_BUILDER_NGRAM_STREAM_H 66 | -------------------------------------------------------------------------------- /kenlm/lm/common/print.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/print.hh" 2 | 3 | #include "lm/common/ngram_stream.hh" 4 | #include "util/file_stream.hh" 5 | #include "util/file.hh" 6 | #include "util/mmap.hh" 7 | #include "util/scoped.hh" 8 | 9 | #include <sstream> 10 | #include <cstring> 11 | 12 | namespace lm { 13 | 14 | VocabReconstitute::VocabReconstitute(int fd) { 15 | uint64_t size = util::SizeOrThrow(fd); 16 | util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_); 17 | const char *const start = static_cast<const char*>(memory_.get()); 18 | const char *i; 19 | for (i = start; i != start + size; i += strlen(i) + 1) { 20 | map_.push_back(i); 21 | } 22 | // Last one for LookupPiece.
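// (This sentinel lets LookupPiece compute the final word's length as
// map_[index + 1] - 1 - map_[index], the same as every other entry.)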
23 | map_.push_back(i); 24 | } 25 | 26 | namespace { 27 | template <class V> void PrintLead(const VocabReconstitute &vocab, ProxyStream<V> &stream, util::FileStream &out) { 28 | out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin()); 29 | for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { 30 | out << ' ' << vocab.Lookup(*i); 31 | } 32 | } 33 | } // namespace 34 | 35 | void PrintARPA::Run(const util::stream::ChainPositions &positions) { 36 | VocabReconstitute vocab(vocab_fd_); 37 | util::FileStream out(out_fd_); 38 | out << "\\data\\\n"; 39 | for (size_t i = 0; i < positions.size(); ++i) { 40 | out << "ngram " << (i+1) << '=' << counts_[i] << '\n'; 41 | } 42 | out << '\n'; 43 | 44 | for (unsigned order = 1; order < positions.size(); ++order) { 45 | out << "\\" << order << "-grams:" << '\n'; 46 | for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) { 47 | PrintLead(vocab, stream, out); 48 | out << '\t' << stream->Value().backoff << '\n'; 49 | } 50 | out << '\n'; 51 | } 52 | 53 | out << "\\" << positions.size() << "-grams:" << '\n'; 54 | for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) { 55 | PrintLead(vocab, stream, out); 56 | out << '\n'; 57 | } 58 | out << '\n'; 59 | out << "\\end\\\n"; 60 | } 61 | 62 | } // namespace lm 63 | -------------------------------------------------------------------------------- /kenlm/lm/common/print.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_PRINT_H 2 | #define LM_COMMON_PRINT_H 3 | 4 | #include "lm/word_index.hh" 5 | #include "util/mmap.hh" 6 | #include "util/string_piece.hh" 7 | 8 | #include <cassert> 9 | #include <vector> 10 | 11 | namespace util { namespace stream { class ChainPositions; }} 12 | 13 | // Warning: PrintARPA routines read all unigrams before all bigrams before all 14 | // trigrams etc. So if other parts of the chain move jointly, you'll have to 15 | // buffer. 16 | 17 | namespace lm { 18 | 19 | class VocabReconstitute { 20 | public: 21 | // fd must be alive for life of this object; does not take ownership. 22 | explicit VocabReconstitute(int fd); 23 | 24 | const char *Lookup(WordIndex index) const { 25 | assert(index < map_.size() - 1); 26 | return map_[index]; 27 | } 28 | 29 | StringPiece LookupPiece(WordIndex index) const { 30 | return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); 31 | } 32 | 33 | std::size_t Size() const { 34 | // There's an extra entry to support StringPiece lengths. 35 | return map_.size() - 1; 36 | } 37 | 38 | private: 39 | util::scoped_memory memory_; 40 | std::vector<const char *> map_; 41 | }; 42 | 43 | class PrintARPA { 44 | public: 45 | // Does not take ownership of vocab_fd or out_fd.
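// Both descriptors must remain open for the duration of Run().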
46 | explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts) 47 | : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {} 48 | 49 | void Run(const util::stream::ChainPositions &positions); 50 | 51 | private: 52 | int vocab_fd_; 53 | int out_fd_; 54 | std::vector<uint64_t> counts_; 55 | }; 56 | 57 | } // namespace lm 58 | #endif // LM_COMMON_PRINT_H 59 | -------------------------------------------------------------------------------- /kenlm/lm/common/renumber.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/renumber.hh" 2 | #include "lm/common/ngram.hh" 3 | 4 | #include "util/stream/stream.hh" 5 | 6 | namespace lm { 7 | 8 | void Renumber::Run(const util::stream::ChainPosition &position) { 9 | for (util::stream::Stream stream(position); stream; ++stream) { 10 | NGramHeader gram(stream.Get(), order_); 11 | for (WordIndex *w = gram.begin(); w != gram.end(); ++w) { 12 | *w = new_numbers_[*w]; 13 | } 14 | } 15 | } 16 | 17 | } // namespace lm 18 | -------------------------------------------------------------------------------- /kenlm/lm/common/renumber.hh: -------------------------------------------------------------------------------- 1 | /* Map vocab ids. This is useful to merge independently collected counts or 2 | * change the vocab ids to the order used by the trie. 3 | */ 4 | #ifndef LM_COMMON_RENUMBER_H 5 | #define LM_COMMON_RENUMBER_H 6 | 7 | #include "lm/word_index.hh" 8 | 9 | #include <cstddef> 10 | 11 | namespace util { namespace stream { class ChainPosition; }} 12 | 13 | namespace lm { 14 | 15 | class Renumber { 16 | public: 17 | // Assumes the array is large enough to map all words and stays alive while 18 | // the thread is active. 19 | Renumber(const WordIndex *new_numbers, std::size_t order) 20 | : new_numbers_(new_numbers), order_(order) {} 21 | 22 | void Run(const util::stream::ChainPosition &position); 23 | 24 | private: 25 | const WordIndex *new_numbers_; 26 | std::size_t order_; 27 | }; 28 | 29 | } // namespace lm 30 | #endif // LM_COMMON_RENUMBER_H 31 | -------------------------------------------------------------------------------- /kenlm/lm/common/size_option.cc: -------------------------------------------------------------------------------- 1 | #include <boost/program_options.hpp> 2 | #include "util/usage.hh" 3 | 4 | namespace lm { 5 | 6 | namespace { 7 | class SizeNotify { 8 | public: 9 | explicit SizeNotify(std::size_t &out) : behind_(out) {} 10 | 11 | void operator()(const std::string &from) { 12 | behind_ = util::ParseSize(from); 13 | } 14 | 15 | private: 16 | std::size_t &behind_; 17 | }; 18 | } 19 | 20 | boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) { 21 | return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value); 22 | } 23 | 24 | } // namespace lm 25 | -------------------------------------------------------------------------------- /kenlm/lm/common/size_option.hh: -------------------------------------------------------------------------------- 1 | #include <boost/program_options.hpp> 2 | 3 | #include <cstddef> 4 | #include <string> 5 | 6 | namespace lm { 7 | 8 | // Create a boost program option for data sizes. This parses sizes like 1T and 10k.
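A sketch of registering such an option, modeled on how lmplz wires it up; the option name and default shown here are illustrative:

```cpp
#include <boost/program_options.hpp>

std::size_t sort_memory;
boost::program_options::options_description options("Memory");
options.add_options()
  ("memory,S", lm::SizeOption(sort_memory, "80%"),
   "Sorting memory: accepts 1T, 10k, 500M, or a percentage of RAM");
// After parsing and notification, sort_memory holds the size in bytes.
```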
9 | boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value); 10 | 11 | } // namespace lm 12 | -------------------------------------------------------------------------------- /kenlm/lm/common/special.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_SPECIAL_H 2 | #define LM_COMMON_SPECIAL_H 3 | 4 | #include "lm/word_index.hh" 5 | 6 | namespace lm { 7 | 8 | class SpecialVocab { 9 | public: 10 | SpecialVocab(WordIndex bos, WordIndex eos) : bos_(bos), eos_(eos) {} 11 | 12 | bool IsSpecial(WordIndex word) const { 13 | return word == kUNK || word == bos_ || word == eos_; 14 | } 15 | 16 | WordIndex UNK() const { return kUNK; } 17 | WordIndex BOS() const { return bos_; } 18 | WordIndex EOS() const { return eos_; } 19 | 20 | private: 21 | WordIndex bos_; 22 | WordIndex eos_; 23 | }; 24 | 25 | } // namespace lm 26 | 27 | #endif // LM_COMMON_SPECIAL_H 28 | -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ../../../bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy0 --arpa toy0.arpa < -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy0.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=5 3 | ngram 2=7 4 | ngram 3=7 5 | 6 | \1-grams: 7 | -1 <unk> 0 8 | 0 <s> -0.30103 9 | -0.46943438 a -0.30103 10 | -0.5720968 </s> 0 11 | -0.5720968 b -0.30103 12 | 13 | \2-grams: 14 | -0.37712017 <s> a -0.30103 15 | -0.37712017 a a -0.30103 16 | -0.2984526 b a -0.30103 17 | -0.58682007 a </s> 0 18 | -0.52201796 b </s> 0 19 | -0.41574955 <s> b -0.30103 20 | -0.58682007 a b -0.30103 21 | 22 | \3-grams: 23 | -0.14885087 <s> a a 24 | -0.33741078 b a a 25 | -0.124077894 <s> b a 26 | -0.2997394 a b a 27 | -0.42082912 b a </s> 28 | -0.397617 a b </s> 29 | -0.20102891 a a b 30 | 31 | \end\ 32 | -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy0.kenlm_intermediate: -------------------------------------------------------------------------------- 1 | KenLM intermediate binary file 2 | Counts 5 7 7 3 | Payload pb 4 | -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy0.vocab: -------------------------------------------------------------------------------- 1 | ab -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenlm/lm/common/test_data/toy1.1 -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenlm/lm/common/test_data/toy1.2 -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenlm/lm/common/test_data/toy1.3 -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=6 3 | ngram 2=7 4 | ngram 3=6 5 | 6 | \1-grams: 7 | -1 <unk> 0 8 | 0 <s> -0.30103 9 | -0.6146491 a -0.30103 10 |
-0.6146491 </s> 0
-0.7659168 c -0.30103
-0.6146491 b -0.30103

\2-grams:
-0.4301247 <s> a -0.30103
-0.4301247 a a -0.30103
-0.20660876 c </s> 0
-0.5404639 b </s> 0
-0.4740302 <s> c -0.30103
-0.4301247 a b -0.30103
-0.3422159 b b -0.47712123

\3-grams:
-0.1638568 <s> a a
-0.09113217 <s> c </s>
-0.7462621 b b </s>
-0.1638568 a a b
-0.13823806 a b b
-0.13375957 b b b

\end\
-------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.kenlm_intermediate: --------------------------------------------------------------------------------
KenLM intermediate binary file
Counts 6 7 6
Payload pb
-------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.vocab: --------------------------------------------------------------------------------
<unk><s></s>acb
-------------------------------------------------------------------------------- /kenlm/lm/config.cc: --------------------------------------------------------------------------------
#include "lm/config.hh"

#include <iostream>

namespace lm {
namespace ngram {

Config::Config() :
  show_progress(true),
  messages(&std::cerr),
  enumerate_vocab(NULL),
  unknown_missing(COMPLAIN),
  sentence_marker_missing(THROW_UP),
  positive_log_probability(THROW_UP),
  unknown_missing_logprob(-100.0),
  probing_multiplier(1.5),
  building_memory(1073741824ULL), // 1 GB
  temporary_directory_prefix(""),
  arpa_complain(ALL),
  write_mmap(NULL),
  write_method(WRITE_AFTER),
  include_vocab(true),
  rest_function(REST_MAX),
  prob_bits(8),
  backoff_bits(8),
  pointer_bhiksha_bits(22),
  load_method(util::POPULATE_OR_READ) {}

} // namespace ngram
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/enumerate_vocab.hh: --------------------------------------------------------------------------------
#ifndef LM_ENUMERATE_VOCAB_H
#define LM_ENUMERATE_VOCAB_H

#include "lm/word_index.hh"
#include "util/string_piece.hh"

namespace lm {

/* If you need the actual strings in the vocabulary, inherit from this class
 * and implement Add. Then put a pointer in Config.enumerate_vocab; it does
 * not take ownership. Add is called once per vocab word. index starts at 0
 * and increases by 1 each time. This is only used by the Model constructor;
 * the pointer is not retained by the class.
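 *
 * A minimal sketch (hypothetical subclass, not part of KenLM):
 *
 *   class PrintVocab : public lm::EnumerateVocab {
 *     public:
 *       void Add(lm::WordIndex index, const StringPiece &str) {
 *         std::cout << index << ' ' << str << '\n';
 *       }
 *   };
 *
 *   // PrintVocab printer; config.enumerate_vocab = &printer;
 *   // then construct the Model with that config.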
 */
class EnumerateVocab {
  public:
    virtual ~EnumerateVocab() {}

    virtual void Add(WordIndex index, const StringPiece &str) = 0;

  protected:
    EnumerateVocab() {}
};

} // namespace lm

#endif // LM_ENUMERATE_VOCAB_H

-------------------------------------------------------------------------------- /kenlm/lm/filter/CMakeLists.txt: --------------------------------------------------------------------------------
# This CMake file was created by Lane Schwartz

# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_FILTER_SOURCE
  ${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc
)

# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
add_library(kenlm_filter ${KENLM_FILTER_SOURCE})

if (NOT MSVC)
  set(THREADS pthread)
endif()

AddExes(EXES filter phrase_table_vocab
  LIBRARIES kenlm_filter kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS})

-------------------------------------------------------------------------------- /kenlm/lm/filter/arpa_io.cc: --------------------------------------------------------------------------------
#include "lm/filter/arpa_io.hh"
#include "util/file_piece.hh"
#include "util/string_stream.hh"

#include <iostream>
#include <ostream>
#include <string>
#include <vector>

#include <ctype.h>
#include <errno.h>
#include <string.h>

namespace lm {

ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
  *this << message;
}

ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
  *this << message << " in line " << line;
}

ARPAInputException::~ARPAInputException() throw() {}

// Seeking is the responsibility of the caller.
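// For example, number = {5, 7, 7} (the toy0 counts above) would emit:
//
//   \data\
//   ngram 1=5
//   ngram 2=7
//   ngram 3=7
//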
template <class Stream> void WriteCounts(Stream &out, const std::vector<uint64_t> &number) {
  out << "\n\\data\\\n";
  for (unsigned int i = 0; i < number.size(); ++i) {
    out << "ngram " << i+1 << "=" << number[i] << '\n';
  }
  out << '\n';
}

size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
  util::StringStream stream;
  WriteCounts(stream, number);
  return stream.str().size();
}

bool IsEntirelyWhiteSpace(const StringPiece &line) {
  for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) {
    if (!isspace(line.data()[i])) return false;
  }
  return true;
}

ARPAOutput::ARPAOutput(const char *name, size_t buffer_size)
  : file_backing_(util::CreateOrThrow(name)), file_(file_backing_.get(), buffer_size) {}

void ARPAOutput::ReserveForCounts(std::streampos reserve) {
  for (std::streampos i = 0; i < reserve; i += std::streampos(1)) {
    file_ << '\n';
  }
}

void ARPAOutput::BeginLength(unsigned int length) {
  file_ << '\\' << length << "-grams:" << '\n';
  fast_counter_ = 0;
}

void ARPAOutput::EndLength(unsigned int length) {
  file_ << '\n';
  if (length > counts_.size()) {
    counts_.resize(length);
  }
  counts_[length - 1] = fast_counter_;
}

void ARPAOutput::Finish() {
  file_ << "\\end\\\n";
  file_.seekp(0);
  WriteCounts(file_, counts_);
  file_.flush();
}

} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/filter/vocab.cc: --------------------------------------------------------------------------------
#include "lm/filter/vocab.hh"

#include <istream>
#include <iostream>

#include <ctype.h>

namespace lm {
namespace vocab {

void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) {
  in.exceptions(std::istream::badbit);
  std::string word;
  while (in >> word) {
    out.insert(word);
  }
}

namespace {
bool IsLineEnd(std::istream &in) {
  int got;
  do {
    got = in.get();
    if (!in) return true;
    if (got == '\n') return true;
  } while (isspace(got));
  in.unget();
  return false;
}
}// namespace

// Read space separated words in enter separated lines. These lines can be
// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
  in.exceptions(std::istream::badbit);
  unsigned int sentence = 0;
  bool used_id = false;
  std::string word;
  while (in >> word) {
    used_id = true;
    std::vector<unsigned int> &posting = out[word];
    if (posting.empty() || (posting.back() != sentence))
      posting.push_back(sentence);
    if (IsLineEnd(in)) {
      ++sentence;
      used_id = false;
    }
  }
  return sentence + used_id;
}

} // namespace vocab
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/filter/wrapper.hh: --------------------------------------------------------------------------------
#ifndef LM_FILTER_WRAPPER_H
#define LM_FILTER_WRAPPER_H

#include "util/string_piece.hh"

#include <algorithm>
#include <string>
#include <vector>

namespace lm {

// Provide a single-output filter with the same interface as a
// multiple-output filter so clients code against one interface.
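// Sketch of the Binary concept this wrapper expects (hypothetical filter, for
// illustration only): a vocabulary filter whose PassNGram(begin, end) returns
// true iff every token is in a keep-set; BinaryFilter<VocabKeep> then forwards
// each matching ARPA line unchanged to the output.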
template <class Binary> class BinaryFilter {
  public:
    // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
    explicit BinaryFilter(Binary binary) : binary_(binary) {}

    template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
      if (binary_.PassNGram(begin, end))
        output.AddNGram(line);
    }

    template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
      AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output);
    }

    void Flush() const {}

  private:
    Binary binary_;
};

// Wrap another filter to pay attention only to context words
template <class FilterT> class ContextFilter {
  public:
    typedef FilterT Filter;

    explicit ContextFilter(Filter &backend) : backend_(backend) {}

    template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
      // Find beginning of string or last space.
      const char *last_space;
      for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {}
      backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output);
    }

    void Flush() const {}

  private:
    Filter backend_;
};

} // namespace lm

#endif // LM_FILTER_WRAPPER_H
-------------------------------------------------------------------------------- /kenlm/lm/fragment_main.cc: --------------------------------------------------------------------------------
#include "lm/binary_format.hh"
#include "lm/model.hh"
#include "lm/left.hh"
#include "util/tokenize_piece.hh"

template <class Model> void Query(const char *name) {
  Model model(name);
  std::string line;
  lm::ngram::ChartState ignored;
  while (getline(std::cin, line)) {
    lm::ngram::RuleScore<Model> scorer(model, ignored);
    for (util::TokenIter<util::SingleCharacter, true> i(line, ' '); i; ++i) {
      scorer.Terminal(model.GetVocabulary().Index(*i));
    }
    std::cout << scorer.Finish() << '\n';
  }
}

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Expected model file name." << std::endl;
    return 1;
  }
  const char *name = argv[1];
  lm::ngram::ModelType model_type = lm::ngram::PROBING;
  lm::ngram::RecognizeBinary(name, model_type);
  switch (model_type) {
    case lm::ngram::PROBING:
      Query<lm::ngram::ProbingModel>(name);
      break;
    case lm::ngram::REST_PROBING:
      Query<lm::ngram::RestProbingModel>(name);
      break;
    default:
      std::cerr << "Model type not supported yet." << std::endl;
  }
}
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/backoff_matrix.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_BACKOFF_MATRIX_H
#define LM_INTERPOLATE_BACKOFF_MATRIX_H

#include <cstddef>
#include <vector>

namespace lm { namespace interpolate {

class BackoffMatrix {
  public:
    BackoffMatrix(std::size_t num_models, std::size_t max_order)
      : max_order_(max_order), backing_(num_models * max_order) {}

    float &Backoff(std::size_t model, std::size_t order_minus_1) {
      return backing_[model * max_order_ + order_minus_1];
    }

    float Backoff(std::size_t model, std::size_t order_minus_1) const {
      return backing_[model * max_order_ + order_minus_1];
    }

  private:
    const std::size_t max_order_;
    std::vector<float> backing_;
};

}} // namespaces

#endif // LM_INTERPOLATE_BACKOFF_MATRIX_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/backoff_reunification.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/backoff_reunification.hh"
#include "lm/common/model_buffer.hh"
#include "lm/common/ngram_stream.hh"
#include "lm/common/ngram.hh"
#include "lm/common/compare.hh"

#include <algorithm>
#include <cassert>

namespace lm {
namespace interpolate {

namespace {
class MergeWorker {
  public:
    MergeWorker(std::size_t order, const util::stream::ChainPosition &prob_pos,
                const util::stream::ChainPosition &boff_pos)
      : order_(order), prob_pos_(prob_pos), boff_pos_(boff_pos) {
      // nothing
    }

    void Run(const util::stream::ChainPosition &position) {
      lm::NGramStream<ProbBackoff> stream(position);

      lm::NGramStream<float> prob_input(prob_pos_);
      util::stream::Stream boff_input(boff_pos_);
      for (; prob_input && boff_input; ++prob_input, ++boff_input, ++stream) {
        std::copy(prob_input->begin(), prob_input->end(), stream->begin());
        stream->Value().prob = std::min(0.0f, prob_input->Value());
        stream->Value().backoff = *reinterpret_cast<float *>(boff_input.Get());
      }
      UTIL_THROW_IF2(prob_input || boff_input,
                     "Streams were not the same size during merging");
      stream.Poison();
    }

  private:
    std::size_t order_;
    util::stream::ChainPosition prob_pos_;
    util::stream::ChainPosition boff_pos_;
};
}

// Since we are *adding* something to the output chain here, we pass in the
// chain itself so that we can safely add a new step to the chain without
// creating a deadlock situation (since creating a new ChainPosition will
// make a new input/output pair---we want that position to be created
// *here*, not before).
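// Per-order dataflow (informal): prob_pos[i] supplies suffix-ordered
// (ngram ids, probability) records, boff_pos[i] supplies bare backoff floats,
// and MergeWorker zips them into complete NGram<ProbBackoff> records on
// output_chains[i].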
void ReunifyBackoff(util::stream::ChainPositions &prob_pos,
                    util::stream::ChainPositions &boff_pos,
                    util::stream::Chains &output_chains) {
  assert(prob_pos.size() == boff_pos.size());

  for (size_t i = 0; i < prob_pos.size(); ++i)
    output_chains[i] >> MergeWorker(i + 1, prob_pos[i], boff_pos[i]);
}
}
}
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/backoff_reunification.hh: --------------------------------------------------------------------------------
#ifndef KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_
#define KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_

#include "util/stream/stream.hh"
#include "util/stream/multi_stream.hh"

namespace lm {
namespace interpolate {

/**
 * The third pass for the offline log-linear interpolation algorithm. This
 * reads **suffix-ordered** probability values (ngram-id, float) and
 * **suffix-ordered** backoff values (float) and writes the merged contents
 * to the output.
 *
 * @param prob_pos The chain position for each order from which to read
 *    the probability values
 * @param boff_pos The chain position for each order from which to read
 *    the backoff values
 * @param output_chains The output chains for each order
 */
void ReunifyBackoff(util::stream::ChainPositions &prob_pos,
                    util::stream::ChainPositions &boff_pos,
                    util::stream::Chains &output_chains);
}
}
#endif
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/bounded_sequence_encoding.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/bounded_sequence_encoding.hh"

#include <algorithm>

namespace lm { namespace interpolate {

BoundedSequenceEncoding::BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end)
  : entries_(bound_end - bound_begin) {
  std::size_t full = 0;
  Entry entry;
  entry.shift = 0;
  for (const unsigned char *i = bound_begin; i != bound_end; ++i) {
    uint8_t length;
    if (*i <= 1) {
      length = 0;
    } else {
      length = sizeof(unsigned int) * 8 - __builtin_clz((unsigned int)*i);
    }
    entry.mask = (1ULL << length) - 1ULL;
    if (entry.shift + length > 64) {
      entry.shift = 0;
      entry.next = true;
      ++full;
    } else {
      entry.next = false;
    }
    entries_.push_back(entry);
    entry.shift += length;
  }
  byte_length_ = full * sizeof(uint64_t) + (entry.shift + 7) / 8;
  first_copy_ = std::min(byte_length_, sizeof(uint64_t));
  // Size of last uint64_t. Zero if empty, otherwise [1,8] depending on mod.
  overhang_ = byte_length_ == 0 ? 0 : ((byte_length_ - 1) % 8 + 1);
}

}} // namespaces
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/bounded_sequence_encoding.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
#define LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H

/* Encodes fixed-length sequences of integers with known bounds on each entry.
 * This is used to encode how far each model has backed off.
 * TODO: make this class efficient. Bit-level packing or multiply by bound and
 * add.
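 *
 * Worked example, following the width rule in the .cc file: bounds {1, 5, 2}
 * get field widths {0, 3, 2} bits, so the sequence (0, 4, 1) packs into the
 * single byte 0b01100 (4 in the low three bits, then 1 in the next two), and
 * EncodedLength() == 1.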
 */

#include "util/exception.hh"
#include "util/fixed_array.hh"

#if BYTE_ORDER != LITTLE_ENDIAN
#warning The interpolation code assumes little endian for now.
#endif

#include <algorithm>
#include <cstring>

namespace lm {
namespace interpolate {

class BoundedSequenceEncoding {
  public:
    // Encode [0, bound_begin[0]) x [0, bound_begin[1]) x [0, bound_begin[2]) x ... x [0, *(bound_end - 1)) for entries in the sequence
    BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end);

    std::size_t Entries() const { return entries_.size(); }

    std::size_t EncodedLength() const { return byte_length_; }

    void Encode(const unsigned char *from, void *to_void) const {
      uint8_t *to = static_cast<uint8_t*>(to_void);
      uint64_t cur = 0;
      for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++from) {
        if (UTIL_UNLIKELY(i->next)) {
          std::memcpy(to, &cur, sizeof(uint64_t));
          to += sizeof(uint64_t);
          cur = 0;
        }
        cur |= static_cast<uint64_t>(*from) << i->shift;
      }
      memcpy(to, &cur, overhang_);
    }

    void Decode(const void *from_void, unsigned char *to) const {
      const uint8_t *from = static_cast<const uint8_t*>(from_void);
      uint64_t cur = 0;
      memcpy(&cur, from, first_copy_);
      for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++to) {
        if (UTIL_UNLIKELY(i->next)) {
          from += sizeof(uint64_t);
          cur = 0;
          std::memcpy(&cur, from,
                      std::min<std::size_t>(sizeof(uint64_t), static_cast<const uint8_t*>(from_void) + byte_length_ - from));
        }
        *to = (cur >> i->shift) & i->mask;
      }
    }

  private:
    struct Entry {
      bool next;
      uint8_t shift;
      uint64_t mask;
    };
    util::FixedArray<Entry> entries_;
    std::size_t byte_length_;
    std::size_t first_copy_;
    std::size_t overhang_;
};


}} // namespaces

#endif // LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/interpolate_info.hh: --------------------------------------------------------------------------------
#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H
#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H

#include <cstddef>
#include <stdint.h>
#include <vector>

namespace lm {
namespace interpolate {

/**
 * Stores relevant info for interpolating several language models, for use
 * during the three-pass offline log-linear interpolation algorithm.
 */
struct InterpolateInfo {
  /**
   * @return the number of models being interpolated
   */
  std::size_t Models() const {
    return orders.size();
  }

  /**
   * The lambda (interpolation weight) for each model.
   */
  std::vector<float> lambdas;

  /**
   * The maximum ngram order for each model.
   */
  std::vector<uint8_t> orders;
};
}
}
#endif
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test1: --------------------------------------------------------------------------------
athiscutisfirst
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test2: --------------------------------------------------------------------------------
is thisthis afirst cuta first
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test3: --------------------------------------------------------------------------------
isisecd
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test_bad_order: --------------------------------------------------------------------------------
secdis
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test_no_unk: --------------------------------------------------------------------------------
toto
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_vocab.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_MERGE_VOCAB_H
#define LM_INTERPOLATE_MERGE_VOCAB_H

#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/fixed_array.hh"

namespace lm {

class EnumerateVocab;

namespace interpolate {

class UniversalVocab;

// The combined vocabulary is enumerated with enumerate.
// Returns the size of the combined vocabulary.
// Does not take ownership of vocab_files.
WordIndex MergeVocab(util::FixedArray<int> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate);

}} // namespaces

#endif // LM_INTERPOLATE_MERGE_VOCAB_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/normalize.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_NORMALIZE_H
#define LM_INTERPOLATE_NORMALIZE_H

#include "util/fixed_array.hh"

/* Pass 2:
 * - Multiply backoff weights by the backed off probabilities from pass 1.
 * - Compute the normalization factor Z.
 * - Send Z to the next highest order.
 * - Rewind and divide by Z.
 */

namespace util { namespace stream {
class ChainPositions;
class Chains;
}} // namespaces

namespace lm { namespace interpolate {

struct InterpolateInfo;

void Normalize(
    const InterpolateInfo &info,
    // Input full models for backoffs. Assumes that renumbering has been done. Suffix order.
    util::FixedArray<util::stream::ChainPositions> &models_by_order,
    // Input PartialProbGamma from MergeProbabilities. Context order.
    util::stream::Chains &merged_probabilities,
    // Output NGram with normalized probabilities. Context order.
    util::stream::Chains &probabilities_out,
    // Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order.
    util::stream::Chains &backoffs_out);

}} // namespaces

#endif // LM_INTERPOLATE_NORMALIZE_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/pipeline.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_PIPELINE_H
#define LM_INTERPOLATE_PIPELINE_H

#include "lm/common/model_buffer.hh"
#include "util/fixed_array.hh"
#include "util/stream/config.hh"

#include <cstddef>
#include <string>

namespace lm { namespace interpolate {

struct Config {
  std::vector<float> lambdas;
  util::stream::SortConfig sort;
  std::size_t BufferSize() const { return sort.buffer_size; }
};

void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file);

}} // namespaces
#endif // LM_INTERPOLATE_PIPELINE_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/split_worker.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/split_worker.hh"
#include "lm/common/ngram.hh"

namespace lm {
namespace interpolate {

SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
                         util::stream::Chain &sort_chain)
  : order_(order) {
  backoff_chain >> backoff_input_;
  sort_chain >> sort_input_;
}

void SplitWorker::Run(const util::stream::ChainPosition &position) {
  // input: ngram record (id, prob, and backoff)
  // output: a float to the backoff_input stream
  //         an ngram id and a float to the sort_input stream
  for (util::stream::Stream stream(position); stream; ++stream) {
    NGram<ProbBackoff> ngram(stream.Get(), order_);

    // write id and prob to the sort stream
    float prob = ngram.Value().prob;
    lm::WordIndex *out = reinterpret_cast<lm::WordIndex *>(sort_input_.Get());
    for (const lm::WordIndex *it = ngram.begin(); it != ngram.end(); ++it) {
      *out++ = *it;
    }
    *reinterpret_cast<float *>(out) = prob;
    ++sort_input_;

    // write backoff to the backoff output stream
    float boff = ngram.Value().backoff;
    *reinterpret_cast<float *>(backoff_input_.Get()) = boff;
    ++backoff_input_;
  }
  sort_input_.Poison();
  backoff_input_.Poison();
}

}
}
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/split_worker.hh: --------------------------------------------------------------------------------
#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_
#define KENLM_INTERPOLATE_SPLIT_WORKER_H_

#include "util/stream/chain.hh"
#include "util/stream/stream.hh"

namespace lm {
namespace interpolate {

class SplitWorker {
  public:
    /**
     * Constructs a split worker for a particular order. It writes the
     * split-off backoff values to the backoff chain and the ngram id and
     * probability to the sort chain for each ngram in the input.
     */
    SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
                util::stream::Chain &sort_chain);

    /**
     * The callback invoked to handle the input from the ngram intermediate
     * files.
     */
    void Run(const util::stream::ChainPosition& position);

  private:
    /**
     * The ngram order we are reading/writing for.
     */
    std::size_t order_;

    /**
     * The stream to write to for the backoff values.
     */
    util::stream::Stream backoff_input_;

    /**
     * The stream to write to for the ngram id + probability values.
     */
    util::stream::Stream sort_input_;
};
}
}
#endif
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_derivatives.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H
#define LM_INTERPOLATE_TUNE_DERIVATIVES_H

#include "lm/interpolate/tune_matrix.hh"

#include <cmath>
#include <cstddef>

namespace lm { namespace interpolate {

class Instances;

// Given tuning instances and model weights, computes the objective function (log probability), gradient, and Hessian.
// Returns log probability / number of instances.
Accum Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, const Vector &weights, Vector &gradient, Matrix &hessian);

}} // namespaces

#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H

-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_matrix.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_TUNE_MATRIX_H
#define LM_INTERPOLATE_TUNE_MATRIX_H

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains.
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#include <Eigen/Core>
#pragma GCC diagnostic pop

namespace lm { namespace interpolate {

typedef Eigen::MatrixXf Matrix;
typedef Eigen::VectorXf Vector;

typedef Matrix::Scalar Accum;

}} // namespaces
#endif // LM_INTERPOLATE_TUNE_MATRIX_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_weights.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/tune_weights.hh"

#include "lm/interpolate/tune_derivatives.hh"
#include "lm/interpolate/tune_instances.hh"

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains.
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#include <Eigen/Dense>
#pragma GCC diagnostic pop
#include <boost/program_options.hpp>

#include <iostream>

namespace lm { namespace interpolate {
void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config, std::vector<float> &weights_out) {
  Instances instances(tune_file, model_names, config);
  Vector weights = Vector::Constant(model_names.size(), 1.0 / model_names.size());
  Vector gradient;
  Matrix hessian;
  for (std::size_t iteration = 0; iteration < 10 /*TODO fancy stopping criteria */; ++iteration) {
    std::cerr << "Iteration " << iteration << ": weights =";
    for (Vector::Index i = 0; i < weights.rows(); ++i) {
      std::cerr << ' ' << weights(i);
    }
    std::cerr << std::endl;
    std::cerr << "Perplexity = " << Derivatives(instances, weights, gradient, hessian) << std::endl;
    // TODO: 1.0 step size was too big and it kept getting unstable. More math.
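    // Damped Newton update: weights <- weights - 0.7 * H^{-1} * g, with the
    // gradient g and Hessian H of the log-probability objective computed by
    // Derivatives() above.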
    weights -= 0.7 * hessian.inverse() * gradient;
  }
  weights_out.assign(weights.data(), weights.data() + weights.size());
}
}} // namespaces
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_weights.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_TUNE_WEIGHTS_H
#define LM_INTERPOLATE_TUNE_WEIGHTS_H

#include "util/string_piece.hh"

#include <vector>

namespace lm { namespace interpolate {
struct InstancesConfig;

// Run a tuning loop, producing weights as output.
void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config, std::vector<float> &weights);

}} // namespaces
#endif // LM_INTERPOLATE_TUNE_WEIGHTS_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/universal_vocab.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/universal_vocab.hh"

namespace lm {
namespace interpolate {

UniversalVocab::UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes) {
  model_index_map_.resize(model_vocab_sizes.size());
  for (size_t i = 0; i < model_vocab_sizes.size(); ++i) {
    model_index_map_[i].resize(model_vocab_sizes[i]);
  }
}

}} // namespaces
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/universal_vocab.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_UNIVERSAL_VOCAB_H
#define LM_INTERPOLATE_UNIVERSAL_VOCAB_H

#include "lm/word_index.hh"

#include <cstddef>
#include <vector>

namespace lm {
namespace interpolate {

class UniversalVocab {
  public:
    explicit UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes);

    // GetUniversalIndex takes the model number and index for the specific
    // model and returns the universal model number
    WordIndex GetUniversalIdx(std::size_t model_num, WordIndex model_word_index) const {
      return model_index_map_[model_num][model_word_index];
    }

    const WordIndex *Mapping(std::size_t model) const {
      return &*model_index_map_[model].begin();
    }

    WordIndex SlowConvertToModel(std::size_t model, WordIndex index) const {
      std::vector<WordIndex>::const_iterator i = lower_bound(model_index_map_[model].begin(), model_index_map_[model].end(), index);
      if (i == model_index_map_[model].end() || *i != index) return 0;
      return i - model_index_map_[model].begin();
    }

    void InsertUniversalIdx(std::size_t model_num, WordIndex word_index,
                            WordIndex universal_word_index) {
      model_index_map_[model_num][word_index] = universal_word_index;
    }

  private:
    std::vector<std::vector<WordIndex> > model_index_map_;
};

} // namespace interpolate
} // namespace lm

#endif // LM_INTERPOLATE_UNIVERSAL_VOCAB_H
-------------------------------------------------------------------------------- /kenlm/lm/lm_exception.cc: --------------------------------------------------------------------------------
#include "lm/lm_exception.hh"

#include <cerrno>
#include <cstdio>

namespace lm {

ConfigException::ConfigException() throw() {}
ConfigException::~ConfigException() throw() {}

LoadException::LoadException() throw() {}
LoadException::~LoadException() throw() {}

FormatLoadException::FormatLoadException() throw() {}
FormatLoadException::~FormatLoadException() throw() {}

VocabLoadException::VocabLoadException() throw() {}
VocabLoadException::~VocabLoadException() throw() {}

SpecialWordMissingException::SpecialWordMissingException() throw() {}
SpecialWordMissingException::~SpecialWordMissingException() throw() {}

} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/lm_exception.hh: --------------------------------------------------------------------------------
#ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H

// Named to avoid conflict with util/exception.hh.

#include "util/exception.hh"
#include "util/string_piece.hh"

#include <exception>
#include <string>

namespace lm {

typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction;

class ConfigException : public util::Exception {
  public:
    ConfigException() throw();
    ~ConfigException() throw();
};

class LoadException : public util::Exception {
  public:
    virtual ~LoadException() throw();

  protected:
    LoadException() throw();
};

class FormatLoadException : public LoadException {
  public:
    FormatLoadException() throw();
    ~FormatLoadException() throw();
};

class VocabLoadException : public LoadException {
  public:
    virtual ~VocabLoadException() throw();
    VocabLoadException() throw();
};

class SpecialWordMissingException : public VocabLoadException {
  public:
    explicit SpecialWordMissingException() throw();
    ~SpecialWordMissingException() throw();
};

} // namespace lm

#endif // LM_LM_EXCEPTION
-------------------------------------------------------------------------------- /kenlm/lm/max_order.hh: --------------------------------------------------------------------------------
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
 * If not, this is the default maximum order.
 * Having this limit means that State can be
 * (kMaxOrder - 1) * sizeof(float) bytes instead of
 * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
 */
#ifndef KENLM_ORDER_MESSAGE
#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
#endif

#endif // LM_MAX_ORDER_H
-------------------------------------------------------------------------------- /kenlm/lm/model_type.hh: --------------------------------------------------------------------------------
#ifndef LM_MODEL_TYPE_H
#define LM_MODEL_TYPE_H

namespace lm {
namespace ngram {

/* Not the best numbering system, but it grew this way for historical reasons
 * and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;

// Historical names.
const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;
const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE;
const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE;

const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE - TRIE);
const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE - TRIE);

} // namespace ngram
} // namespace lm
#endif // LM_MODEL_TYPE_H
-------------------------------------------------------------------------------- /kenlm/lm/return.hh: --------------------------------------------------------------------------------
#ifndef LM_RETURN_H
#define LM_RETURN_H

#include <stdint.h>

namespace lm {
/* Structure returned by scoring routines. */
struct FullScoreReturn {
  // log10 probability
  float prob;

  /* The length of n-gram matched. Do not use this for recombination.
   * Consider a model containing only the following n-grams:
   * -1 foo
   * -3.14 bar
   * -2.718 baz -5
   * -6 foo bar
   *
   * If you score ``bar'' then ngram_length is 1 and recombination state is the
   * empty string because bar has zero backoff and does not extend to the
   * right.
   * If you score ``foo'' then ngram_length is 1 and recombination state is
   * ``foo''.
   *
   * Ideally, keep output states around and compare them. Failing that,
   * get out_state.ValidLength() and use that length for recombination.
   */
  unsigned char ngram_length;

  /* Left extension information. If independent_left is set, then prob is
   * independent of words to the left (up to additional backoff). Otherwise,
   * extend_left indicates how to efficiently extend further to the left.
   */
  bool independent_left;
  uint64_t extend_left; // Defined only if independent_left

  // Rest cost for extension to the left.
  float rest;
};

} // namespace lm
#endif // LM_RETURN_H
-------------------------------------------------------------------------------- /kenlm/lm/sizes.hh: --------------------------------------------------------------------------------
#ifndef LM_SIZES_H
#define LM_SIZES_H

#include <vector>

#include <stdint.h>

namespace lm { namespace ngram {

struct Config;

void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config);
void ShowSizes(const std::vector<uint64_t> &counts);
void ShowSizes(const char *file, const lm::ngram::Config &config);

}} // namespaces
#endif // LM_SIZES_H
-------------------------------------------------------------------------------- /kenlm/lm/value_build.cc: --------------------------------------------------------------------------------
#include "lm/value_build.hh"

#include "lm/model.hh"
#include "lm/read_arpa.hh"

namespace lm {
namespace ngram {

template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) {
  UTIL_THROW_IF(config.rest_lower_files.size() != order - 1, ConfigException, "This model has order " << order << " so there should be " << (order - 1) << " lower-order models for rest cost purposes.");
  Config for_lower = config;
  for_lower.write_mmap = NULL;
  for_lower.rest_lower_files.clear();

  // Unigram models aren't supported, so this is a custom loader.
  // TODO: optimize the unigram loading?
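  // The block below reads only the unigram section of the first rest-cost
  // file: parse the \data\ counts, require order 1, then record each word's
  // probability in unigrams_, indexed by this model's vocabulary ids.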
  {
    util::FilePiece uni(config.rest_lower_files[0].c_str());
    std::vector<uint64_t> number;
    ReadARPACounts(uni, number);
    UTIL_THROW_IF(number.size() != 1, FormatLoadException, "Expected the unigram model to have order 1, not " << number.size());
    ReadNGramHeader(uni, 1);
    unigrams_.resize(number[0]);
    unigrams_[0] = config.unknown_missing_logprob;
    PositiveProbWarn warn;
    for (uint64_t i = 0; i < number[0]; ++i) {
      WordIndex w;
      Prob entry;
      ReadNGram(uni, 1, vocab, &w, entry, warn);
      unigrams_[w] = entry.prob;
    }
  }

  try {
    for (unsigned int i = 2; i < order; ++i) {
      models_.push_back(new Model(config.rest_lower_files[i - 1].c_str(), for_lower));
      UTIL_THROW_IF(models_.back()->Order() != i, FormatLoadException, "Lower order file " << config.rest_lower_files[i-1] << " should have order " << i);
    }
  } catch (...) {
    for (typename std::vector<const Model*>::const_iterator i = models_.begin(); i != models_.end(); ++i) {
      delete *i;
    }
    models_.clear();
    throw;
  }

  // TODO: force/check same vocab.
}

template <class Model> LowerRestBuild<Model>::~LowerRestBuild() {
  for (typename std::vector<const Model*>::const_iterator i = models_.begin(); i != models_.end(); ++i) {
    delete *i;
  }
}

template class LowerRestBuild<ProbingModel>;

} // namespace ngram
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/virtual_interface.cc: --------------------------------------------------------------------------------
#include "lm/virtual_interface.hh"

#include "lm/lm_exception.hh"

namespace lm {
namespace base {

Vocabulary::~Vocabulary() {}

void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) {
  begin_sentence_ = begin_sentence;
  end_sentence_ = end_sentence;
  not_found_ = not_found;
}

Model::~Model() {}

} // namespace base
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/weights.hh: --------------------------------------------------------------------------------
#ifndef LM_WEIGHTS_H
#define LM_WEIGHTS_H

// Weights for n-grams. Probability and possibly a backoff.

namespace lm {
struct Prob {
  float prob;
};
// No inheritance so this will be a POD.
struct ProbBackoff {
  float prob;
  float backoff;
};
struct RestWeights {
  float prob;
  float backoff;
  float rest;
};

} // namespace lm
#endif // LM_WEIGHTS_H
-------------------------------------------------------------------------------- /kenlm/lm/word_index.hh: --------------------------------------------------------------------------------
// Separate header because this is used often.
#ifndef LM_WORD_INDEX_H
#define LM_WORD_INDEX_H

#include <limits.h>

namespace lm {
typedef unsigned int WordIndex;
const WordIndex kMaxWordIndex = UINT_MAX;
const WordIndex kUNK = 0;
} // namespace lm

typedef lm::WordIndex LMWordIndex;

#endif
-------------------------------------------------------------------------------- /kenlm/lm/wrappers/README: --------------------------------------------------------------------------------
This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed.

NPLM is a work in progress.
-------------------------------------------------------------------------------- /kenlm/lm/wrappers/nplm.hh: --------------------------------------------------------------------------------
#ifndef LM_WRAPPERS_NPLM_H
#define LM_WRAPPERS_NPLM_H

#include "lm/facade.hh"
#include "lm/max_order.hh"
#include "util/string_piece.hh"

#include <boost/thread/tss.hpp>
#include <boost/scoped_ptr.hpp>

/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
 * and Victoria Fossum."
 * http://nlg.isi.edu/software/nplm/
 */

namespace nplm {
class vocabulary;
class neuralLM;
} // namespace nplm

namespace lm {
namespace np {

class Vocabulary : public base::Vocabulary {
  public:
    Vocabulary(const nplm::vocabulary &vocab);

    ~Vocabulary();

    WordIndex Index(const std::string &str) const;

    // TODO: lobby them to support StringPiece
    WordIndex Index(const StringPiece &str) const {
      return Index(std::string(str.data(), str.size()));
    }

    lm::WordIndex NullWord() const { return null_word_; }

  private:
    const nplm::vocabulary &vocab_;

    const lm::WordIndex null_word_;
};

// Sorry for imposing my limitations on your code.
#define NPLM_MAX_ORDER 7

struct State {
  WordIndex words[NPLM_MAX_ORDER - 1];
};

class Backend;

class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
  private:
    typedef lm::base::ModelFacade<Model, State, Vocabulary> P;

  public:
    // Does this look like an NPLM?
    static bool Recognize(const std::string &file);

    explicit Model(const std::string &file, std::size_t cache_size = 1 << 20);

    ~Model();

    FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;

    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

  private:
    boost::scoped_ptr<nplm::neuralLM> base_instance_;

    mutable boost::thread_specific_ptr<Backend> backend_;

    Vocabulary vocab_;

    lm::WordIndex null_word_;

    const std::size_t cache_size_;
};

} // namespace np
} // namespace lm

#endif // LM_WRAPPERS_NPLM_H
-------------------------------------------------------------------------------- /kenlm/python/_kenlm.pxd: --------------------------------------------------------------------------------
cdef extern from "lm/word_index.hh" namespace "lm":
    ctypedef unsigned WordIndex

cdef extern from "lm/return.hh" namespace "lm":
    cdef struct FullScoreReturn:
        float prob
        unsigned char ngram_length

cdef extern from "lm/state.hh" namespace "lm::ngram":
    cdef cppclass State :
        int Compare(const State &other) const

    int hash_value(const State &state)

cdef extern from "lm/virtual_interface.hh" namespace "lm::base":
    cdef cppclass Vocabulary:
        WordIndex Index(char*)
        WordIndex BeginSentence()
        WordIndex EndSentence()
        WordIndex NotFound()

    ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary"

    cdef cppclass Model:
        void BeginSentenceWrite(void *)
        void NullContextWrite(void *)
        unsigned int Order()
        const_Vocabulary& BaseVocabulary()
        float BaseScore(void *in_state, WordIndex new_word, void *out_state)
        FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state)

cdef extern from "util/mmap.hh" namespace "util":
    cdef enum LoadMethod:
        LAZY
        POPULATE_OR_LAZY
        POPULATE_OR_READ
        READ
        PARALLEL_READ

cdef extern from "lm/config.hh" namespace "lm::ngram":
    cdef cppclass Config:
        Config()
        float probing_multiplier
        LoadMethod load_method

cdef extern from "lm/model.hh" namespace "lm::ngram":
    cdef Model *LoadVirtual(char *, Config &config) except +
    #default constructor
    cdef Model *LoadVirtual(char *) except +

-------------------------------------------------------------------------------- /kenlm/python/example.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
import os
import kenlm

LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
model = kenlm.Model(LM)
print('{0}-gram model'.format(model.order))

sentence = 'language modeling is fun .'
print(sentence)
print(model.score(sentence))

# Check that total full score = direct score
def score(s):
    return sum(prob for prob, _, _ in model.full_scores(s))

assert (abs(score(sentence) - model.score(sentence)) < 1e-3)

# Show scores and n-gram matches
words = ['<s>'] + sentence.split() + ['</s>']
for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
    print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))
    if oov:
        print('\t"{0}" is an OOV'.format(words[i+1]))

# Find out-of-vocabulary words
for w in words:
    if not w in model:
        print('"{0}" is an OOV'.format(w))

#Stateful query
state = kenlm.State()
state2 = kenlm.State()
#Use <s> as context. If you don't want <s>, use model.NullContextWrite(state).
model.BeginSentenceWrite(state)
accum = 0.0
accum += model.BaseScore(state, "a", state2)
accum += model.BaseScore(state2, "sentence", state)
#score defaults to bos = True and eos = True. Here we'll check without the end
#of sentence marker.
assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3)
accum += model.BaseScore(state, "</s>", state2)
assert (abs(accum - model.score("a sentence")) < 1e-3)
-------------------------------------------------------------------------------- /kenlm/setup.py: --------------------------------------------------------------------------------
from setuptools import setup, Extension
import glob
import platform
import os

#Does gcc compile with this header and library?
def compile_test(header, library):
    dummy_path = os.path.join(os.path.dirname(__file__), "dummy")
    command = "bash -c \"g++ -include " + header + " -l" + library + " -x c++ - <<<'int main() {}' -o " + dummy_path + " >/dev/null 2>/dev/null && rm " + dummy_path + " 2>/dev/null\""
    return os.system(command) == 0


FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc')
FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))]

LIBS = ['stdc++']
if platform.system() != 'Darwin':
    LIBS.append('rt')

#We don't need -std=c++11 but python seems to be compiled with it now. https://github.com/kpu/kenlm/issues/86
ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11']

if compile_test('zlib.h', 'z'):
    ARGS.append('-DHAVE_ZLIB')
    LIBS.append('z')

if compile_test('bzlib.h', 'bz2'):
    ARGS.append('-DHAVE_BZLIB')
    LIBS.append('bz2')

if compile_test('lzma.h', 'lzma'):
    ARGS.append('-DHAVE_XZLIB')
    LIBS.append('lzma')

ext_modules = [
    Extension(name='kenlm',
        sources=FILES + ['python/kenlm.cpp'],
        language='C++',
        include_dirs=['.'],
        libraries=LIBS,
        extra_compile_args=ARGS)
]

setup(
    name='kenlm',
    ext_modules=ext_modules,
    include_package_data=True,
)
-------------------------------------------------------------------------------- /kenlm/util/bit_packing.cc: --------------------------------------------------------------------------------
#include "util/bit_packing.hh"
#include "util/exception.hh"

#include <cstring>

namespace util {

namespace {
template <bool> struct StaticCheck {};
template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };

// If your float isn't 4 bytes, we're hosed.
typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;

} // namespace

uint8_t RequiredBits(uint64_t max_value) {
  if (!max_value) return 0;
  uint8_t ret = 1;
  while (max_value >>= 1) ++ret;
  return ret;
}

void BitPackingSanity() {
  const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
  if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
  char mem[57+8];
  memset(mem, 0, sizeof(mem));
  const uint64_t test57 = 0x123456789abcdefULL;
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    WriteInt57(mem, b, 57, test57);
  }
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1))
      UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler.");
  }
  // TODO: more checks.
}

} // namespace util
-------------------------------------------------------------------------------- /kenlm/util/bit_packing_test.cc: --------------------------------------------------------------------------------
#include "util/bit_packing.hh"

#define BOOST_TEST_MODULE BitPackingTest
#include <boost/test/unit_test.hpp>

#include <cstring>

namespace util {
namespace {

const uint64_t test57 = 0x123456789abcdefULL;
const uint32_t test25 = 0x1234567;

BOOST_AUTO_TEST_CASE(ZeroBit57) {
  char mem[16];
  memset(mem, 0, sizeof(mem));
  WriteInt57(mem, 0, 57, test57);
  BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
}

BOOST_AUTO_TEST_CASE(EachBit57) {
  char mem[16];
  for (uint8_t b = 0; b < 8; ++b) {
    memset(mem, 0, sizeof(mem));
    WriteInt57(mem, b, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
}

BOOST_AUTO_TEST_CASE(Consecutive57) {
  char mem[57+8];
  memset(mem, 0, sizeof(mem));
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    WriteInt57(mem, b, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
}

BOOST_AUTO_TEST_CASE(Consecutive25) {
  char mem[25+8];
  memset(mem, 0, sizeof(mem));
  for (uint64_t b = 0; b < 25 * 8; b += 25) {
    WriteInt25(mem, b, 25, test25);
    BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
  }
  for (uint64_t b = 0; b < 25 * 8; b += 25) {
    BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
  }
}

BOOST_AUTO_TEST_CASE(Sanity) {
  BitPackingSanity();
}

} // namespace
} // namespace util
-------------------------------------------------------------------------------- /kenlm/util/cat_compressed_main.cc: --------------------------------------------------------------------------------
// Like cat but interprets compressed files.
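// Usage sketch: cat_compressed file1.gz file2.bz2 > out.txt, or run with no
// arguments to decompress stdin.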
#include "util/file.hh"
#include "util/read_compressed.hh"

#include <cstring>
#include <iostream>

namespace {
const std::size_t kBufSize = 16384;
void Copy(util::ReadCompressed &from, int to) {
  util::scoped_malloc buffer(util::MallocOrThrow(kBufSize));
  while (std::size_t amount = from.Read(buffer.get(), kBufSize)) {
    util::WriteOrThrow(to, buffer.get(), amount);
  }
}
} // namespace

int main(int argc, char *argv[]) {
  // Lane Schwartz likes -h and --help
  for (int i = 1; i < argc; ++i) {
    char *arg = argv[i];
    if (!strcmp(arg, "--")) break;
    if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
      std::cerr <<
        "A cat implementation that interprets compressed files.\n"
        "Usage: " << argv[0] << " [file1] [file2] ...\n"
        "If no file is provided, then stdin is read.\n";
      return 1;
    }
  }

  try {
    if (argc == 1) {
      util::ReadCompressed in(0);
      Copy(in, 1);
    } else {
      for (int i = 1; i < argc; ++i) {
        util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
        Copy(in, 1);
      }
    }
  } catch (const std::exception &e) {
    std::cerr << e.what() << std::endl;
    return 2;
  }
  return 0;
}
-------------------------------------------------------------------------------- /kenlm/util/double-conversion/CMakeLists.txt: --------------------------------------------------------------------------------
# This CMake file was created by Lane Schwartz

# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to allow CMake files in the parent directory
# to see this variable definition, we set PARENT_SCOPE.
#
# In order to set correct paths to these files
# when this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_UTIL_DOUBLECONVERSION_SOURCE
  ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc
  PARENT_SCOPE)

-------------------------------------------------------------------------------- /kenlm/util/double-conversion/LICENSE: --------------------------------------------------------------------------------
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimer in the documentation and/or other materials provided
      with the distribution.
    * Neither the name of Google Inc. nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
--------------------------------------------------------------------------------
/kenlm/util/double-conversion/strtod.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | //     * Redistributions of source code must retain the above copyright
7 | //       notice, this list of conditions and the following disclaimer.
8 | //     * Redistributions in binary form must reproduce the above
9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_STRTOD_H_
29 | #define DOUBLE_CONVERSION_STRTOD_H_
30 | 
31 | #include "utils.h"
32 | 
33 | namespace double_conversion {
34 | 
35 | // The buffer must only contain digits in the range [0-9]. It must not
36 | // contain a dot or a sign. It must not start with '0', and must not be empty.
37 | double Strtod(Vector<const char> buffer, int exponent);
38 | 
39 | // The buffer must only contain digits in the range [0-9]. It must not
40 | // contain a dot or a sign. It must not start with '0', and must not be empty.
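// e.g. (editor's sketch of the digits-times-exponent convention shared by
// Strtod and Strtof, assuming the usual double-conversion semantics):
//   Strtod(Vector<const char>("12345", 5), -2) should yield 12345 * 10^-2 = 123.45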
41 | float Strtof(Vector<const char> buffer, int exponent);
42 | 
43 | } // namespace double_conversion
44 | 
45 | #endif // DOUBLE_CONVERSION_STRTOD_H_
46 | 
--------------------------------------------------------------------------------
/kenlm/util/ersatz_progress.cc:
--------------------------------------------------------------------------------
1 | #include "util/ersatz_progress.hh"
2 | 
3 | #include <algorithm>
4 | #include <iostream>
5 | #include <limits>
6 | #include <string>
7 | 
8 | namespace util {
9 | 
10 | namespace { const unsigned char kWidth = 100; }
11 | 
12 | const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
13 | 
14 | ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), out_(NULL) {}
15 | 
16 | ErsatzProgress::~ErsatzProgress() {
17 |   if (out_) Finished();
18 | }
19 | 
20 | ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
21 |   : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
22 |   if (!out_) {
23 |     next_ = std::numeric_limits<uint64_t>::max();
24 |     return;
25 |   }
26 |   if (!message.empty()) *out_ << message << '\n';
27 |   *out_ << kProgressBanner;
28 | }
29 | 
30 | void ErsatzProgress::Milestone() {
31 |   if (!out_) { current_ = 0; return; }
32 |   if (!complete_) return;
33 |   unsigned char stone = std::min(static_cast<uint64_t>(kWidth), (current_ * kWidth) / complete_);
34 | 
35 |   for (; stones_written_ < stone; ++stones_written_) {
36 |     (*out_) << '*';
37 |   }
38 |   if (stone == kWidth) {
39 |     (*out_) << std::endl;
40 |     next_ = std::numeric_limits<uint64_t>::max();
41 |     out_ = NULL;
42 |   } else {
43 |     next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
44 |   }
45 | }
46 | 
47 | } // namespace util
48 | 
--------------------------------------------------------------------------------
/kenlm/util/ersatz_progress.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_ERSATZ_PROGRESS_H
2 | #define UTIL_ERSATZ_PROGRESS_H
3 | 
4 | #include <iostream>
5 | #include <string>
6 | #include <stdint.h>
7 | 
8 | // Ersatz version of boost::progress so core language model doesn't depend on
9 | // boost.  Also adds option to print nothing.
10 | 
11 | namespace util {
12 | 
13 | extern const char kProgressBanner[];
14 | 
15 | class ErsatzProgress {
16 |   public:
17 |     // No output.
18 |     ErsatzProgress();
19 | 
20 |     // Null means no output.  The null value is useful for passing along the ostream pointer from another caller.
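    // A hypothetical call site (editor's sketch; total_records is assumed):
    //   ErsatzProgress bar(total_records, &std::cerr, "Scanning corpus");
    //   for (uint64_t i = 0; i < total_records; ++i, ++bar) { /* work */ }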
21 |     explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
22 | 
23 | #if __cplusplus >= 201103L
24 |     ErsatzProgress(ErsatzProgress &&from) noexcept : current_(from.current_), next_(from.next_), complete_(from.complete_), stones_written_(from.stones_written_), out_(from.out_) {
25 |       from.out_ = nullptr;
26 |       from.next_ = (uint64_t)-1;
27 |     }
28 | #endif
29 | 
30 |     ~ErsatzProgress();
31 | 
32 |     ErsatzProgress &operator++() {
33 |       if (++current_ >= next_) Milestone();
34 |       return *this;
35 |     }
36 | 
37 |     ErsatzProgress &operator+=(uint64_t amount) {
38 |       if ((current_ += amount) >= next_) Milestone();
39 |       return *this;
40 |     }
41 | 
42 |     void Set(uint64_t to) {
43 |       if ((current_ = to) >= next_) Milestone();
44 |     }
45 | 
46 |     void Finished() {
47 |       Set(complete_);
48 |     }
49 | 
50 |   private:
51 |     void Milestone();
52 | 
53 |     uint64_t current_, next_, complete_;
54 |     unsigned char stones_written_;
55 |     std::ostream *out_;
56 | 
57 |     // noncopyable
58 |     ErsatzProgress(const ErsatzProgress &other);
59 |     ErsatzProgress &operator=(const ErsatzProgress &other);
60 | };
61 | 
62 | } // namespace util
63 | 
64 | #endif // UTIL_ERSATZ_PROGRESS_H
65 | 
--------------------------------------------------------------------------------
/kenlm/util/float_to_string.cc:
--------------------------------------------------------------------------------
1 | #include "util/float_to_string.hh"
2 | 
3 | #include "util/double-conversion/double-conversion.h"
4 | #include "util/double-conversion/utils.h"
5 | 
6 | namespace util {
7 | namespace {
8 | const double_conversion::DoubleToStringConverter kConverter(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0);
9 | } // namespace
10 | 
11 | char *ToString(double value, char *to) {
12 |   double_conversion::StringBuilder builder(to, ToStringBuf<double>::kBytes);
13 |   kConverter.ToShortest(value, &builder);
14 |   return &to[builder.position()];
15 | }
16 | 
17 | char *ToString(float value, char *to) {
18 |   double_conversion::StringBuilder builder(to, ToStringBuf<float>::kBytes);
19 |   kConverter.ToShortestSingle(value, &builder);
20 |   return &to[builder.position()];
21 | }
22 | 
23 | } // namespace util
24 | 
--------------------------------------------------------------------------------
/kenlm/util/float_to_string.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_FLOAT_TO_STRING_H
2 | #define UTIL_FLOAT_TO_STRING_H
3 | 
4 | // Just for ToStringBuf
5 | #include "util/integer_to_string.hh"
6 | 
7 | namespace util {
8 | 
9 | template <> struct ToStringBuf<double> {
10 |   // DoubleToStringConverter::kBase10MaximalLength + 1 for null paranoia.
11 |   static const unsigned kBytes = 19;
12 | };
13 | 
14 | // Single wasn't documented in double conversion, so be conservative and
15 | // say the same as double.
16 | template <> struct ToStringBuf<float> {
17 |   static const unsigned kBytes = 19;
18 | };
19 | 
20 | char *ToString(double value, char *to);
21 | char *ToString(float value, char *to);
22 | 
23 | } // namespace util
24 | 
25 | #endif // UTIL_FLOAT_TO_STRING_H
26 | 
--------------------------------------------------------------------------------
/kenlm/util/getopt.c:
--------------------------------------------------------------------------------
1 | /*
2 | POSIX getopt for Windows
3 | 
4 | AT&T Public License
5 | 
6 | Code given out at the 1985 UNIFORUM conference in Dallas.
7 | */
8 | 
9 | #ifndef __GNUC__
10 | 
11 | #include "getopt.hh"
12 | #include <stdio.h>
13 | #include <string.h>
14 | 
15 | #define NULL 0
16 | #define EOF (-1)
17 | #define ERR(s, c) if(opterr){\
18 |     char errbuf[2];\
19 |     errbuf[0] = c; errbuf[1] = '\n';\
20 |     fputs(argv[0], stderr);\
21 |     fputs(s, stderr);\
22 |     fputc(c, stderr);}
23 | //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
24 | //(void) write(2, s, (unsigned)strlen(s));\
25 | //(void) write(2, errbuf, 2);}
26 | 
27 | int opterr = 1;
28 | int optind = 1;
29 | int optopt;
30 | char *optarg;
31 | 
32 | int
33 | getopt(argc, argv, opts)
34 | int argc;
35 | char **argv, *opts;
36 | {
37 |     static int sp = 1;
38 |     register int c;
39 |     register char *cp;
40 | 
41 |     if(sp == 1)
42 |         if(optind >= argc ||
43 |            argv[optind][0] != '-' || argv[optind][1] == '\0')
44 |             return(EOF);
45 |         else if(strcmp(argv[optind], "--") == NULL) {
46 |             optind++;
47 |             return(EOF);
48 |         }
49 |     optopt = c = argv[optind][sp];
50 |     if(c == ':' || (cp=strchr(opts, c)) == NULL) {
51 |         ERR(": illegal option -- ", c);
52 |         if(argv[optind][++sp] == '\0') {
53 |             optind++;
54 |             sp = 1;
55 |         }
56 |         return('?');
57 |     }
58 |     if(*++cp == ':') {
59 |         if(argv[optind][sp+1] != '\0')
60 |             optarg = &argv[optind++][sp+1];
61 |         else if(++optind >= argc) {
62 |             ERR(": option requires an argument -- ", c);
63 |             sp = 1;
64 |             return('?');
65 |         } else
66 |             optarg = argv[optind++];
67 |         sp = 1;
68 |     } else {
69 |         if(argv[optind][++sp] == '\0') {
70 |             sp = 1;
71 |             optind++;
72 |         }
73 |         optarg = NULL;
74 |     }
75 |     return(c);
76 | }
77 | 
78 | #endif  /* __GNUC__ */
79 | 
--------------------------------------------------------------------------------
/kenlm/util/getopt.hh:
--------------------------------------------------------------------------------
1 | /*
2 | POSIX getopt for Windows
3 | 
4 | AT&T Public License
5 | 
6 | Code given out at the 1985 UNIFORUM conference in Dallas.
7 | */
8 | 
9 | #ifdef __GNUC__
10 | #include <getopt.h>
11 | #endif
12 | #ifndef __GNUC__
13 | 
14 | #ifndef UTIL_GETOPT_H
15 | #define UTIL_GETOPT_H
16 | 
17 | #ifdef __cplusplus
18 | extern "C" {
19 | #endif
20 | 
21 | extern int opterr;
22 | extern int optind;
23 | extern int optopt;
24 | extern char *optarg;
25 | extern int getopt(int argc, char **argv, char *opts);
26 | 
27 | #ifdef __cplusplus
28 | }
29 | #endif
30 | 
31 | #endif  /* UTIL_GETOPT_H */
32 | #endif  /* __GNUC__ */
33 | 
34 | 
--------------------------------------------------------------------------------
/kenlm/util/have.hh:
--------------------------------------------------------------------------------
1 | /* Optional packages.  You might want to integrate this with your build system e.g. config.h from ./configure. */
2 | #ifndef UTIL_HAVE_H
3 | #define UTIL_HAVE_H
4 | 
5 | #ifdef HAVE_CONFIG_H
6 | #include "config.h"
7 | #endif
8 | 
9 | #ifndef HAVE_ICU
10 | //#define HAVE_ICU
11 | #endif
12 | 
13 | #endif // UTIL_HAVE_H
14 | 
--------------------------------------------------------------------------------
/kenlm/util/integer_to_string.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_INTEGER_TO_STRING_H
2 | #define UTIL_INTEGER_TO_STRING_H
3 | #include <cstddef>
4 | #include <stdint.h>
5 | 
6 | namespace util {
7 | 
8 | /* These functions convert integers to strings and return the end pointer.
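 * e.g. (editor's sketch, mirroring the unit test further below):
 *   char buf[ToStringBuf<uint32_t>::kBytes];
 *   char *end = ToString(value, buf);  // [buf, end) now holds the digits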
9 |  */
10 | char *ToString(uint32_t value, char *to);
11 | char *ToString(uint64_t value, char *to);
12 | 
13 | // Implemented as wrappers to above
14 | char *ToString(int32_t value, char *to);
15 | char *ToString(int64_t value, char *to);
16 | 
17 | // Calls the 32-bit versions for now.
18 | char *ToString(uint16_t value, char *to);
19 | char *ToString(int16_t value, char *to);
20 | 
21 | char *ToString(const void *value, char *to);
22 | 
23 | inline char *ToString(bool value, char *to) {
24 |   *to++ = '0' + value;
25 |   return to;
26 | }
27 | 
28 | // How many bytes to reserve in the buffer for these strings:
29 | // g++ 4.9.1 doesn't work with this:
30 | // static const std::size_t kBytes = 5;
31 | // So use enum.
32 | template <class T> struct ToStringBuf;
33 | template <> struct ToStringBuf<bool> {
34 |   enum { kBytes = 1 };
35 | };
36 | template <> struct ToStringBuf<uint16_t> {
37 |   enum { kBytes = 5 };
38 | };
39 | template <> struct ToStringBuf<int16_t> {
40 |   enum { kBytes = 6 };
41 | };
42 | template <> struct ToStringBuf<uint32_t> {
43 |   enum { kBytes = 10 };
44 | };
45 | template <> struct ToStringBuf<int32_t> {
46 |   enum { kBytes = 11 };
47 | };
48 | template <> struct ToStringBuf<uint64_t> {
49 |   enum { kBytes = 20 };
50 | };
51 | template <> struct ToStringBuf<int64_t> {
52 |   // Not a typo.  2^63 has 19 digits.
53 |   enum { kBytes = 20 };
54 | };
55 | 
56 | template <> struct ToStringBuf<const void*> {
57 |   // Either 18 on 64-bit or 10 on 32-bit.
58 |   enum { kBytes = sizeof(const void*) * 2 + 2 };
59 | };
60 | 
61 | // Maximum over this and float.
62 | enum { kToStringMaxBytes = 20 };
63 | 
64 | } // namespace util
65 | 
66 | #endif // UTIL_INTEGER_TO_STRING_H
67 | 
--------------------------------------------------------------------------------
/kenlm/util/integer_to_string_test.cc:
--------------------------------------------------------------------------------
1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
2 | #include "util/integer_to_string.hh"
3 | #include "util/string_piece.hh"
4 | 
5 | #define BOOST_TEST_MODULE IntegerToStringTest
6 | #include <boost/test/unit_test.hpp>
7 | #include <boost/lexical_cast.hpp>
8 | 
9 | #include <limits>
10 | 
11 | namespace util {
12 | namespace {
13 | 
14 | template <class T> void TestValue(const T value) {
15 |   char buf[ToStringBuf<T>::kBytes];
16 |   StringPiece result(buf, ToString(value, buf) - buf);
17 |   BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
18 |   if (value) {
19 |     BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
20 |   } else {
21 |     // Platforms can do void * as 0x0 or 0.
22 |     BOOST_CHECK(result == "0x0" || result == "0");
23 |   }
24 | }
25 | 
26 | template <class T> void TestCorners() {
27 |   TestValue(std::numeric_limits<T>::min());
28 |   TestValue(std::numeric_limits<T>::max());
29 |   TestValue((T)0);
30 |   TestValue((T)-1);
31 |   TestValue((T)1);
32 | }
33 | 
34 | BOOST_AUTO_TEST_CASE(Corners) {
35 |   TestCorners<char>();
36 |   TestCorners<int16_t>();
37 |   TestCorners<uint16_t>();
38 |   TestCorners<int32_t>();
39 |   TestCorners<uint32_t>();
40 |   TestCorners<int64_t>();
41 |   TestCorners<uint64_t>();
42 | }
43 | 
44 | template <class T> void TestAll() {
45 |   for (T i = std::numeric_limits<T>::min(); i < std::numeric_limits<T>::max(); ++i) {
46 |     TestValue(i);
47 |   }
48 |   TestValue(std::numeric_limits<T>::max());
49 | }
50 | 
51 | BOOST_AUTO_TEST_CASE(Short) {
52 |   TestAll<uint16_t>();
53 |   TestAll<int16_t>();
54 | }
55 | 
56 | template <class T> void Test10s() {
57 |   for (T i = 1; i < std::numeric_limits<T>::max() / 10; i *= 10) {
58 |     TestValue(i);
59 |     TestValue(i - 1);
60 |     TestValue(i + 1);
61 |   }
62 | }
63 | 
64 | BOOST_AUTO_TEST_CASE(Tens) {
65 |   Test10s<uint64_t>();
66 |   Test10s<int64_t>();
67 |   Test10s<uint32_t>();
68 |   Test10s<int32_t>();
69 | }
70 | 
71 | BOOST_AUTO_TEST_CASE(Pointers) {
72 |   for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
73 |     TestValue((const void*)i);
74 |   }
75 |   for (uintptr_t i = 0; i < 256; ++i) {
76 |     TestValue((const void*)i);
77 |     TestValue((const void*)(i + 0xf00));
78 |   }
79 | }
80 | 
81 | }} // namespaces
82 | 
--------------------------------------------------------------------------------
/kenlm/util/joint_sort_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/joint_sort.hh"
2 | 
3 | #define BOOST_TEST_MODULE JointSortTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util { namespace {
7 | 
8 | BOOST_AUTO_TEST_CASE(just_flip) {
9 |   char keys[2];
10 |   int values[2];
11 |   keys[0] = 1; values[0] = 327;
12 |   keys[1] = 0; values[1] = 87897;
13 |   JointSort(keys + 0, keys + 2, values + 0);
14 |   BOOST_CHECK_EQUAL(0, keys[0]);
15 |   BOOST_CHECK_EQUAL(87897, values[0]);
16 |   BOOST_CHECK_EQUAL(1, keys[1]);
17 |   BOOST_CHECK_EQUAL(327, values[1]);
18 | }
19 | 
20 | BOOST_AUTO_TEST_CASE(three) {
21 |   char keys[3];
22 |   int values[3];
23 |   keys[0] = 1; values[0] = 327;
24 |   keys[1] = 2; values[1] = 87897;
25 |   keys[2] = 0; values[2] = 10;
26 |   JointSort(keys + 0, keys + 3, values + 0);
27 |   BOOST_CHECK_EQUAL(0, keys[0]);
28 |   BOOST_CHECK_EQUAL(1, keys[1]);
29 |   BOOST_CHECK_EQUAL(2, keys[2]);
30 | }
31 | 
32 | BOOST_AUTO_TEST_CASE(char_int) {
33 |   char keys[4];
34 |   int values[4];
35 |   keys[0] = 3; values[0] = 327;
36 |   keys[1] = 1; values[1] = 87897;
37 |   keys[2] = 2; values[2] = 10;
38 |   keys[3] = 0; values[3] = 24347;
39 |   JointSort(keys + 0, keys + 4, values + 0);
40 |   BOOST_CHECK_EQUAL(0, keys[0]);
41 |   BOOST_CHECK_EQUAL(24347, values[0]);
42 |   BOOST_CHECK_EQUAL(1, keys[1]);
43 |   BOOST_CHECK_EQUAL(87897, values[1]);
44 |   BOOST_CHECK_EQUAL(2, keys[2]);
45 |   BOOST_CHECK_EQUAL(10, values[2]);
46 |   BOOST_CHECK_EQUAL(3, keys[3]);
47 |   BOOST_CHECK_EQUAL(327, values[3]);
48 | }
49 | 
50 | BOOST_AUTO_TEST_CASE(swap_proxy) {
51 |   char keys[2] = {0, 1};
52 |   int values[2] = {2, 3};
53 |   detail::JointProxy<char*, int*> first(keys, values);
54 |   detail::JointProxy<char*, int*> second(keys + 1, values + 1);
55 |   swap(first, second);
56 |   BOOST_CHECK_EQUAL(1, keys[0]);
57 |   BOOST_CHECK_EQUAL(0, keys[1]);
58 |   BOOST_CHECK_EQUAL(3, values[0]);
59 |   BOOST_CHECK_EQUAL(2, values[1]);
60 | }
61 | 
62 | }} // namespace anonymous util
63 | 
--------------------------------------------------------------------------------
/kenlm/util/multi_intersection_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/multi_intersection.hh"
2 | 
3 | #define BOOST_TEST_MODULE MultiIntersectionTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util {
7 | namespace {
8 | 
9 | BOOST_AUTO_TEST_CASE(Empty) {
10 |   std::vector<boost::iterator_range<const unsigned int*> > sets;
11 | 
12 |   sets.push_back(boost::iterator_range<const unsigned int*>(static_cast<const unsigned int*>(NULL), static_cast<const unsigned int*>(NULL)));
13 |   BOOST_CHECK(!FirstIntersection(sets));
14 | }
15 | 
16 | BOOST_AUTO_TEST_CASE(Single) {
17 |   std::vector<unsigned int> nums;
18 |   nums.push_back(1);
19 |   nums.push_back(4);
20 |   nums.push_back(100);
21 |   std::vector<boost::iterator_range<std::vector<unsigned int>::const_iterator> > sets;
22 |   sets.push_back(nums);
23 | 
24 |   boost::optional<unsigned int> ret(FirstIntersection(sets));
25 | 
26 |   BOOST_REQUIRE(ret);
27 |   BOOST_CHECK_EQUAL(static_cast<unsigned int>(1), *ret);
28 | }
29 | 
30 | template <class T, unsigned int len> boost::iterator_range<const T*> RangeFromArray(const T (&arr)[len]) {
31 |   return boost::iterator_range<const T*>(arr, arr + len);
32 | }
33 | 
34 | BOOST_AUTO_TEST_CASE(MultiNone) {
35 |   unsigned int nums0[] = {1, 3, 4, 22};
36 |   unsigned int nums1[] = {2, 5, 12};
37 |   unsigned int nums2[] = {4, 17};
38 | 
39 |   std::vector<boost::iterator_range<const unsigned int*> > sets;
40 |   sets.push_back(RangeFromArray(nums0));
41 |   sets.push_back(RangeFromArray(nums1));
42 |   sets.push_back(RangeFromArray(nums2));
43 | 
44 |   BOOST_CHECK(!FirstIntersection(sets));
45 | }
46 | 
47 | BOOST_AUTO_TEST_CASE(MultiOne) {
48 |   unsigned int nums0[] = {1, 3, 4, 17, 22};
49 |   unsigned int nums1[] = {2, 5, 12, 17};
50 |   unsigned int nums2[] = {4, 17};
51 | 
52 |   std::vector<boost::iterator_range<const unsigned int*> > sets;
53 |   sets.push_back(RangeFromArray(nums0));
54 |   sets.push_back(RangeFromArray(nums1));
55 |   sets.push_back(RangeFromArray(nums2));
56 | 
57 |   boost::optional<unsigned int> ret(FirstIntersection(sets));
58 |   BOOST_REQUIRE(ret);
59 |   BOOST_CHECK_EQUAL(static_cast<unsigned int>(17), *ret);
60 | }
61 | 
62 | } // namespace
63 | } // namespace util
64 | 
--------------------------------------------------------------------------------
/kenlm/util/murmur_hash.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_MURMUR_HASH_H
2 | #define UTIL_MURMUR_HASH_H
3 | #include <cstddef>
4 | #include <stdint.h>
5 | 
6 | namespace util {
7 | 
8 | // 64-bit machine version
9 | uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0);
10 | // 32-bit machine version (not the same function as above)
11 | uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0);
12 | // Use the version for this arch.  Because the values differ across
13 | // architectures, really only use it for in-memory structures.
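// e.g. (editor's sketch; str is assumed to be a std::string):
//   uint64_t h = util::MurmurHashNative(str.data(), str.size());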
14 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);
15 | 
16 | } // namespace util
17 | 
18 | #endif // UTIL_MURMUR_HASH_H
19 | 
--------------------------------------------------------------------------------
/kenlm/util/parallel_read.cc:
--------------------------------------------------------------------------------
1 | #include "util/parallel_read.hh"
2 | 
3 | #include "util/file.hh"
4 | 
5 | #ifdef WITH_THREADS
6 | #include "util/thread_pool.hh"
7 | 
8 | namespace util {
9 | namespace {
10 | 
11 | class Reader {
12 |   public:
13 |     explicit Reader(int fd) : fd_(fd) {}
14 | 
15 |     struct Request {
16 |       void *to;
17 |       std::size_t size;
18 |       uint64_t offset;
19 | 
20 |       bool operator==(const Request &other) const {
21 |         return (to == other.to) && (size == other.size) && (offset == other.offset);
22 |       }
23 |     };
24 | 
25 |     void operator()(const Request &request) {
26 |       util::ErsatzPRead(fd_, request.to, request.size, request.offset);
27 |     }
28 | 
29 |   private:
30 |     int fd_;
31 | };
32 | 
33 | } // namespace
34 | 
35 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) {
36 |   Reader::Request poison;
37 |   poison.to = NULL;
38 |   poison.size = 0;
39 |   poison.offset = 0;
40 |   unsigned threads = boost::thread::hardware_concurrency();
41 |   if (!threads) threads = 2;
42 |   ThreadPool<Reader> pool(2 /* don't need much of a queue */, threads, fd, poison);
43 |   const std::size_t kBatch = 1ULL << 25; // 32 MB
44 |   Reader::Request request;
45 |   request.to = to;
46 |   request.size = kBatch;
47 |   request.offset = offset;
48 |   for (; amount > kBatch; amount -= kBatch) {
49 |     pool.Produce(request);
50 |     request.to = reinterpret_cast<uint8_t*>(request.to) + kBatch;
51 |     request.offset += kBatch;
52 |   }
53 |   request.size = amount;
54 |   if (request.size) {
55 |     pool.Produce(request);
56 |   }
57 | }
58 | 
59 | } // namespace util
60 | 
61 | #else // WITH_THREADS
62 | 
63 | namespace util {
64 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) {
65 |   util::ErsatzPRead(fd, to, amount, offset);
66 | }
67 | } // namespace util
68 | 
69 | #endif
70 | 
--------------------------------------------------------------------------------
/kenlm/util/parallel_read.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_PARALLEL_READ__
2 | #define UTIL_PARALLEL_READ__
3 | 
4 | /* Read pieces of a file in parallel.  This has a very specific use case:
5 |  * reading files from Lustre is CPU bound so multiple threads actually
6 |  * increases throughput.  Speed matters when an LM takes a terabyte.
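 *
 * Sketch of a call (editor's example; the fd must support pread, and the
 * buffer must already be at least bytes_wanted long):
 *   util::ParallelRead(fd, buffer, bytes_wanted, start_offset);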
7 |  */
8 | 
9 | #include <cstddef>
10 | #include <stdint.h>
11 | 
12 | namespace util {
13 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset);
14 | } // namespace util
15 | 
16 | #endif // UTIL_PARALLEL_READ__
17 | 
--------------------------------------------------------------------------------
/kenlm/util/pcqueue_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/pcqueue.hh"
2 | 
3 | #define BOOST_TEST_MODULE PCQueueTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util {
7 | namespace {
8 | 
9 | BOOST_AUTO_TEST_CASE(SingleThread) {
10 |   PCQueue<int> queue(10);
11 |   for (int i = 0; i < 10; ++i) {
12 |     queue.Produce(i);
13 |   }
14 |   for (int i = 0; i < 10; ++i) {
15 |     BOOST_CHECK_EQUAL(i, queue.Consume());
16 |   }
17 | }
18 | 
19 | }
20 | } // namespace util
21 | 
--------------------------------------------------------------------------------
/kenlm/util/pool.cc:
--------------------------------------------------------------------------------
1 | #include "util/pool.hh"
2 | 
3 | #include "util/scoped.hh"
4 | 
5 | #include <cstdlib>
6 | 
7 | #include <stdint.h>
8 | 
9 | namespace util {
10 | 
11 | Pool::Pool() {
12 |   current_ = NULL;
13 |   current_end_ = NULL;
14 | }
15 | 
16 | Pool::~Pool() {
17 |   FreeAll();
18 | }
19 | 
20 | void Pool::FreeAll() {
21 |   for (std::vector<void *>::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) {
22 |     free(*i);
23 |   }
24 |   free_list_.clear();
25 |   current_ = NULL;
26 |   current_end_ = NULL;
27 | }
28 | 
29 | void *Pool::More(std::size_t size) {
30 |   std::size_t amount = std::max(static_cast<std::size_t>(32) << free_list_.size(), size);
31 |   uint8_t *ret = static_cast<uint8_t*>(MallocOrThrow(amount));
32 |   free_list_.push_back(ret);
33 |   current_ = ret + size;
34 |   current_end_ = ret + amount;
35 |   return ret;
36 | }
37 | 
38 | } // namespace util
39 | 
--------------------------------------------------------------------------------
/kenlm/util/read_compressed.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_READ_COMPRESSED_H
2 | #define UTIL_READ_COMPRESSED_H
3 | 
4 | #include "util/exception.hh"
5 | #include "util/scoped.hh"
6 | 
7 | #include <cstddef>
8 | #include <stdint.h>
9 | 
10 | namespace util {
11 | 
12 | class CompressedException : public Exception {
13 |   public:
14 |     CompressedException() throw();
15 |     virtual ~CompressedException() throw();
16 | };
17 | 
18 | class GZException : public CompressedException {
19 |   public:
20 |     GZException() throw();
21 |     ~GZException() throw();
22 | };
23 | 
24 | class BZException : public CompressedException {
25 |   public:
26 |     BZException() throw();
27 |     ~BZException() throw();
28 | };
29 | 
30 | class XZException : public CompressedException {
31 |   public:
32 |     XZException() throw();
33 |     ~XZException() throw();
34 | };
35 | 
36 | class ReadCompressed;
37 | 
38 | class ReadBase {
39 |   public:
40 |     virtual ~ReadBase() {}
41 | 
42 |     virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;
43 | 
44 |   protected:
45 |     static void ReplaceThis(ReadBase *with, ReadCompressed &thunk);
46 | 
47 |     ReadBase *Current(ReadCompressed &thunk);
48 | 
49 |     static uint64_t &ReadCount(ReadCompressed &thunk);
50 | };
51 | 
52 | class ReadCompressed {
53 |   public:
54 |     static const std::size_t kMagicSize = 6;
55 |     // Must have at least kMagicSize bytes.
56 |     static bool DetectCompressedMagic(const void *from);
57 | 
58 |     // Takes ownership of fd.
59 |     explicit ReadCompressed(int fd);
60 | 
61 |     // Try to avoid using this.  Use the fd instead.
62 |     // There is no decompression support for istreams.
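    // (Editor's sketch of the preferred fd-based use; the file name is
    //  hypothetical, and OpenReadOrThrow comes from util/file.hh:
    //    util::ReadCompressed in(util::OpenReadOrThrow("corpus.txt.gz"));
    //    while (std::size_t got = in.Read(buf, sizeof(buf))) { /* consume */ } )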
63 |     explicit ReadCompressed(std::istream &in);
64 | 
65 |     // Must call Reset later.
66 |     ReadCompressed();
67 | 
68 |     // Takes ownership of fd.
69 |     void Reset(int fd);
70 | 
71 |     // Same advice as the constructor.
72 |     void Reset(std::istream &in);
73 | 
74 |     std::size_t Read(void *to, std::size_t amount);
75 | 
76 |     // Repeatedly call read to fill a buffer unless EOF is hit.
77 |     // Return number of bytes read.
78 |     std::size_t ReadOrEOF(void *const to, std::size_t amount);
79 | 
80 |     uint64_t RawAmount() const { return raw_amount_; }
81 | 
82 |   private:
83 |     friend class ReadBase;
84 | 
85 |     scoped_ptr<ReadBase> internal_;
86 | 
87 |     uint64_t raw_amount_;
88 | };
89 | 
90 | } // namespace util
91 | 
92 | #endif // UTIL_READ_COMPRESSED_H
93 | 
--------------------------------------------------------------------------------
/kenlm/util/scoped.cc:
--------------------------------------------------------------------------------
1 | #include "util/scoped.hh"
2 | 
3 | #include <cstdlib>
4 | #if !defined(_WIN32) && !defined(_WIN64)
5 | #include <sys/mman.h>
6 | #endif
7 | 
8 | namespace util {
9 | 
10 | // TODO: if we're really under memory pressure, don't allocate memory to
11 | // display the error.
12 | MallocException::MallocException(std::size_t requested) throw() {
13 |   *this << "for " << requested << " bytes ";
14 | }
15 | 
16 | MallocException::~MallocException() throw() {}
17 | 
18 | namespace {
19 | void *InspectAddr(void *addr, std::size_t requested, const char *func_name) {
20 |   UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name);
21 |   return addr;
22 | }
23 | } // namespace
24 | 
25 | void *MallocOrThrow(std::size_t requested) {
26 |   return InspectAddr(std::malloc(requested), requested, "malloc");
27 | }
28 | 
29 | void *CallocOrThrow(std::size_t requested) {
30 |   return InspectAddr(std::calloc(requested, 1), requested, "calloc");
31 | }
32 | 
33 | void scoped_malloc::call_realloc(std::size_t requested) {
34 |   p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");
35 | }
36 | 
37 | void AdviseHugePages(const void *addr, std::size_t size) {
38 | #if MADV_HUGEPAGE
39 |   madvise((void*)addr, size, MADV_HUGEPAGE);
40 | #endif
41 | }
42 | 
43 | } // namespace util
44 | 
--------------------------------------------------------------------------------
/kenlm/util/sized_iterator_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/sized_iterator.hh"
2 | 
3 | #define BOOST_TEST_MODULE SizedIteratorTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util { namespace {
7 | 
8 | struct CompareChar {
9 |   bool operator()(const void *first, const void *second) const {
10 |     return *static_cast<const char*>(first) < *static_cast<const char*>(second);
11 |   }
12 | };
13 | 
14 | BOOST_AUTO_TEST_CASE(sort) {
15 |   char items[3] = {1, 2, 0};
16 |   SizedSort(items, items + 3, 1, CompareChar());
17 |   BOOST_CHECK_EQUAL(0, items[0]);
18 |   BOOST_CHECK_EQUAL(1, items[1]);
19 |   BOOST_CHECK_EQUAL(2, items[2]);
20 | }
21 | 
22 | }} // namespace anonymous util
23 | 
--------------------------------------------------------------------------------
/kenlm/util/spaces.cc:
--------------------------------------------------------------------------------
1 | #include "util/spaces.hh"
2 | 
3 | namespace util {
4 | 
5 | // Sigh this is the only way I could come up with to do a _const_ bool.  It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
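// Typical lookup (editor's sketch): true exactly for those six characters, e.g.
//   bool is_space = util::kSpaces[static_cast<unsigned char>(c)];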
6 | const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
7 | 
8 | } // namespace util
9 | 
--------------------------------------------------------------------------------
/kenlm/util/spaces.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_SPACES_H
2 | #define UTIL_SPACES_H
3 | 
4 | // bool array of spaces.
5 | 
6 | namespace util {
7 | 
8 | extern const bool kSpaces[256];
9 | 
10 | } // namespace util
11 | 
12 | #endif // UTIL_SPACES_H
13 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # This CMake file was created by Lane Schwartz
2 | 
3 | # Explicitly list the source files for this subdirectory
4 | #
5 | # If you add any source files to this subdirectory
6 | # that should be included in the kenlm library,
7 | # (this excludes any unit test files)
8 | # you should add them to the following list:
9 | #
10 | # In order to allow CMake files in the parent directory
11 | # to see this variable definition, we set PARENT_SCOPE.
12 | #
13 | # In order to set correct paths to these files
14 | # when this variable is referenced by CMake files in the parent directory,
15 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
16 | #
17 | set(KENLM_UTIL_STREAM_SOURCE
18 |     ${CMAKE_CURRENT_SOURCE_DIR}/chain.cc
19 |     ${CMAKE_CURRENT_SOURCE_DIR}/count_records.cc
20 |     ${CMAKE_CURRENT_SOURCE_DIR}/io.cc
21 |     ${CMAKE_CURRENT_SOURCE_DIR}/line_input.cc
22 |     ${CMAKE_CURRENT_SOURCE_DIR}/multi_progress.cc
23 |     ${CMAKE_CURRENT_SOURCE_DIR}/rewindable_stream.cc
24 |   PARENT_SCOPE)
25 | 
26 | 
27 | 
28 | if(BUILD_TESTING)
29 |   # Explicitly list the Boost test files to be compiled
30 |   set(KENLM_BOOST_TESTS_LIST
31 |     io_test
32 |     sort_test
33 |     stream_test
34 |     rewindable_stream_test
35 |   )
36 | 
37 |   AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
38 |            LIBRARIES kenlm_util ${Boost_LIBRARIES} pthread)
39 | endif()
40 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/config.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_CONFIG_H
2 | #define UTIL_STREAM_CONFIG_H
3 | 
4 | #include <cstddef>
5 | #include <string>
6 | 
7 | namespace util { namespace stream {
8 | 
9 | /**
10 |  * Represents how a chain should be configured.
11 |  */
12 | struct ChainConfig {
13 | 
14 |   /** Constructs a configuration with underspecified (or default) parameters. */
15 |   ChainConfig() {}
16 | 
17 |   /**
18 |    * Constructs a chain configuration object.
19 |    *
20 |    * @param [in] in_entry_size    Number of bytes in each record.
21 |    * @param [in] in_block_count   Number of blocks in the chain.
22 |    * @param [in] in_total_memory  Total number of bytes available to the chain.
23 |    *                              This value will be divided amongst the blocks in the chain.
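   *
   * Worked example (editor's sketch): in_total_memory = 1 MB with
   * in_block_count = 4 leaves roughly 256 KB per block, with total_memory
   * rounded to a multiple of in_entry_size as noted below.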
24 |    */
25 |   ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory)
26 |     : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {}
27 | 
28 |   /**
29 |    * Number of bytes in each record.
30 |    */
31 |   std::size_t entry_size;
32 | 
33 |   /**
34 |    * Number of blocks in the chain.
35 |    */
36 |   std::size_t block_count;
37 | 
38 |   /**
39 |    * Total number of bytes available to the chain.
40 |    * This value will be divided amongst the blocks in the chain.
41 |    * Chain's constructor will make this a multiple of entry_size.
42 |    */
43 |   std::size_t total_memory;
44 | };
45 | 
46 | 
47 | /**
48 |  * Represents how a sorter should be configured.
49 |  */
50 | struct SortConfig {
51 | 
52 |   /** Filename prefix where temporary files should be placed. */
53 |   std::string temp_prefix;
54 | 
55 |   /** Size of each input/output buffer. */
56 |   std::size_t buffer_size;
57 | 
58 |   /** Total memory to use when running alone. */
59 |   std::size_t total_memory;
60 | };
61 | 
62 | }} // namespaces
63 | #endif // UTIL_STREAM_CONFIG_H
64 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/count_records.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/count_records.hh"
2 | #include "util/stream/chain.hh"
3 | 
4 | namespace util { namespace stream {
5 | 
6 | void CountRecords::Run(const ChainPosition &position) {
7 |   for (Link link(position); link; ++link) {
8 |     *count_ += link->ValidSize() / position.GetChain().EntrySize();
9 |   }
10 | }
11 | 
12 | }} // namespaces
13 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/count_records.hh:
--------------------------------------------------------------------------------
1 | #include <stdint.h>
2 | 
3 | namespace util { namespace stream {
4 | 
5 | class ChainPosition;
6 | 
7 | class CountRecords {
8 |   public:
9 |     explicit CountRecords(uint64_t *out)
10 |       : count_(out) {
11 |       *count_ = 0;
12 |     }
13 | 
14 |     void Run(const ChainPosition &position);
15 | 
16 |   private:
17 |     uint64_t *count_;
18 | };
19 | 
20 | }} // namespaces
21 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/io.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_IO_H
2 | #define UTIL_STREAM_IO_H
3 | 
4 | #include "util/exception.hh"
5 | #include "util/file.hh"
6 | 
7 | namespace util {
8 | namespace stream {
9 | 
10 | class ChainPosition;
11 | 
12 | class ReadSizeException : public util::Exception {
13 |   public:
14 |     ReadSizeException() throw();
15 |     ~ReadSizeException() throw();
16 | };
17 | 
18 | class Read {
19 |   public:
20 |     explicit Read(int fd) : file_(fd) {}
21 |     void Run(const ChainPosition &position);
22 |   private:
23 |     int file_;
24 | };
25 | 
26 | // Like read but uses pread so that the file can be accessed from multiple threads.
27 | class PRead {
28 |   public:
29 |     explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {}
30 |     void Run(const ChainPosition &position);
31 |   private:
32 |     int file_;
33 |     bool own_;
34 | };
35 | 
36 | class Write {
37 |   public:
38 |     explicit Write(int fd) : file_(fd) {}
39 |     void Run(const ChainPosition &position);
40 |   private:
41 |     int file_;
42 | };
43 | 
44 | // It's a common case that stuff is written and then recycled.  So rather than
45 | // spawn another thread to Recycle, this combines the two roles.
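// (Editor's sketch, modeled on io_test.cc further below; the fds are
//  hypothetical:
//    chain >> Read(in_fd) >> /* workers */ >> WriteAndRecycle(out_fd); )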
46 | class WriteAndRecycle {
47 |   public:
48 |     explicit WriteAndRecycle(int fd) : file_(fd) {}
49 |     void Run(const ChainPosition &position);
50 |   private:
51 |     int file_;
52 | };
53 | 
54 | class PWrite {
55 |   public:
56 |     explicit PWrite(int fd) : file_(fd) {}
57 |     void Run(const ChainPosition &position);
58 |   private:
59 |     int file_;
60 | };
61 | 
62 | 
63 | // Reuse the same file over and over again to buffer output.
64 | class FileBuffer {
65 |   public:
66 |     explicit FileBuffer(int fd) : file_(fd) {}
67 | 
68 |     PWrite Sink() const {
69 |       util::SeekOrThrow(file_.get(), 0);
70 |       return PWrite(file_.get());
71 |     }
72 | 
73 |     PRead Source(bool discard = false) {
74 |       return PRead(discard ? file_.release() : file_.get(), discard);
75 |     }
76 | 
77 |     uint64_t Size() const {
78 |       return SizeOrThrow(file_.get());
79 |     }
80 | 
81 |   private:
82 |     scoped_fd file_;
83 | };
84 | 
85 | } // namespace stream
86 | } // namespace util
87 | #endif // UTIL_STREAM_IO_H
88 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/io_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/io.hh"
2 | 
3 | #include "util/stream/chain.hh"
4 | #include "util/file.hh"
5 | 
6 | #define BOOST_TEST_MODULE IOTest
7 | #include <boost/test/unit_test.hpp>
8 | 
9 | #include <unistd.h>
10 | 
11 | namespace util { namespace stream { namespace {
12 | 
13 | BOOST_AUTO_TEST_CASE(CopyFile) {
14 |   std::string temps("io_test_temp");
15 | 
16 |   scoped_fd in(MakeTemp(temps));
17 |   for (uint64_t i = 0; i < 100000; ++i) {
18 |     WriteOrThrow(in.get(), &i, sizeof(uint64_t));
19 |   }
20 |   SeekOrThrow(in.get(), 0);
21 |   scoped_fd out(MakeTemp(temps));
22 | 
23 |   ChainConfig config;
24 |   config.entry_size = 8;
25 |   config.total_memory = 1024;
26 |   config.block_count = 10;
27 | 
28 |   Chain(config) >> PRead(in.get()) >> Write(out.get());
29 | 
30 |   SeekOrThrow(out.get(), 0);
31 |   for (uint64_t i = 0; i < 100000; ++i) {
32 |     uint64_t got;
33 |     ReadOrThrow(out.get(), &got, sizeof(uint64_t));
34 |     BOOST_CHECK_EQUAL(i, got);
35 |   }
36 | }
37 | 
38 | }}} // namespaces
39 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/line_input.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/line_input.hh"
2 | 
3 | #include "util/exception.hh"
4 | #include "util/file.hh"
5 | #include "util/read_compressed.hh"
6 | #include "util/stream/chain.hh"
7 | 
8 | #include <algorithm>
9 | #include <vector>
10 | 
11 | namespace util { namespace stream {
12 | 
13 | void LineInput::Run(const ChainPosition &position) {
14 |   ReadCompressed reader(fd_);
15 |   // Holding area for beginning of line to be placed in next block.
16 |   std::vector<char> carry;
17 | 
18 |   for (Link block(position); ; ++block) {
19 |     char *to = static_cast<char*>(block->Get());
20 |     char *begin = to;
21 |     char *end = to + position.GetChain().BlockSize();
22 |     std::copy(carry.begin(), carry.end(), to);
23 |     to += carry.size();
24 |     while (to != end) {
25 |       std::size_t got = reader.Read(to, end - to);
26 |       if (!got) {
27 |         // EOF
28 |         block->SetValidSize(to - begin);
29 |         ++block;
30 |         block.Poison();
31 |         return;
32 |       }
33 |       to += got;
34 |     }
35 | 
36 |     // Find the last newline.
37 |     char *newline;
38 |     for (newline = to - 1; ; --newline) {
39 |       UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ".  Is this a text file?");
40 |       if (*newline == '\n') break;
41 |     }
42 | 
43 |     // Copy everything after the last newline to the carry.
44 |     carry.clear();
45 |     carry.resize(to - (newline + 1));
46 |     std::copy(newline + 1, to, &*carry.begin());
47 | 
48 |     block->SetValidSize(newline + 1 - begin);
49 |   }
50 | }
51 | 
52 | }} // namespaces
53 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/line_input.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_LINE_INPUT_H
2 | #define UTIL_STREAM_LINE_INPUT_H
3 | namespace util { namespace stream {
4 | 
5 | class ChainPosition;
6 | 
7 | /* Worker that reads input into blocks, ensuring that blocks contain whole
8 |  * lines.  Assumes that the maximum size of a line is less than the block size.
9 |  */
10 | class LineInput {
11 |   public:
12 |     // Takes ownership upon thread execution.
13 |     explicit LineInput(int fd);
14 | 
15 |     void Run(const ChainPosition &position);
16 | 
17 |   private:
18 |     int fd_;
19 | };
20 | 
21 | }} // namespaces
22 | #endif // UTIL_STREAM_LINE_INPUT_H
23 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/multi_progress.hh:
--------------------------------------------------------------------------------
1 | /* Progress bar suitable for chains of workers */
2 | #ifndef UTIL_STREAM_MULTI_PROGRESS_H
3 | #define UTIL_STREAM_MULTI_PROGRESS_H
4 | 
5 | #include <boost/thread/mutex.hpp>
6 | 
7 | #include <cstddef>
8 | #include <stdint.h>
9 | 
10 | namespace util { namespace stream {
11 | 
12 | class WorkerProgress;
13 | 
14 | class MultiProgress {
15 |   public:
16 |     static const unsigned char kWidth = 100;
17 | 
18 |     MultiProgress();
19 | 
20 |     ~MultiProgress();
21 | 
22 |     // Turns on showing (requires SetTarget too).
23 |     void Activate();
24 | 
25 |     void SetTarget(uint64_t complete);
26 | 
27 |     WorkerProgress Add();
28 | 
29 |     void Finished();
30 | 
31 |   private:
32 |     friend class WorkerProgress;
33 |     void Milestone(WorkerProgress &worker);
34 | 
35 |     bool active_;
36 | 
37 |     uint64_t complete_;
38 | 
39 |     boost::mutex mutex_;
40 | 
41 |     // \0 at the end.
42 |     char display_[kWidth + 1];
43 | 
44 |     std::size_t character_handout_;
45 | 
46 |     MultiProgress(const MultiProgress &);
47 |     MultiProgress &operator=(const MultiProgress &);
48 | };
49 | 
50 | class WorkerProgress {
51 |   public:
52 |     // Default constructor must be initialized with operator= later.
53 |     WorkerProgress() : parent_(NULL) {}
54 | 
55 |     // Not threadsafe for the same worker by default.
56 |     WorkerProgress &operator++() {
57 |       if (++current_ >= next_) {
58 |         parent_->Milestone(*this);
59 |       }
60 |       return *this;
61 |     }
62 | 
63 |     WorkerProgress &operator+=(uint64_t amount) {
64 |       current_ += amount;
65 |       if (current_ >= next_) {
66 |         parent_->Milestone(*this);
67 |       }
68 |       return *this;
69 |     }
70 | 
71 |   private:
72 |     friend class MultiProgress;
73 |     WorkerProgress(uint64_t next, MultiProgress &parent, char character)
74 |       : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {}
75 | 
76 |     uint64_t current_, next_;
77 | 
78 |     MultiProgress *parent_;
79 | 
80 |     // Previous milestone reached.
81 |     unsigned char stone_;
82 | 
83 |     // Character to display in bar.
84 |     char character_;
85 | };
86 | 
87 | }} // namespaces
88 | 
89 | #endif // UTIL_STREAM_MULTI_PROGRESS_H
90 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/rewindable_stream_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/io.hh"
2 | 
3 | #include "util/stream/rewindable_stream.hh"
4 | #include "util/file.hh"
5 | 
6 | #define BOOST_TEST_MODULE RewindableStreamTest
7 | #include <boost/test/unit_test.hpp>
8 | 
9 | namespace util {
10 | namespace stream {
11 | namespace {
12 | 
13 | BOOST_AUTO_TEST_CASE(RewindableStreamTest) {
14 |   scoped_fd in(MakeTemp("io_test_temp"));
15 |   for (uint64_t i = 0; i < 100000; ++i) {
16 |     WriteOrThrow(in.get(), &i, sizeof(uint64_t));
17 |   }
18 |   SeekOrThrow(in.get(), 0);
19 | 
20 |   ChainConfig config;
21 |   config.entry_size = 8;
22 |   config.total_memory = 100;
23 |   config.block_count = 6;
24 | 
25 |   Chain chain(config);
26 |   RewindableStream s;
27 |   chain >> Read(in.get()) >> s >> kRecycle;
28 |   uint64_t i = 0;
29 |   for (; s; ++s, ++i) {
30 |     BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(s.Get()));
31 |     if (100000UL - i == 2)
32 |       s.Mark();
33 |   }
34 |   BOOST_CHECK_EQUAL(100000ULL, i);
35 |   s.Rewind();
36 |   BOOST_CHECK_EQUAL(100000ULL - 2, *static_cast<const uint64_t*>(s.Get()));
37 | }
38 | 
39 | }
40 | }
41 | }
42 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/sort_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/sort.hh"
2 | 
3 | #define BOOST_TEST_MODULE SortTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | #include <algorithm>
7 | 
8 | #include <unistd.h>
9 | 
10 | namespace util { namespace stream { namespace {
11 | 
12 | struct CompareUInt64 : public std::binary_function<const void *, const void *, bool> {
13 |   bool operator()(const void *first, const void *second) const {
14 |     return *static_cast<const uint64_t*>(first) < *reinterpret_cast<const uint64_t*>(second);
15 |   }
16 | };
17 | 
18 | const uint64_t kSize = 100000;
19 | 
20 | struct Putter {
21 |   Putter(std::vector<uint64_t> &shuffled) : shuffled_(shuffled) {}
22 | 
23 |   void Run(const ChainPosition &position) {
24 |     Stream put_shuffled(position);
25 |     for (uint64_t i = 0; i < shuffled_.size(); ++i, ++put_shuffled) {
26 |       *static_cast<uint64_t*>(put_shuffled.Get()) = shuffled_[i];
27 |     }
28 |     put_shuffled.Poison();
29 |   }
30 |   std::vector<uint64_t> &shuffled_;
31 | };
32 | 
33 | BOOST_AUTO_TEST_CASE(FromShuffled) {
34 |   std::vector<uint64_t> shuffled;
35 |   shuffled.reserve(kSize);
36 |   for (uint64_t i = 0; i < kSize; ++i) {
37 |     shuffled.push_back(i);
38 |   }
39 |   std::random_shuffle(shuffled.begin(), shuffled.end());
40 | 
41 |   ChainConfig config;
42 |   config.entry_size = 8;
43 |   config.total_memory = 800;
44 |   config.block_count = 3;
45 | 
46 |   SortConfig merge_config;
47 |   merge_config.temp_prefix = "sort_test_temp";
48 |   merge_config.buffer_size = 800;
49 |   merge_config.total_memory = 3300;
50 | 
51 |   Chain chain(config);
52 |   chain >> Putter(shuffled);
53 |   BlockingSort(chain, merge_config, CompareUInt64(), NeverCombine());
54 |   Stream sorted;
55 |   chain >> sorted >> kRecycle;
56 |   for (uint64_t i = 0; i < kSize; ++i, ++sorted) {
57 |     BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(sorted.Get()));
58 |   }
59 |   BOOST_CHECK(!sorted);
60 | }
61 | 
62 | }}} // namespaces
63 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/stream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_STREAM_H
2 | #define UTIL_STREAM_STREAM_H
3 | 
4 | #include "util/stream/chain.hh"
5 | 
6 | #include <boost/noncopyable.hpp>
7 | 
8 | #include <cassert>
9 | #include <stdint.h>
10 | 
11 | namespace util {
12 | namespace stream {
13 | 
14 | class Stream : boost::noncopyable {
15 |   public:
16 |     Stream() : current_(NULL), end_(NULL) {}
17 | 
18 |     void Init(const ChainPosition &position) {
19 |       entry_size_ = position.GetChain().EntrySize();
20 |       block_size_ = position.GetChain().BlockSize();
21 |       block_it_.Init(position);
22 |       StartBlock();
23 |     }
24 | 
25 |     explicit Stream(const ChainPosition &position) {
26 |       Init(position);
27 |     }
28 | 
29 |     operator bool() const { return current_ != NULL; }
30 |     bool operator!() const { return current_ == NULL; }
31 | 
32 |     const void *Get() const { return current_; }
33 |     void *Get() { return current_; }
34 | 
35 |     void Poison() {
36 |       block_it_->SetValidSize(current_ - static_cast<uint8_t*>(block_it_->Get()));
37 |       ++block_it_;
38 |       block_it_.Poison();
39 |     }
40 | 
41 |     Stream &operator++() {
42 |       assert(*this);
43 |       assert(current_ < end_);
44 |       current_ += entry_size_;
45 |       if (current_ == end_) {
46 |         ++block_it_;
47 |         StartBlock();
48 |       }
49 |       return *this;
50 |     }
51 | 
52 |   private:
53 |     void StartBlock() {
54 |       for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {}
55 |       current_ = static_cast<uint8_t*>(block_it_->Get());
56 |       end_ = current_ + block_it_->ValidSize();
57 |     }
58 | 
59 |     // The following are pointers to raw memory
60 |     // current_ is the current record
61 |     // end_ is the end of the block (so we know when to move to the next block)
62 |     uint8_t *current_, *end_;
63 | 
64 |     std::size_t entry_size_;
65 |     std::size_t block_size_;
66 | 
67 |     Link block_it_;
68 | };
69 | 
70 | inline Chain &operator>>(Chain &chain, Stream &stream) {
71 |   stream.Init(chain.Add());
72 |   return chain;
73 | }
74 | 
75 | } // namespace stream
76 | } // namespace util
77 | #endif // UTIL_STREAM_STREAM_H
78 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/stream_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/io.hh"
2 | 
3 | #include "util/stream/stream.hh"
4 | #include "util/file.hh"
5 | 
6 | #define BOOST_TEST_MODULE StreamTest
7 | #include <boost/test/unit_test.hpp>
8 | 
9 | #include <unistd.h>
10 | 
11 | namespace util { namespace stream { namespace {
12 | 
13 | BOOST_AUTO_TEST_CASE(StreamTest) {
14 |   scoped_fd in(MakeTemp("io_test_temp"));
15 |   for (uint64_t i = 0; i < 100000; ++i) {
16 |     WriteOrThrow(in.get(), &i, sizeof(uint64_t));
17 |   }
18 |   SeekOrThrow(in.get(), 0);
19 | 
20 |   ChainConfig config;
21 |   config.entry_size = 8;
22 |   config.total_memory = 100;
23 |   config.block_count = 12;
24 | 
25 |   Stream s;
26 |   Chain chain(config);
27 |   chain >> Read(in.get()) >> s >> kRecycle;
28 |   uint64_t i = 0;
29 |   for (; s; ++s, ++i) {
30 |     BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(s.Get()));
31 |   }
32 |   BOOST_CHECK_EQUAL(100000ULL, i);
33 | }
34 | 
35 | }}} // namespaces
36 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/typed_stream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_TYPED_STREAM_H
2 | #define UTIL_STREAM_TYPED_STREAM_H
3 | // A typed wrapper to Stream for POD types.
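// e.g. (editor's sketch; position is an assumed ChainPosition):
//   TypedStream<uint64_t> nums(position);
//   for (; nums; ++nums) sum += *nums;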
4 | 
5 | #include "util/stream/stream.hh"
6 | 
7 | namespace util { namespace stream {
8 | 
9 | template <class T> class TypedStream : public Stream {
10 |   public:
11 |     // After using the default constructor, call Init (in the parent class)
12 |     TypedStream() {}
13 | 
14 |     explicit TypedStream(const ChainPosition &position) : Stream(position) {}
15 | 
16 |     const T *operator->() const { return static_cast<const T*>(Get()); }
17 |     T *operator->() { return static_cast<T*>(Get()); }
18 | 
19 |     const T &operator*() const { return *static_cast<const T*>(Get()); }
20 |     T &operator*() { return *static_cast<T*>(Get()); }
21 | };
22 | 
23 | }} // namespaces
24 | 
25 | #endif // UTIL_STREAM_TYPED_STREAM_H
26 | 
--------------------------------------------------------------------------------
/kenlm/util/string_piece_hash.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STRING_PIECE_HASH_H
2 | #define UTIL_STRING_PIECE_HASH_H
3 | 
4 | #include "util/string_piece.hh"
5 | 
6 | #include <boost/functional/hash.hpp>
7 | #include <boost/version.hpp>
8 | 
9 | inline size_t hash_value(const StringPiece &str) {
10 |   return boost::hash_range(str.data(), str.data() + str.length());
11 | }
12 | 
13 | /* Support for lookup of StringPiece in boost::unordered_map */
14 | struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> {
15 |   size_t operator()(const StringPiece &str) const {
16 |     return hash_value(str);
17 |   }
18 | };
19 | 
20 | struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const StringPiece &, bool> {
21 |   bool operator()(const StringPiece &first, const StringPiece &second) const {
22 |     return first == second;
23 |   }
24 | };
25 | template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
26 | #if BOOST_VERSION < 104200
27 |   std::string temp(key.data(), key.size());
28 |   return t.find(temp);
29 | #else
30 |   return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
31 | #endif
32 | }
33 | 
34 | template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
35 | #if BOOST_VERSION < 104200
36 |   std::string temp(key.data(), key.size());
37 |   return t.find(temp);
38 | #else
39 |   return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
40 | #endif
41 | }
42 | 
43 | #endif // UTIL_STRING_PIECE_HASH_H
44 | 
--------------------------------------------------------------------------------
/kenlm/util/string_stream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STRING_STREAM_H
2 | #define UTIL_STRING_STREAM_H
3 | 
4 | #include "util/fake_ostream.hh"
5 | 
6 | #include <cassert>
7 | #include <string>
8 | 
9 | namespace util {
10 | 
11 | class StringStream : public FakeOStream<StringStream> {
12 |   public:
13 |     StringStream() {}
14 | 
15 |     StringStream &flush() { return *this; }
16 | 
17 |     StringStream &write(const void *data, std::size_t length) {
18 |       out_.append(static_cast<const char*>(data), length);
19 |       return *this;
20 |     }
21 | 
22 |     const std::string &str() const { return out_; }
23 | 
24 |     void str(const std::string &val) { out_ = val; }
25 | 
26 |     void swap(std::string &str) { std::swap(out_, str); }
27 | 
28 |   protected:
29 |     friend class FakeOStream<StringStream>;
30 |     char *Ensure(std::size_t amount) {
31 |       std::size_t current = out_.size();
32 |       out_.resize(out_.size() + amount);
33 |       return &out_[current];
34 |     }
35 | 
36 |     void AdvanceTo(char *to) {
37 |       assert(to <= &*out_.end());
38 |       assert(to >= &*out_.begin());
39 |       out_.resize(to - &*out_.begin());
40 |     }
41 | 
42 |   private:
43 |     std::string out_;
44 | };
45 | 
46 | } // namespace
47 | 
48 | #endif // UTIL_STRING_STREAM_H
49 | 
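// (Editor's usage sketch, exercising the operator<< overloads tested in the
//  unit test that follows:
//    util::StringStream s;
//    s << "perplexity = " << 42;
//    const std::string &text = s.str(); )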
--------------------------------------------------------------------------------
/kenlm/util/string_stream_test.cc:
--------------------------------------------------------------------------------
1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
2 | #define BOOST_TEST_MODULE FakeOStreamTest
3 | 
4 | #include "util/string_stream.hh"
5 | #include <boost/test/unit_test.hpp>
6 | #include <boost/lexical_cast.hpp>
7 | 
8 | #include <limits>
9 | #include <string>
10 | 
11 | namespace util { namespace {
12 | 
13 | template <class T> void TestEqual(const T value) {
14 |   StringStream strme;
15 |   strme << value;
16 |   BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), strme.str());
17 | }
18 | 
19 | template <class T> void TestCorners() {
20 |   TestEqual(std::numeric_limits<T>::max());
21 |   TestEqual(std::numeric_limits<T>::min());
22 |   TestEqual(static_cast<T>(0));
23 |   TestEqual(static_cast<T>(-1));
24 |   TestEqual(static_cast<T>(1));
25 | }
26 | 
27 | BOOST_AUTO_TEST_CASE(Integer) {
28 |   TestCorners<char>();
29 |   TestCorners<signed char>();
30 |   TestCorners<unsigned char>();
31 | 
32 |   TestCorners<short>();
33 |   TestCorners<signed short>();
34 |   TestCorners<unsigned short>();
35 | 
36 |   TestCorners<int>();
37 |   TestCorners<signed int>();
38 |   TestCorners<unsigned int>();
39 | 
40 |   TestCorners<long>();
41 |   TestCorners<signed long>();
42 |   TestCorners<unsigned long>();
43 | 
44 |   TestCorners<long long>();
45 |   TestCorners<signed long long>();
46 |   TestCorners<unsigned long long>();
47 | 
48 |   TestCorners<std::size_t>();
49 | }
50 | 
51 | enum TinyEnum { EnumValue };
52 | 
53 | BOOST_AUTO_TEST_CASE(EnumCase) {
54 |   TestEqual(EnumValue);
55 | }
56 | 
57 | BOOST_AUTO_TEST_CASE(Strings) {
58 |   TestEqual("foo");
59 |   const char *a = "bar";
60 |   TestEqual(a);
61 |   StringPiece piece("abcdef");
62 |   TestEqual(piece);
63 |   TestEqual(StringPiece());
64 | 
65 |   char non_const[3];
66 |   non_const[0] = 'b';
67 |   non_const[1] = 'c';
68 |   non_const[2] = 0;
69 | 
70 |   StringStream out;
71 |   out << "a" << non_const << 'c';
72 |   BOOST_CHECK_EQUAL("abcc", out.str());
73 | 
74 |   // Now test as a separate object.
75 |   StringStream stream;
76 |   stream << "a" << non_const << 'c' << piece;
77 |   BOOST_CHECK_EQUAL("abccabcdef", stream.str());
78 | }
79 | 
80 | }} // namespaces
81 | 
--------------------------------------------------------------------------------
/kenlm/util/tokenize_piece_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/tokenize_piece.hh"
2 | #include "util/string_piece.hh"
3 | 
4 | #define BOOST_TEST_MODULE TokenIteratorTest
5 | #include <boost/test/unit_test.hpp>
6 | 
7 | #include <iostream>
8 | 
9 | namespace util {
10 | namespace {
11 | 
12 | BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
13 |   const char str[] = "nodelimit at all";
14 |   TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
15 |   BOOST_REQUIRE(it);
16 |   BOOST_CHECK_EQUAL(StringPiece(str), *it);
17 |   ++it;
18 |   BOOST_CHECK(!it);
19 | }
20 | BOOST_AUTO_TEST_CASE(pipe_pipe_two) {
21 |   const char str[] = "|||";
22 |   TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
23 |   BOOST_REQUIRE(it);
24 |   BOOST_CHECK_EQUAL(StringPiece(), *it);
25 |   ++it;
26 |   BOOST_REQUIRE(it);
27 |   BOOST_CHECK_EQUAL(StringPiece(), *it);
28 |   ++it;
29 |   BOOST_CHECK(!it);
30 | }
31 | 
32 | BOOST_AUTO_TEST_CASE(remove_empty) {
33 |   const char str[] = "|||";
34 |   TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
35 |   BOOST_CHECK(!it);
36 | }
37 | 
38 | BOOST_AUTO_TEST_CASE(remove_empty_keep) {
39 |   const char str[] = " |||";
40 |   TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
41 |   BOOST_REQUIRE(it);
42 |   BOOST_CHECK_EQUAL(StringPiece(" "), *it);
43 |   ++it;
44 |   BOOST_CHECK(!it);
45 | }
46 | 
47 | } // namespace
48 | } // namespace util
49 | 
--------------------------------------------------------------------------------
/kenlm/util/usage.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_USAGE_H
2 | #define UTIL_USAGE_H
3 | #include <cstddef>
4 | #include <iosfwd>
5 | #include <string>
6 | #include <stdint.h>
7 | 
8 | namespace util {
9 | // Time in seconds since process started.  Zero on unsupported platforms.
10 | double WallTime();
11 | 
12 | // User + system time, process-wide.
13 | double CPUTime();
14 | 
15 | // User + system time, thread-specific.
16 | double ThreadTime();
17 | 
18 | // Resident usage in bytes.
19 | uint64_t RSSMax();
20 | 
21 | void PrintUsage(std::ostream &to);
22 | 
23 | // Determine how much physical memory there is.  Return 0 on failure.
24 | uint64_t GuessPhysicalMemory();
25 | 
26 | // Parse a size like unix sort.  Sadly, this means the default multiplier is K.
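// e.g. (editor's sketch): "500" is presumably treated as 500 * 1024 bytes,
// while sort-style suffixes such as "2G" name the unit explicitly.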
27 | uint64_t ParseSize(const std::string &arg);
28 | 
29 | } // namespace util
30 | #endif // UTIL_USAGE_H
31 | 
--------------------------------------------------------------------------------
/kenmodels/zhwiki_bigram.klm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenmodels/zhwiki_bigram.klm
--------------------------------------------------------------------------------
/kenmodels/zhwiki_trigram.klm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenmodels/zhwiki_trigram.klm
--------------------------------------------------------------------------------
/train_kenlm.sh:
--------------------------------------------------------------------------------
1 | python feed_kenlm.py | ./kenlm/build/bin/lmplz -o 3 > zhwiki_trigram.arpa
2 | ./kenlm/build/bin/build_binary zhwiki_trigram.arpa zhwiki_trigram.klm
3 | 
--------------------------------------------------------------------------------