├── README.md ├── cn_spelling.py ├── data ├── SimilarPronunciation.txt ├── SimilarShape.txt ├── bcmi_data │ ├── dev_input.txt │ └── input.txt ├── cncorpus │ ├── CorpusCharacterlist.xls │ ├── CorpusWordPOSlist.xls │ ├── CorpusWordlist.xls │ ├── 现代汉语常用字表.xls │ ├── 现代汉语通用字表.xls │ └── 通用规范汉字表.xls ├── common.pkl ├── sighan │ └── processed │ │ ├── clp14-C1-training.txt │ │ ├── clp14csc_C1_training.pkl │ │ ├── sighan15-A2-Training.txt │ │ └── sighan15_A2_training.pkl ├── simp.pickle ├── simp_simplified.pickle ├── simp_sm.pickle ├── sims.pickle ├── xingjinzi.txt ├── xjz.pickle └── xjz.pkl ├── feed_kenlm.py ├── kenlm ├── .gitignore ├── BUILDING ├── CMakeLists.txt ├── COPYING ├── COPYING.3 ├── COPYING.LESSER.3 ├── Doxyfile ├── GIT_REVISION ├── LICENSE ├── MANIFEST.in ├── README.md ├── clean_query_only.sh ├── cmake │ ├── KenLMFunctions.cmake │ └── modules │ │ └── FindEigen3.cmake ├── compile_query_only.sh ├── include │ ├── lm │ │ ├── bhiksha.hh │ │ ├── binary_format.hh │ │ ├── blank.hh │ │ ├── builder │ │ │ ├── adjust_counts.hh │ │ │ ├── corpus_count.hh │ │ │ ├── discount.hh │ │ │ ├── hash_gamma.hh │ │ │ ├── header_info.hh │ │ │ ├── initial_probabilities.hh │ │ │ ├── interpolate.hh │ │ │ ├── joint_order.hh │ │ │ ├── ngram.hh │ │ │ ├── ngram_stream.hh │ │ │ ├── output.hh │ │ │ ├── pipeline.hh │ │ │ ├── print.hh │ │ │ └── sort.hh │ │ ├── config.hh │ │ ├── enumerate_vocab.hh │ │ ├── facade.hh │ │ ├── filter │ │ │ ├── arpa_io.hh │ │ │ ├── count_io.hh │ │ │ ├── format.hh │ │ │ ├── phrase.hh │ │ │ ├── thread.hh │ │ │ ├── vocab.hh │ │ │ └── wrapper.hh │ │ ├── interpolate │ │ │ └── arpa_to_stream.hh │ │ ├── left.hh │ │ ├── lm_exception.hh │ │ ├── max_order.hh │ │ ├── model.hh │ │ ├── model_type.hh │ │ ├── neural │ │ │ └── wordvecs.hh │ │ ├── ngram_query.hh │ │ ├── partial.hh │ │ ├── quantize.hh │ │ ├── read_arpa.hh │ │ ├── return.hh │ │ ├── search_hashed.hh │ │ ├── search_trie.hh │ │ ├── sizes.hh │ │ ├── state.hh │ │ ├── trie.hh │ │ ├── trie_sort.hh │ │ ├── value.hh │ │ ├── value_build.hh │ │ ├── virtual_interface.hh │ │ ├── vocab.hh │ │ ├── weights.hh │ │ ├── word_index.hh │ │ └── wrappers │ │ │ └── nplm.hh │ └── util │ │ ├── bit_packing.hh │ │ ├── ersatz_progress.hh │ │ ├── exception.hh │ │ ├── fake_ofstream.hh │ │ ├── file.hh │ │ ├── file_piece.hh │ │ ├── fixed_array.hh │ │ ├── getopt.hh │ │ ├── have.hh │ │ ├── joint_sort.hh │ │ ├── mmap.hh │ │ ├── multi_intersection.hh │ │ ├── murmur_hash.hh │ │ ├── parallel_read.hh │ │ ├── pcqueue.hh │ │ ├── pool.hh │ │ ├── probing_hash_table.hh │ │ ├── proxy_iterator.hh │ │ ├── read_compressed.hh │ │ ├── scoped.hh │ │ ├── sized_iterator.hh │ │ ├── sorted_uniform.hh │ │ ├── stream │ │ ├── block.hh │ │ ├── chain.hh │ │ ├── config.hh │ │ ├── io.hh │ │ ├── line_input.hh │ │ ├── multi_progress.hh │ │ ├── multi_stream.hh │ │ ├── sort.hh │ │ ├── stream.hh │ │ └── timer.hh │ │ ├── string_piece.hh │ │ ├── string_piece_hash.hh │ │ ├── thread_pool.hh │ │ ├── tokenize_piece.hh │ │ ├── unistd.hh │ │ └── usage.hh ├── lm │ ├── CMakeLists.txt │ ├── bhiksha.cc │ ├── bhiksha.hh │ ├── binary_format.cc │ ├── binary_format.hh │ ├── blank.hh │ ├── build_binary_main.cc │ ├── builder │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── TODO │ │ ├── adjust_counts.cc │ │ ├── adjust_counts.hh │ │ ├── adjust_counts_test.cc │ │ ├── combine_counts.hh │ │ ├── corpus_count.cc │ │ ├── corpus_count.hh │ │ ├── corpus_count_test.cc │ │ ├── count_ngrams_main.cc │ │ ├── debug_print.hh │ │ ├── discount.hh │ │ ├── dump_counts_main.cc │ │ ├── hash_gamma.hh │ │ ├── header_info.hh │ │ ├── initial_probabilities.cc │ 
│ ├── initial_probabilities.hh │ │ ├── interpolate.cc │ │ ├── interpolate.hh │ │ ├── lmplz_main.cc │ │ ├── output.cc │ │ ├── output.hh │ │ ├── payload.hh │ │ ├── pipeline.cc │ │ └── pipeline.hh │ ├── common │ │ ├── CMakeLists.txt │ │ ├── compare.hh │ │ ├── joint_order.hh │ │ ├── model_buffer.cc │ │ ├── model_buffer.hh │ │ ├── model_buffer_test.cc │ │ ├── ngram.hh │ │ ├── ngram_stream.hh │ │ ├── print.cc │ │ ├── print.hh │ │ ├── renumber.cc │ │ ├── renumber.hh │ │ ├── size_option.cc │ │ ├── size_option.hh │ │ ├── special.hh │ │ └── test_data │ │ │ ├── generate.sh │ │ │ ├── toy0.1 │ │ │ ├── toy0.2 │ │ │ ├── toy0.3 │ │ │ ├── toy0.arpa │ │ │ ├── toy0.kenlm_intermediate │ │ │ ├── toy0.vocab │ │ │ ├── toy1.1 │ │ │ ├── toy1.2 │ │ │ ├── toy1.3 │ │ │ ├── toy1.arpa │ │ │ ├── toy1.kenlm_intermediate │ │ │ └── toy1.vocab │ ├── config.cc │ ├── config.hh │ ├── enumerate_vocab.hh │ ├── facade.hh │ ├── filter │ │ ├── CMakeLists.txt │ │ ├── arpa_io.cc │ │ ├── arpa_io.hh │ │ ├── count_io.hh │ │ ├── filter_main.cc │ │ ├── format.hh │ │ ├── phrase.cc │ │ ├── phrase.hh │ │ ├── phrase_table_vocab_main.cc │ │ ├── thread.hh │ │ ├── vocab.cc │ │ ├── vocab.hh │ │ └── wrapper.hh │ ├── fragment_main.cc │ ├── interpolate │ │ ├── CMakeLists.txt │ │ ├── backoff_matrix.hh │ │ ├── backoff_reunification.cc │ │ ├── backoff_reunification.hh │ │ ├── backoff_reunification_test.cc │ │ ├── bounded_sequence_encoding.cc │ │ ├── bounded_sequence_encoding.hh │ │ ├── bounded_sequence_encoding_test.cc │ │ ├── interpolate_info.hh │ │ ├── interpolate_main.cc │ │ ├── merge_probabilities.cc │ │ ├── merge_probabilities.hh │ │ ├── merge_test │ │ │ ├── test1 │ │ │ ├── test2 │ │ │ ├── test3 │ │ │ ├── test_bad_order │ │ │ └── test_no_unk │ │ ├── merge_vocab.cc │ │ ├── merge_vocab.hh │ │ ├── merge_vocab_test.cc │ │ ├── normalize.cc │ │ ├── normalize.hh │ │ ├── normalize_test.cc │ │ ├── pipeline.cc │ │ ├── pipeline.hh │ │ ├── split_worker.cc │ │ ├── split_worker.hh │ │ ├── streaming_example_main.cc │ │ ├── tune_derivatives.cc │ │ ├── tune_derivatives.hh │ │ ├── tune_derivatives_test.cc │ │ ├── tune_instances.cc │ │ ├── tune_instances.hh │ │ ├── tune_instances_test.cc │ │ ├── tune_matrix.hh │ │ ├── tune_weights.cc │ │ ├── tune_weights.hh │ │ ├── universal_vocab.cc │ │ └── universal_vocab.hh │ ├── kenlm_benchmark_main.cc │ ├── left.hh │ ├── left_test.cc │ ├── lm_exception.cc │ ├── lm_exception.hh │ ├── max_order.hh │ ├── model.cc │ ├── model.hh │ ├── model_test.cc │ ├── model_type.hh │ ├── ngram_query.hh │ ├── partial.hh │ ├── partial_test.cc │ ├── quantize.cc │ ├── quantize.hh │ ├── query_main.cc │ ├── read_arpa.cc │ ├── read_arpa.hh │ ├── return.hh │ ├── search_hashed.cc │ ├── search_hashed.hh │ ├── search_trie.cc │ ├── search_trie.hh │ ├── sizes.cc │ ├── sizes.hh │ ├── state.hh │ ├── test.arpa │ ├── test_nounk.arpa │ ├── trie.cc │ ├── trie.hh │ ├── trie_sort.cc │ ├── trie_sort.hh │ ├── value.hh │ ├── value_build.cc │ ├── value_build.hh │ ├── virtual_interface.cc │ ├── virtual_interface.hh │ ├── vocab.cc │ ├── vocab.hh │ ├── weights.hh │ ├── word_index.hh │ └── wrappers │ │ ├── README │ │ ├── nplm.cc │ │ └── nplm.hh ├── python │ ├── _kenlm.pxd │ ├── example.py │ ├── kenlm.cpp │ └── kenlm.pyx ├── setup.py ├── util │ ├── CMakeLists.txt │ ├── bit_packing.cc │ ├── bit_packing.hh │ ├── bit_packing_test.cc │ ├── cat_compressed_main.cc │ ├── double-conversion │ │ ├── CMakeLists.txt │ │ ├── LICENSE │ │ ├── bignum-dtoa.cc │ │ ├── bignum-dtoa.h │ │ ├── bignum.cc │ │ ├── bignum.h │ │ ├── cached-powers.cc │ │ ├── cached-powers.h │ │ ├── diy-fp.cc │ │ ├── 
diy-fp.h │ │ ├── double-conversion.cc │ │ ├── double-conversion.h │ │ ├── fast-dtoa.cc │ │ ├── fast-dtoa.h │ │ ├── fixed-dtoa.cc │ │ ├── fixed-dtoa.h │ │ ├── ieee.h │ │ ├── strtod.cc │ │ ├── strtod.h │ │ └── utils.h │ ├── ersatz_progress.cc │ ├── ersatz_progress.hh │ ├── exception.cc │ ├── exception.hh │ ├── fake_ostream.hh │ ├── file.cc │ ├── file.hh │ ├── file_piece.cc │ ├── file_piece.hh │ ├── file_piece_test.cc │ ├── file_stream.hh │ ├── fixed_array.hh │ ├── float_to_string.cc │ ├── float_to_string.hh │ ├── getopt.c │ ├── getopt.hh │ ├── have.hh │ ├── integer_to_string.cc │ ├── integer_to_string.hh │ ├── integer_to_string_test.cc │ ├── joint_sort.hh │ ├── joint_sort_test.cc │ ├── mmap.cc │ ├── mmap.hh │ ├── multi_intersection.hh │ ├── multi_intersection_test.cc │ ├── murmur_hash.cc │ ├── murmur_hash.hh │ ├── parallel_read.cc │ ├── parallel_read.hh │ ├── pcqueue.hh │ ├── pcqueue_test.cc │ ├── pool.cc │ ├── pool.hh │ ├── probing_hash_table.hh │ ├── probing_hash_table_benchmark_main.cc │ ├── probing_hash_table_test.cc │ ├── proxy_iterator.hh │ ├── read_compressed.cc │ ├── read_compressed.hh │ ├── read_compressed_test.cc │ ├── scoped.cc │ ├── scoped.hh │ ├── sized_iterator.hh │ ├── sized_iterator_test.cc │ ├── sorted_uniform.hh │ ├── sorted_uniform_test.cc │ ├── spaces.cc │ ├── spaces.hh │ ├── stream │ │ ├── CMakeLists.txt │ │ ├── block.hh │ │ ├── chain.cc │ │ ├── chain.hh │ │ ├── config.hh │ │ ├── count_records.cc │ │ ├── count_records.hh │ │ ├── io.cc │ │ ├── io.hh │ │ ├── io_test.cc │ │ ├── line_input.cc │ │ ├── line_input.hh │ │ ├── multi_progress.cc │ │ ├── multi_progress.hh │ │ ├── multi_stream.hh │ │ ├── rewindable_stream.cc │ │ ├── rewindable_stream.hh │ │ ├── rewindable_stream_test.cc │ │ ├── sort.hh │ │ ├── sort_test.cc │ │ ├── stream.hh │ │ ├── stream_test.cc │ │ └── typed_stream.hh │ ├── string_piece.cc │ ├── string_piece.hh │ ├── string_piece_hash.hh │ ├── string_stream.hh │ ├── string_stream_test.cc │ ├── thread_pool.hh │ ├── tokenize_piece.hh │ ├── tokenize_piece_test.cc │ ├── usage.cc │ └── usage.hh └── windows │ ├── build_binary.vcxproj │ ├── kenlm.sln │ ├── kenlm.vcxproj │ ├── lmplz.vcxproj │ └── ngram_query.vcxproj ├── kenmodels ├── zhwiki_bigram.arpa ├── zhwiki_bigram.klm ├── zhwiki_trigram.arpa └── zhwiki_trigram.klm ├── langconv.py ├── train_kenlm.sh └── zh_wiki.py /data/cncorpus/CorpusCharacterlist.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/CorpusCharacterlist.xls -------------------------------------------------------------------------------- /data/cncorpus/CorpusWordPOSlist.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/CorpusWordPOSlist.xls -------------------------------------------------------------------------------- /data/cncorpus/CorpusWordlist.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/CorpusWordlist.xls -------------------------------------------------------------------------------- /data/cncorpus/现代汉语常用字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/现代汉语常用字表.xls 
-------------------------------------------------------------------------------- /data/cncorpus/现代汉语通用字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/现代汉语通用字表.xls -------------------------------------------------------------------------------- /data/cncorpus/通用规范汉字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/cncorpus/通用规范汉字表.xls -------------------------------------------------------------------------------- /data/common.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/common.pkl -------------------------------------------------------------------------------- /data/sighan/processed/clp14csc_C1_training.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/sighan/processed/clp14csc_C1_training.pkl -------------------------------------------------------------------------------- /data/sighan/processed/sighan15_A2_training.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/sighan/processed/sighan15_A2_training.pkl -------------------------------------------------------------------------------- /data/simp.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/simp.pickle -------------------------------------------------------------------------------- /data/simp_simplified.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/simp_simplified.pickle -------------------------------------------------------------------------------- /data/simp_sm.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/simp_sm.pickle -------------------------------------------------------------------------------- /data/sims.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/sims.pickle -------------------------------------------------------------------------------- /data/xjz.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/xjz.pickle -------------------------------------------------------------------------------- /data/xjz.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/data/xjz.pkl -------------------------------------------------------------------------------- /feed_kenlm.py: 
--------------------------------------------------------------------------------
1 | import codecs
2 | import mmap
3 | 
4 | fpath = "./data/wikipedia/cn_wiki.txt"
5 | with codecs.open(fpath, 'r', encoding='utf-8') as f:
6 |     text = f.readlines()
7 | 
8 | for line in text[:10]:
9 |     print(' '.join(line.strip()), end=' ')
10 |     # print(' '.join(line.strip().split(' / ')), end=' ')
--------------------------------------------------------------------------------
/kenlm/.gitignore:
--------------------------------------------------------------------------------
1 | util/file_piece.cc.gz
2 | *.swp
3 | *.o
4 | doc/
5 | build/
6 | ._*
7 | windows/Win32
8 | windows/x64
9 | windows/*.user
10 | windows/*.sdf
11 | windows/*.opensdf
12 | windows/*.suo
13 | CMakeFiles
14 | cmake_install.cmake
15 | CMakeCache.txt
16 | CTestTestfile.cmake
17 | DartConfiguration.tcl
18 | Makefile
--------------------------------------------------------------------------------
/kenlm/BUILDING:
--------------------------------------------------------------------------------
1 | KenLM has switched to cmake
2 | cmake .
3 | make -j 4
4 | But they recommend building out of tree
5 | mkdir -p build && cd build
6 | cmake ..
7 | make -j 4
8 | 
9 | If you only want the query code and do not care about compression (.gz, .bz2, and .xz):
10 | ./compile_query_only.sh
11 | 
12 | Windows:
13 | The windows directory has visual studio files. Note that you need to compile
14 | the kenlm project before build_binary and ngram_query projects.
--------------------------------------------------------------------------------
/kenlm/GIT_REVISION:
--------------------------------------------------------------------------------
1 | cdd794598ea15dc23a7daaf7a8cf89423c97f7e6
--------------------------------------------------------------------------------
/kenlm/LICENSE:
--------------------------------------------------------------------------------
1 | Most of the code here is licensed under the LGPL. There are exceptions that
2 | have their own licenses, listed below. See comments in those files for more
3 | details.
4 | 
5 | util/getopt.* is getopt for Windows
6 | util/murmur_hash.cc
7 | util/string_piece.hh and util/string_piece.cc
8 | util/double-conversion/LICENSE covers util/double-conversion except the build files
9 | util/file.cc contains a modified implementation of mkstemp under the LGPL
10 | util/integer_to_string.* is BSD
11 | 
12 | For the rest:
13 | 
14 | KenLM is free software: you can redistribute it and/or modify
15 | it under the terms of the GNU Lesser General Public License as published
16 | by the Free Software Foundation, either version 2.1 of the License, or
17 | (at your option) any later version.
18 | 
19 | KenLM is distributed in the hope that it will be useful,
20 | but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | GNU Lesser General Public License for more details.
23 | 
24 | You should have received a copy of the GNU Lesser General Public License 2.1
25 | along with KenLM code. If not, see <http://www.gnu.org/licenses/lgpl-2.1.html>.
26 | -------------------------------------------------------------------------------- /kenlm/MANIFEST.in: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | include setup.py 3 | include lm/*.cc 4 | include lm/*.hh 5 | include python/*.cpp 6 | include util/*.cc 7 | include util/*.hh 8 | include util/double-conversion/*.cc 9 | include util/double-conversion/*.h 10 | -------------------------------------------------------------------------------- /kenlm/clean_query_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf {lm,util,util/double-conversion}/*.o bin/{query,build_binary} 3 | -------------------------------------------------------------------------------- /kenlm/compile_query_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #This is just an example compilation. You should integrate these files into your build system. Boost jam is provided and preferred. 3 | 4 | echo You must use ./bjam if you want language model estimation, filtering, or support for compressed files \(.gz, .bz2, .xz\) 1>&2 5 | 6 | rm {lm,util}/*.o 2>/dev/null 7 | set -e 8 | 9 | CXX=${CXX:-g++} 10 | 11 | CXXFLAGS+=" -I. -O3 -DNDEBUG -DKENLM_MAX_ORDER=6" 12 | 13 | #If this fails for you, consider using bjam. 14 | if [ ${#NPLM} != 0 ]; then 15 | CXXFLAGS+=" -DHAVE_NPLM -lneuralLM -L$NPLM/src -I$NPLM/src -lboost_thread-mt -fopenmp" 16 | ADDED_PATHS="lm/wrappers/*.cc" 17 | fi 18 | echo 'Compiling with '$CXX $CXXFLAGS 19 | 20 | #Grab all cc files in these directories except those ending in test.cc or main.cc 21 | objects="" 22 | for i in util/double-conversion/*.cc util/*.cc lm/*.cc $ADDED_PATHS; do 23 | if [ "${i%test.cc}" == "$i" ] && [ "${i%main.cc}" == "$i" ]; then 24 | $CXX $CXXFLAGS -c $i -o ${i%.cc}.o 25 | objects="$objects ${i%.cc}.o" 26 | fi 27 | done 28 | 29 | mkdir -p bin 30 | if [ "$(uname)" != Darwin ]; then 31 | CXXFLAGS="$CXXFLAGS -lrt" 32 | fi 33 | $CXX lm/build_binary_main.cc $objects -o bin/build_binary $CXXFLAGS $LDFLAGS 34 | $CXX lm/query_main.cc $objects -o bin/query $CXXFLAGS $LDFLAGS 35 | -------------------------------------------------------------------------------- /kenlm/include/lm/blank.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BLANK_H 2 | #define LM_BLANK_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace lm { 10 | namespace ngram { 11 | 12 | /* Suppose "foo bar" appears with zero backoff but there is no trigram 13 | * beginning with these words. Then, when scoring "foo bar", the model could 14 | * return out_state containing "bar" or even null context if "bar" also has no 15 | * backoff and is never followed by another word. Then the backoff is set to 16 | * kNoExtensionBackoff. If the n-gram might be extended, then out_state must 17 | * contain the full n-gram, in which case kExtensionBackoff is set. In any 18 | * case, if an n-gram has non-zero backoff, the full state is returned so 19 | * backoff can be properly charged. 20 | * These differ only in sign bit because the backoff is in fact zero in either 21 | * case. 
22 |  */
23 | const float kNoExtensionBackoff = -0.0;
24 | const float kExtensionBackoff = 0.0;
25 | const uint64_t kNoExtensionQuant = 0;
26 | const uint64_t kExtensionQuant = 1;
27 | 
28 | inline void SetExtension(float &backoff) {
29 |   if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
30 | }
31 | 
32 | // This compiles down nicely.
33 | inline bool HasExtension(const float &backoff) {
34 |   typedef union { float f; uint32_t i; } UnionValue;
35 |   UnionValue compare, interpret;
36 |   compare.f = kNoExtensionBackoff;
37 |   interpret.f = backoff;
38 |   return compare.i != interpret.i;
39 | }
40 | 
41 | } // namespace ngram
42 | } // namespace lm
43 | #endif // LM_BLANK_H
44 | 
--------------------------------------------------------------------------------
/kenlm/include/lm/builder/adjust_counts.hh:
--------------------------------------------------------------------------------
1 | #ifndef LM_BUILDER_ADJUST_COUNTS_H
2 | #define LM_BUILDER_ADJUST_COUNTS_H
3 | 
4 | #include "lm/builder/discount.hh"
5 | #include "lm/lm_exception.hh"
6 | #include "util/exception.hh"
7 | 
8 | #include
9 | 
10 | #include
11 | 
12 | namespace util { namespace stream { class ChainPositions; } }
13 | 
14 | namespace lm {
15 | namespace builder {
16 | 
17 | class BadDiscountException : public util::Exception {
18 |   public:
19 |     BadDiscountException() throw();
20 |     ~BadDiscountException() throw();
21 | };
22 | 
23 | struct DiscountConfig {
24 |   // Overrides discounts for orders [1,overwrite.size()].
25 |   std::vector<Discount> overwrite;
26 |   // If discounting fails for an order, copy them from here.
27 |   Discount fallback;
28 |   // What to do when discounts are out of range or would trigger division by
29 |   // zero. If it does something other than THROW_UP, use fallback.
30 |   WarningAction bad_action;
31 | };
32 | 
33 | /* Compute adjusted counts.
34 |  * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
35 |  * Output: [1,N]-grams with adjusted counts.
36 |  * [1,N)-grams are in suffix order
37 |  * N-grams are in undefined order (they're going to be sorted anyway).
38 |  */
39 | class AdjustCounts {
40 |   public:
41 |     // counts: output
42 |     // counts_pruned: output
43 |     // discounts: mostly output. If the input already has entries, they will be kept.
44 |     // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
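// An editorial illustration (not part of the original header) of what
// "adjusted" means above, per the lmplz algorithm: for every order below N,
// an n-gram's adjusted count is the number of distinct words that extend it
// to the left, not its raw frequency. If "San Francisco" is the only bigram
// ending in "Francisco", the adjusted unigram count of "Francisco" is 1 no
// matter how often it occurs; raw counts are kept only at the highest order N.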
45 |     AdjustCounts(
46 |         const std::vector<uint64_t> &prune_thresholds,
47 |         std::vector<uint64_t> &counts,
48 |         std::vector<uint64_t> &counts_pruned,
49 |         const std::vector<bool> &prune_words,
50 |         const DiscountConfig &discount_config,
51 |         std::vector<Discount> &discounts)
52 |       : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
53 |         prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
54 |     {}
55 | 
56 |     void Run(const util::stream::ChainPositions &positions);
57 | 
58 |   private:
59 |     const std::vector<uint64_t> &prune_thresholds_;
60 |     std::vector<uint64_t> &counts_;
61 |     std::vector<uint64_t> &counts_pruned_;
62 |     const std::vector<bool> &prune_words_;
63 | 
64 |     DiscountConfig discount_config_;
65 |     std::vector<Discount> &discounts_;
66 | };
67 | 
68 | } // namespace builder
69 | } // namespace lm
70 | 
71 | #endif // LM_BUILDER_ADJUST_COUNTS_H
72 | 
73 | 
--------------------------------------------------------------------------------
/kenlm/include/lm/builder/corpus_count.hh:
--------------------------------------------------------------------------------
1 | #ifndef LM_BUILDER_CORPUS_COUNT_H
2 | #define LM_BUILDER_CORPUS_COUNT_H
3 | 
4 | #include "lm/lm_exception.hh"
5 | #include "lm/word_index.hh"
6 | #include "util/scoped.hh"
7 | 
8 | #include
9 | #include
10 | #include
11 | #include
12 | 
13 | namespace util {
14 | class FilePiece;
15 | namespace stream {
16 | class ChainPosition;
17 | } // namespace stream
18 | } // namespace util
19 | 
20 | namespace lm {
21 | namespace builder {
22 | 
23 | class CorpusCount {
24 |   public:
25 |     // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size
26 |     static float DedupeMultiplier(std::size_t order);
27 | 
28 |     // How much memory vocabulary will use based on estimated size of the vocab.
29 |     static std::size_t VocabUsage(std::size_t vocab_estimate);
30 | 
31 |     // token_count: out.
32 |     // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
33 | CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); 34 | 35 | void Run(const util::stream::ChainPosition &position); 36 | 37 | private: 38 | util::FilePiece &from_; 39 | int vocab_write_; 40 | uint64_t &token_count_; 41 | WordIndex &type_count_; 42 | std::vector& prune_words_; 43 | const std::string& prune_vocab_filename_; 44 | 45 | std::size_t dedupe_mem_size_; 46 | util::scoped_malloc dedupe_mem_; 47 | 48 | WarningAction disallowed_symbol_action_; 49 | }; 50 | 51 | } // namespace builder 52 | } // namespace lm 53 | #endif // LM_BUILDER_CORPUS_COUNT_H 54 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/discount.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_DISCOUNT_H 2 | #define LM_BUILDER_DISCOUNT_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace lm { 9 | namespace builder { 10 | 11 | struct Discount { 12 | float amount[4]; 13 | 14 | float Get(uint64_t count) const { 15 | return amount[std::min(count, 3)]; 16 | } 17 | 18 | float Apply(uint64_t count) const { 19 | return static_cast(count) - Get(count); 20 | } 21 | }; 22 | 23 | } // namespace builder 24 | } // namespace lm 25 | 26 | #endif // LM_BUILDER_DISCOUNT_H 27 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/hash_gamma.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HASH_GAMMA__ 2 | #define LM_BUILDER_HASH_GAMMA__ 3 | 4 | #include 5 | 6 | namespace lm { namespace builder { 7 | 8 | #pragma pack(push) 9 | #pragma pack(4) 10 | 11 | struct HashGamma { 12 | uint64_t hash_value; 13 | float gamma; 14 | }; 15 | 16 | #pragma pack(pop) 17 | 18 | }} // namespaces 19 | #endif // LM_BUILDER_HASH_GAMMA__ 20 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/header_info.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HEADER_INFO_H 2 | #define LM_BUILDER_HEADER_INFO_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Some configuration info that is used to add 9 | // comments to the beginning of an ARPA file 10 | struct HeaderInfo { 11 | std::string input_file; 12 | uint64_t token_count; 13 | std::vector counts_pruned; 14 | 15 | HeaderInfo() {} 16 | 17 | HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector &counts_pruned_in) 18 | : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} 19 | 20 | // TODO: Add smoothing type 21 | // TODO: More info if multiple models were interpolated 22 | }; 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/initial_probabilities.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INITIAL_PROBABILITIES_H 2 | #define LM_BUILDER_INITIAL_PROBABILITIES_H 3 | 4 | #include "lm/builder/discount.hh" 5 | #include "util/stream/config.hh" 6 | 7 | #include 8 | 9 | namespace util { namespace stream { class Chains; } } 10 | 11 | namespace lm { 12 | namespace builder { 13 | 14 | struct InitialProbabilitiesConfig { 15 | // These should be small buffers to keep the adder from getting too 
far ahead 16 | util::stream::ChainConfig adder_in; 17 | util::stream::ChainConfig adder_out; 18 | // SRILM doesn't normally interpolate unigrams. 19 | bool interpolate_unigrams; 20 | }; 21 | 22 | /* Compute initial (uninterpolated) probabilities 23 | * primary: the normal chain of n-grams. Incoming is context sorted adjusted 24 | * counts. Outgoing has uninterpolated probabilities for use by Interpolate. 25 | * second_in: a second copy of the primary input. Discard the output. 26 | * gamma_out: Computed gamma values are output on these chains in suffix order. 27 | * The values are bare floats and should be buffered for interpolation to 28 | * use. 29 | */ 30 | void InitialProbabilities( 31 | const InitialProbabilitiesConfig &config, 32 | const std::vector &discounts, 33 | util::stream::Chains &primary, 34 | util::stream::Chains &second_in, 35 | util::stream::Chains &gamma_out, 36 | const std::vector &prune_thresholds, 37 | bool prune_vocab); 38 | 39 | } // namespace builder 40 | } // namespace lm 41 | 42 | #endif // LM_BUILDER_INITIAL_PROBABILITIES_H 43 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/interpolate.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INTERPOLATE_H 2 | #define LM_BUILDER_INTERPOLATE_H 3 | 4 | #include "util/stream/multi_stream.hh" 5 | 6 | #include 7 | 8 | #include 9 | 10 | namespace lm { namespace builder { 11 | 12 | /* Interpolate step. 13 | * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from 14 | * InitialProbabilities. 15 | * Output: suffix sorted n-grams with complete probability 16 | */ 17 | class Interpolate { 18 | public: 19 | // Normally vocab_size is the unigram count-1 (since p() = 0) but might 20 | // be larger when the user specifies a consistent vocabulary size. 21 | explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector &prune_thresholds, bool prune_vocab, bool output_q_); 22 | 23 | void Run(const util::stream::ChainPositions &positions); 24 | 25 | private: 26 | float uniform_prob_; 27 | util::stream::ChainPositions backoffs_; 28 | const std::vector prune_thresholds_; 29 | bool prune_vocab_; 30 | bool output_q_; 31 | }; 32 | 33 | }} // namespaces 34 | #endif // LM_BUILDER_INTERPOLATE_H 35 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/joint_order.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_JOINT_ORDER_H 2 | #define LM_BUILDER_JOINT_ORDER_H 3 | 4 | #include "lm/builder/ngram_stream.hh" 5 | #include "lm/lm_exception.hh" 6 | 7 | #ifdef DEBUG 8 | #include "util/fixed_array.hh" 9 | #include 10 | #endif 11 | 12 | #include 13 | 14 | namespace lm { namespace builder { 15 | 16 | template void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { 17 | // Allow matching to reference streams[-1]. 18 | NGramStreams streams_with_dummy; 19 | streams_with_dummy.InitWithDummy(positions); 20 | NGramStream *streams = streams_with_dummy.begin() + 1; 21 | 22 | unsigned int order; 23 | for (order = 0; order < positions.size() && streams[order]; ++order) {} 24 | assert(order); // should always have . 25 | 26 | // Debugging only: call comparison function to sanity check order. 
27 | #ifdef DEBUG 28 | util::FixedArray less_compare(order); 29 | for (unsigned i = 0; i < order; ++i) 30 | less_compare.push_back(i + 1); 31 | #endif // DEBUG 32 | 33 | unsigned int current = 0; 34 | while (true) { 35 | // Does the context match the lower one? 36 | if (!memcmp(streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { 37 | callback.Enter(current, *streams[current]); 38 | // Transition to looking for extensions. 39 | if (++current < order) continue; 40 | } 41 | #ifdef DEBUG 42 | // match_check[current - 1] matches current-grams 43 | // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). 44 | else if (!less_compare[current - 1](streams[static_cast(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { 45 | std::cerr << "Stream out of order detected" << std::endl; 46 | abort(); 47 | } 48 | #endif // DEBUG 49 | // No extension left. 50 | while(true) { 51 | assert(current > 0); 52 | --current; 53 | callback.Exit(current, *streams[current]); 54 | 55 | if (++streams[current]) break; 56 | 57 | UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); 58 | 59 | order = current; 60 | if (!order) return; 61 | } 62 | } 63 | } 64 | 65 | }} // namespaces 66 | 67 | #endif // LM_BUILDER_JOINT_ORDER_H 68 | -------------------------------------------------------------------------------- /kenlm/include/lm/builder/ngram_stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_NGRAM_STREAM_H 2 | #define LM_BUILDER_NGRAM_STREAM_H 3 | 4 | #include "lm/builder/ngram.hh" 5 | #include "util/stream/chain.hh" 6 | #include "util/stream/multi_stream.hh" 7 | #include "util/stream/stream.hh" 8 | 9 | #include 10 | 11 | namespace lm { namespace builder { 12 | 13 | class NGramStream { 14 | public: 15 | NGramStream() : gram_(NULL, 0) {} 16 | 17 | NGramStream(const util::stream::ChainPosition &position) : gram_(NULL, 0) { 18 | Init(position); 19 | } 20 | 21 | void Init(const util::stream::ChainPosition &position) { 22 | stream_.Init(position); 23 | gram_ = NGram(stream_.Get(), NGram::OrderFromSize(position.GetChain().EntrySize())); 24 | } 25 | 26 | NGram &operator*() { return gram_; } 27 | const NGram &operator*() const { return gram_; } 28 | 29 | NGram *operator->() { return &gram_; } 30 | const NGram *operator->() const { return &gram_; } 31 | 32 | void *Get() { return stream_.Get(); } 33 | const void *Get() const { return stream_.Get(); } 34 | 35 | operator bool() const { return stream_; } 36 | bool operator!() const { return !stream_; } 37 | void Poison() { stream_.Poison(); } 38 | 39 | NGramStream &operator++() { 40 | ++stream_; 41 | gram_.ReBase(stream_.Get()); 42 | return *this; 43 | } 44 | 45 | private: 46 | NGram gram_; 47 | util::stream::Stream stream_; 48 | }; 49 | 50 | inline util::stream::Chain &operator>>(util::stream::Chain &chain, NGramStream &str) { 51 | str.Init(chain.Add()); 52 | return chain; 53 | } 54 | 55 | typedef util::stream::GenericStreams NGramStreams; 56 | 57 | }} // namespaces 58 | #endif // LM_BUILDER_NGRAM_STREAM_H 59 | -------------------------------------------------------------------------------- /kenlm/include/lm/enumerate_vocab.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_ENUMERATE_VOCAB_H 2 | #define LM_ENUMERATE_VOCAB_H 3 | 4 | #include 
"lm/word_index.hh" 5 | #include "util/string_piece.hh" 6 | 7 | namespace lm { 8 | 9 | /* If you need the actual strings in the vocabulary, inherit from this class 10 | * and implement Add. Then put a pointer in Config.enumerate_vocab; it does 11 | * not take ownership. Add is called once per vocab word. index starts at 0 12 | * and increases by 1 each time. This is only used by the Model constructor; 13 | * the pointer is not retained by the class. 14 | */ 15 | class EnumerateVocab { 16 | public: 17 | virtual ~EnumerateVocab() {} 18 | 19 | virtual void Add(WordIndex index, const StringPiece &str) = 0; 20 | 21 | protected: 22 | EnumerateVocab() {} 23 | }; 24 | 25 | } // namespace lm 26 | 27 | #endif // LM_ENUMERATE_VOCAB_H 28 | 29 | -------------------------------------------------------------------------------- /kenlm/include/lm/filter/wrapper.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_FILTER_WRAPPER_H 2 | #define LM_FILTER_WRAPPER_H 3 | 4 | #include "util/string_piece.hh" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace lm { 11 | 12 | // Provide a single-output filter with the same interface as a 13 | // multiple-output filter so clients code against one interface. 14 | template class BinaryFilter { 15 | public: 16 | // Binary modes are just references (and a set) and it makes the API cleaner to copy them. 17 | explicit BinaryFilter(Binary binary) : binary_(binary) {} 18 | 19 | template void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) { 20 | if (binary_.PassNGram(begin, end)) 21 | output.AddNGram(line); 22 | } 23 | 24 | template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { 25 | AddNGram(util::TokenIter(ngram, ' '), util::TokenIter::end(), line, output); 26 | } 27 | 28 | void Flush() const {} 29 | 30 | private: 31 | Binary binary_; 32 | }; 33 | 34 | // Wrap another filter to pay attention only to context words 35 | template class ContextFilter { 36 | public: 37 | typedef FilterT Filter; 38 | 39 | explicit ContextFilter(Filter &backend) : backend_(backend) {} 40 | 41 | template void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) { 42 | // Find beginning of string or last space. 43 | const char *last_space; 44 | for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {} 45 | backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output); 46 | } 47 | 48 | void Flush() const {} 49 | 50 | private: 51 | Filter backend_; 52 | }; 53 | 54 | } // namespace lm 55 | 56 | #endif // LM_FILTER_WRAPPER_H 57 | -------------------------------------------------------------------------------- /kenlm/include/lm/interpolate/arpa_to_stream.hh: -------------------------------------------------------------------------------- 1 | #include "lm/read_arpa.hh" 2 | #include "util/file_piece.hh" 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace util { namespace stream { class ChainPositions; } } 9 | 10 | namespace lm { 11 | 12 | namespace ngram { 13 | template class GrowableVocab; 14 | class WriteUniqueWords; 15 | } // namespace ngram 16 | 17 | namespace interpolate { 18 | 19 | class ARPAToStream { 20 | public: 21 | // Takes ownership of fd. 
22 | explicit ARPAToStream(int fd, ngram::GrowableVocab &vocab); 23 | 24 | std::size_t Order() const { return counts_.size(); } 25 | 26 | const std::vector &Counts() const { return counts_; } 27 | 28 | void Run(const util::stream::ChainPositions &positions); 29 | 30 | private: 31 | util::FilePiece in_; 32 | 33 | std::vector counts_; 34 | 35 | ngram::GrowableVocab &vocab_; 36 | }; 37 | 38 | }} // namespaces 39 | -------------------------------------------------------------------------------- /kenlm/include/lm/lm_exception.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_LM_EXCEPTION_H 2 | #define LM_LM_EXCEPTION_H 3 | 4 | // Named to avoid conflict with util/exception.hh. 5 | 6 | #include "util/exception.hh" 7 | #include "util/string_piece.hh" 8 | 9 | #include 10 | #include 11 | 12 | namespace lm { 13 | 14 | typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction; 15 | 16 | class ConfigException : public util::Exception { 17 | public: 18 | ConfigException() throw(); 19 | ~ConfigException() throw(); 20 | }; 21 | 22 | class LoadException : public util::Exception { 23 | public: 24 | virtual ~LoadException() throw(); 25 | 26 | protected: 27 | LoadException() throw(); 28 | }; 29 | 30 | class FormatLoadException : public LoadException { 31 | public: 32 | FormatLoadException() throw(); 33 | ~FormatLoadException() throw(); 34 | }; 35 | 36 | class VocabLoadException : public LoadException { 37 | public: 38 | virtual ~VocabLoadException() throw(); 39 | VocabLoadException() throw(); 40 | }; 41 | 42 | class SpecialWordMissingException : public VocabLoadException { 43 | public: 44 | explicit SpecialWordMissingException() throw(); 45 | ~SpecialWordMissingException() throw(); 46 | }; 47 | 48 | } // namespace lm 49 | 50 | #endif // LM_LM_EXCEPTION 51 | -------------------------------------------------------------------------------- /kenlm/include/lm/max_order.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_MAX_ORDER_H 2 | #define LM_MAX_ORDER_H 3 | /* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. 4 | * If not, this is the default maximum order. 5 | * Having this limit means that State can be 6 | * (kMaxOrder - 1) * sizeof(float) bytes instead of 7 | * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead 8 | */ 9 | #ifndef KENLM_ORDER_MESSAGE 10 | #define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." 11 | #endif 12 | 13 | #endif // LM_MAX_ORDER_H 14 | -------------------------------------------------------------------------------- /kenlm/include/lm/model_type.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_MODEL_TYPE_H 2 | #define LM_MODEL_TYPE_H 3 | 4 | namespace lm { 5 | namespace ngram { 6 | 7 | /* Not the best numbering system, but it grew this way for historical reasons 8 | * and I want to preserve existing binary files. */ 9 | typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; 10 | 11 | // Historical names. 
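// (Editorial note, not in the original header: the aliases just below keep
//  those historical names, while kQuantAdd and kArrayAdd capture the enum's
//  layout so variants can be derived arithmetically, e.g.
//  TRIE + kQuantAdd == QUANT_TRIE (2 + 1 == 3) and
//  TRIE + kQuantAdd + kArrayAdd == QUANT_ARRAY_TRIE (2 + 1 + 2 == 5).)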
12 | const ModelType HASH_PROBING = PROBING; 13 | const ModelType TRIE_SORTED = TRIE; 14 | const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; 15 | const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE; 16 | const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE; 17 | 18 | const static ModelType kQuantAdd = static_cast(QUANT_TRIE - TRIE); 19 | const static ModelType kArrayAdd = static_cast(ARRAY_TRIE - TRIE); 20 | 21 | } // namespace ngram 22 | } // namespace lm 23 | #endif // LM_MODEL_TYPE_H 24 | -------------------------------------------------------------------------------- /kenlm/include/lm/neural/wordvecs.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_NEURAL_WORDVECS_H 2 | #define LM_NEURAL_WORDVECS_H 3 | 4 | #include "util/scoped.hh" 5 | #include "lm/vocab.hh" 6 | 7 | #include 8 | 9 | namespace util { class FilePiece; } 10 | 11 | namespace lm { 12 | namespace neural { 13 | 14 | class WordVecs { 15 | public: 16 | // Columns of the matrix are word vectors. The column index is the word. 17 | typedef Eigen::Matrix Storage; 18 | 19 | /* The file should begin with a line stating the number of word vectors and 20 | * the length of the vectors. Then it's followed by lines containing a 21 | * word followed by floating-point values. 22 | */ 23 | explicit WordVecs(util::FilePiece &in); 24 | 25 | const Storage &Vectors() const { return vecs_; } 26 | 27 | WordIndex Index(StringPiece str) const { return vocab_.Index(str); } 28 | 29 | private: 30 | util::scoped_malloc vocab_backing_; 31 | ngram::ProbingVocabulary vocab_; 32 | 33 | Storage vecs_; 34 | }; 35 | 36 | }} // namespaces 37 | 38 | #endif // LM_NEURAL_WORDVECS_H 39 | -------------------------------------------------------------------------------- /kenlm/include/lm/return.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_RETURN_H 2 | #define LM_RETURN_H 3 | 4 | #include 5 | 6 | namespace lm { 7 | /* Structure returned by scoring routines. */ 8 | struct FullScoreReturn { 9 | // log10 probability 10 | float prob; 11 | 12 | /* The length of n-gram matched. Do not use this for recombination. 13 | * Consider a model containing only the following n-grams: 14 | * -1 foo 15 | * -3.14 bar 16 | * -2.718 baz -5 17 | * -6 foo bar 18 | * 19 | * If you score ``bar'' then ngram_length is 1 and recombination state is the 20 | * empty string because bar has zero backoff and does not extend to the 21 | * right. 22 | * If you score ``foo'' then ngram_length is 1 and recombination state is 23 | * ``foo''. 24 | * 25 | * Ideally, keep output states around and compare them. Failing that, 26 | * get out_state.ValidLength() and use that length for recombination. 27 | */ 28 | unsigned char ngram_length; 29 | 30 | /* Left extension information. If independent_left is set, then prob is 31 | * independent of words to the left (up to additional backoff). Otherwise, 32 | * extend_left indicates how to efficiently extend further to the left. 33 | */ 34 | bool independent_left; 35 | uint64_t extend_left; // Defined only if independent_left 36 | 37 | // Rest cost for extension to the left. 
38 | float rest; 39 | }; 40 | 41 | } // namespace lm 42 | #endif // LM_RETURN_H 43 | -------------------------------------------------------------------------------- /kenlm/include/lm/sizes.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_SIZES_H 2 | #define LM_SIZES_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace lm { namespace ngram { 9 | 10 | struct Config; 11 | 12 | void ShowSizes(const std::vector &counts, const lm::ngram::Config &config); 13 | void ShowSizes(const std::vector &counts); 14 | void ShowSizes(const char *file, const lm::ngram::Config &config); 15 | 16 | }} // namespaces 17 | #endif // LM_SIZES_H 18 | -------------------------------------------------------------------------------- /kenlm/include/lm/weights.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_WEIGHTS_H 2 | #define LM_WEIGHTS_H 3 | 4 | // Weights for n-grams. Probability and possibly a backoff. 5 | 6 | namespace lm { 7 | struct Prob { 8 | float prob; 9 | }; 10 | // No inheritance so this will be a POD. 11 | struct ProbBackoff { 12 | float prob; 13 | float backoff; 14 | }; 15 | struct RestWeights { 16 | float prob; 17 | float backoff; 18 | float rest; 19 | }; 20 | 21 | } // namespace lm 22 | #endif // LM_WEIGHTS_H 23 | -------------------------------------------------------------------------------- /kenlm/include/lm/word_index.hh: -------------------------------------------------------------------------------- 1 | // Separate header because this is used often. 2 | #ifndef LM_WORD_INDEX_H 3 | #define LM_WORD_INDEX_H 4 | 5 | #include 6 | 7 | namespace lm { 8 | typedef unsigned int WordIndex; 9 | const WordIndex kMaxWordIndex = UINT_MAX; 10 | } // namespace lm 11 | 12 | typedef lm::WordIndex LMWordIndex; 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /kenlm/include/lm/wrappers/nplm.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_WRAPPERS_NPLM_H 2 | #define LM_WRAPPERS_NPLM_H 3 | 4 | #include "lm/facade.hh" 5 | #include "lm/max_order.hh" 6 | #include "util/string_piece.hh" 7 | 8 | #include 9 | #include 10 | 11 | /* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang 12 | * and Victoria Fossum." 13 | * http://nlg.isi.edu/software/nplm/ 14 | */ 15 | 16 | namespace nplm { 17 | class vocabulary; 18 | class neuralLM; 19 | } // namespace nplm 20 | 21 | namespace lm { 22 | namespace np { 23 | 24 | class Vocabulary : public base::Vocabulary { 25 | public: 26 | Vocabulary(const nplm::vocabulary &vocab); 27 | 28 | ~Vocabulary(); 29 | 30 | WordIndex Index(const std::string &str) const; 31 | 32 | // TODO: lobby them to support StringPiece 33 | WordIndex Index(const StringPiece &str) const { 34 | return Index(std::string(str.data(), str.size())); 35 | } 36 | 37 | lm::WordIndex NullWord() const { return null_word_; } 38 | 39 | private: 40 | const nplm::vocabulary &vocab_; 41 | 42 | const lm::WordIndex null_word_; 43 | }; 44 | 45 | // Sorry for imposing my limitations on your code. 46 | #define NPLM_MAX_ORDER 7 47 | 48 | struct State { 49 | WordIndex words[NPLM_MAX_ORDER - 1]; 50 | }; 51 | 52 | class Model : public lm::base::ModelFacade { 53 | private: 54 | typedef lm::base::ModelFacade P; 55 | 56 | public: 57 | // Does this look like an NPLM? 
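// (Editorial sketch, assuming only the declarations below: a caller can
//  probe the format before committing to a model type, e.g.
//    if (lm::np::Model::Recognize(path)) { lm::np::Model model(path); }
//  and fall back to an ARPA/KenLM loader otherwise.)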
58 | static bool Recognize(const std::string &file); 59 | 60 | explicit Model(const std::string &file, std::size_t cache_size = 1 << 20); 61 | 62 | ~Model(); 63 | 64 | FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const; 65 | 66 | FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; 67 | 68 | private: 69 | boost::scoped_ptr base_instance_; 70 | 71 | mutable boost::thread_specific_ptr backend_; 72 | 73 | Vocabulary vocab_; 74 | 75 | lm::WordIndex null_word_; 76 | 77 | const std::size_t cache_size_; 78 | }; 79 | 80 | } // namespace np 81 | } // namespace lm 82 | 83 | #endif // LM_WRAPPERS_NPLM_H 84 | -------------------------------------------------------------------------------- /kenlm/include/util/ersatz_progress.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_ERSATZ_PROGRESS_H 2 | #define UTIL_ERSATZ_PROGRESS_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | // Ersatz version of boost::progress so core language model doesn't depend on 10 | // boost. Also adds option to print nothing. 11 | 12 | namespace util { 13 | 14 | extern const char kProgressBanner[]; 15 | 16 | class ErsatzProgress { 17 | public: 18 | // No output. 19 | ErsatzProgress(); 20 | 21 | // Null means no output. The null value is useful for passing along the ostream pointer from another caller. 22 | explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); 23 | 24 | ~ErsatzProgress(); 25 | 26 | ErsatzProgress &operator++() { 27 | if (++current_ >= next_) Milestone(); 28 | return *this; 29 | } 30 | 31 | ErsatzProgress &operator+=(uint64_t amount) { 32 | if ((current_ += amount) >= next_) Milestone(); 33 | return *this; 34 | } 35 | 36 | void Set(uint64_t to) { 37 | if ((current_ = to) >= next_) Milestone(); 38 | } 39 | 40 | void Finished() { 41 | Set(complete_); 42 | } 43 | 44 | private: 45 | void Milestone(); 46 | 47 | uint64_t current_, next_, complete_; 48 | unsigned char stones_written_; 49 | std::ostream *out_; 50 | 51 | // noncopyable 52 | ErsatzProgress(const ErsatzProgress &other); 53 | ErsatzProgress &operator=(const ErsatzProgress &other); 54 | }; 55 | 56 | } // namespace util 57 | 58 | #endif // UTIL_ERSATZ_PROGRESS_H 59 | -------------------------------------------------------------------------------- /kenlm/include/util/getopt.hh: -------------------------------------------------------------------------------- 1 | /* 2 | POSIX getopt for Windows 3 | 4 | AT&T Public License 5 | 6 | Code given out at the 1985 UNIFORUM conference in Dallas. 7 | */ 8 | 9 | #ifdef __GNUC__ 10 | #include 11 | #endif 12 | #ifndef __GNUC__ 13 | 14 | #ifndef UTIL_GETOPT_H 15 | #define UTIL_GETOPT_H 16 | 17 | #ifdef __cplusplus 18 | extern "C" { 19 | #endif 20 | 21 | extern int opterr; 22 | extern int optind; 23 | extern int optopt; 24 | extern char *optarg; 25 | extern int getopt(int argc, char **argv, char *opts); 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | 31 | #endif /* UTIL_GETOPT_H */ 32 | #endif /* __GNUC__ */ 33 | 34 | -------------------------------------------------------------------------------- /kenlm/include/util/have.hh: -------------------------------------------------------------------------------- 1 | /* Optional packages. You might want to integrate this with your build system e.g. config.h from ./configure. 
*/ 2 | #ifndef UTIL_HAVE_H 3 | #define UTIL_HAVE_H 4 | 5 | #ifdef HAVE_CONFIG_H 6 | #include "config.h" 7 | #endif 8 | 9 | #ifndef HAVE_ICU 10 | //#define HAVE_ICU 11 | #endif 12 | 13 | #endif // UTIL_HAVE_H 14 | -------------------------------------------------------------------------------- /kenlm/include/util/murmur_hash.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_MURMUR_HASH_H 2 | #define UTIL_MURMUR_HASH_H 3 | #include 4 | #include 5 | 6 | namespace util { 7 | 8 | // 64-bit machine version 9 | uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); 10 | // 32-bit machine version (not the same function as above) 11 | uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); 12 | // Use the version for this arch. Because the values differ across 13 | // architectures, really only use it for in-memory structures. 14 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); 15 | 16 | } // namespace util 17 | 18 | #endif // UTIL_MURMUR_HASH_H 19 | -------------------------------------------------------------------------------- /kenlm/include/util/parallel_read.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_PARALLEL_READ__ 2 | #define UTIL_PARALLEL_READ__ 3 | 4 | /* Read pieces of a file in parallel. This has a very specific use case: 5 | * reading files from Lustre is CPU bound so multiple threads actually 6 | * increases throughput. Speed matters when an LM takes a terabyte. 7 | */ 8 | 9 | #include 10 | #include 11 | 12 | namespace util { 13 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset); 14 | } // namespace util 15 | 16 | #endif // UTIL_PARALLEL_READ__ 17 | -------------------------------------------------------------------------------- /kenlm/include/util/pool.hh: -------------------------------------------------------------------------------- 1 | // Very simple pool. It can only allocate memory. And all of the memory it 2 | // allocates must be freed at the same time. 
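// A minimal usage sketch (editorial, based only on the interface declared
// below): allocation is a cheap pointer bump and nothing is freed
// individually.
//
//   util::Pool pool;
//   char *buf = static_cast<char *>(pool.Allocate(1024));
//   float *f = static_cast<float *>(pool.Allocate(4 * sizeof(float)));
//   // ... use buf and f; there is no per-allocation free ...
//   pool.FreeAll();  // the only way memory is returned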
3 | 
4 | #ifndef UTIL_POOL_H
5 | #define UTIL_POOL_H
6 | 
7 | #include
8 | 
9 | #include
10 | 
11 | namespace util {
12 | 
13 | class Pool {
14 |   public:
15 |     Pool();
16 | 
17 |     ~Pool();
18 | 
19 |     void *Allocate(std::size_t size) {
20 |       void *ret = current_;
21 |       current_ += size;
22 |       if (current_ < current_end_) {
23 |         return ret;
24 |       } else {
25 |         return More(size);
26 |       }
27 |     }
28 | 
29 |     void FreeAll();
30 | 
31 |   private:
32 |     void *More(std::size_t size);
33 | 
34 |     std::vector<void *> free_list_;
35 | 
36 |     uint8_t *current_, *current_end_;
37 | 
38 |     // no copying
39 |     Pool(const Pool &);
40 |     Pool &operator=(const Pool &);
41 | };
42 | 
43 | } // namespace util
44 | 
45 | #endif // UTIL_POOL_H
46 | 
--------------------------------------------------------------------------------
/kenlm/include/util/read_compressed.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_READ_COMPRESSED_H
2 | #define UTIL_READ_COMPRESSED_H
3 | 
4 | #include "util/exception.hh"
5 | #include "util/scoped.hh"
6 | 
7 | #include
8 | 
9 | #include
10 | 
11 | namespace util {
12 | 
13 | class CompressedException : public Exception {
14 |   public:
15 |     CompressedException() throw();
16 |     virtual ~CompressedException() throw();
17 | };
18 | 
19 | class GZException : public CompressedException {
20 |   public:
21 |     GZException() throw();
22 |     ~GZException() throw();
23 | };
24 | 
25 | class BZException : public CompressedException {
26 |   public:
27 |     BZException() throw();
28 |     ~BZException() throw();
29 | };
30 | 
31 | class XZException : public CompressedException {
32 |   public:
33 |     XZException() throw();
34 |     ~XZException() throw();
35 | };
36 | 
37 | class ReadBase;
38 | 
39 | class ReadCompressed {
40 |   public:
41 |     static const std::size_t kMagicSize = 6;
42 |     // Must have at least kMagicSize bytes.
43 |     static bool DetectCompressedMagic(const void *from);
44 | 
45 |     // Takes ownership of fd.
46 |     explicit ReadCompressed(int fd);
47 | 
48 |     // Try to avoid using this. Use the fd instead.
49 |     // There is no decompression support for istreams.
50 |     explicit ReadCompressed(std::istream &in);
51 | 
52 |     // Must call Reset later.
53 |     ReadCompressed();
54 | 
55 |     ~ReadCompressed();
56 | 
57 |     // Takes ownership of fd.
58 |     void Reset(int fd);
59 | 
60 |     // Same advice as the constructor.
61 |     void Reset(std::istream &in);
62 | 
63 |     std::size_t Read(void *to, std::size_t amount);
64 | 
65 |     // Repeatedly call read to fill a buffer unless EOF is hit.
66 |     // Return number of bytes read.
67 |     std::size_t ReadOrEOF(void *const to, std::size_t amount);
68 | 
69 |     uint64_t RawAmount() const { return raw_amount_; }
70 | 
71 |   private:
72 |     friend class ReadBase;
73 | 
74 |     scoped_ptr<ReadBase> internal_;
75 | 
76 |     uint64_t raw_amount_;
77 | 
78 |     // No copying.
79 |     ReadCompressed(const ReadCompressed &);
80 |     void operator=(const ReadCompressed &);
81 | };
82 | 
83 | } // namespace util
84 | 
85 | #endif // UTIL_READ_COMPRESSED_H
86 | 
--------------------------------------------------------------------------------
/kenlm/include/util/stream/config.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_CONFIG_H
2 | #define UTIL_STREAM_CONFIG_H
3 | 
4 | #include
5 | #include
6 | 
7 | namespace util { namespace stream {
8 | 
9 | /**
10 |  * Represents how a chain should be configured.
11 |  */
12 | struct ChainConfig {
13 | 
14 |   /** Constructs a configuration with underspecified (or default) parameters.
*/ 15 | ChainConfig() {} 16 | 17 | /** 18 | * Constructs a chain configuration object. 19 | * 20 | * @param [in] in_entry_size Number of bytes in each record. 21 | * @param [in] in_block_count Number of blocks in the chain. 22 | * @param [in] in_total_memory Total number of bytes available to the chain. 23 | * This value will be divided amongst the blocks in the chain. 24 | */ 25 | ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory) 26 | : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {} 27 | 28 | /** 29 | * Number of bytes in each record. 30 | */ 31 | std::size_t entry_size; 32 | 33 | /** 34 | * Number of blocks in the chain. 35 | */ 36 | std::size_t block_count; 37 | 38 | /** 39 | * Total number of bytes available to the chain. 40 | * This value will be divided amongst the blocks in the chain. 41 | * Chain's constructor will make this a multiple of entry_size. 42 | */ 43 | std::size_t total_memory; 44 | }; 45 | 46 | 47 | /** 48 | * Represents how a sorter should be configured. 49 | */ 50 | struct SortConfig { 51 | 52 | /** Filename prefix where temporary files should be placed. */ 53 | std::string temp_prefix; 54 | 55 | /** Size of each input/output buffer. */ 56 | std::size_t buffer_size; 57 | 58 | /** Total memory to use when running alone. */ 59 | std::size_t total_memory; 60 | }; 61 | 62 | }} // namespaces 63 | #endif // UTIL_STREAM_CONFIG_H 64 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/io.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_IO_H 2 | #define UTIL_STREAM_IO_H 3 | 4 | #include "util/exception.hh" 5 | #include "util/file.hh" 6 | 7 | namespace util { 8 | namespace stream { 9 | 10 | class ChainPosition; 11 | 12 | class ReadSizeException : public util::Exception { 13 | public: 14 | ReadSizeException() throw(); 15 | ~ReadSizeException() throw(); 16 | }; 17 | 18 | class Read { 19 | public: 20 | explicit Read(int fd) : file_(fd) {} 21 | void Run(const ChainPosition &position); 22 | private: 23 | int file_; 24 | }; 25 | 26 | // Like read but uses pread so that the file can be accessed from multiple threads. 27 | class PRead { 28 | public: 29 | explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {} 30 | void Run(const ChainPosition &position); 31 | private: 32 | int file_; 33 | bool own_; 34 | }; 35 | 36 | class Write { 37 | public: 38 | explicit Write(int fd) : file_(fd) {} 39 | void Run(const ChainPosition &position); 40 | private: 41 | int file_; 42 | }; 43 | 44 | // It's a common case that stuff is written and then recycled. So rather than 45 | // spawn another thread to Recycle, this combines the two roles. 46 | class WriteAndRecycle { 47 | public: 48 | explicit WriteAndRecycle(int fd) : file_(fd) {} 49 | void Run(const ChainPosition &position); 50 | private: 51 | int file_; 52 | }; 53 | 54 | class PWriteAndRecycle { 55 | public: 56 | explicit PWriteAndRecycle(int fd) : file_(fd) {} 57 | void Run(const ChainPosition &position); 58 | private: 59 | int file_; 60 | }; 61 | 62 | 63 | // Reuse the same file over and over again to buffer output. 
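A sketch of the intended round trip through the FileBuffer class that follows; the two chains and the util::MakeTemp temporary file are assumptions borrowed from elsewhere in kenlm, so treat this as illustrative rather than canonical:

```cpp
util::stream::FileBuffer buffer(util::MakeTemp("kenlm_tmp"));
// Writing: Sink() seeks the descriptor back to offset 0 and returns a worker
// that writes out and recycles each block as it arrives.
producing_chain >> buffer.Sink();
// Reading it back later: Source() returns a PRead over the same descriptor.
consuming_chain >> buffer.Source();
```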
64 | class FileBuffer { 65 | public: 66 | explicit FileBuffer(int fd) : file_(fd) {} 67 | 68 | PWriteAndRecycle Sink() const { 69 | util::SeekOrThrow(file_.get(), 0); 70 | return PWriteAndRecycle(file_.get()); 71 | } 72 | 73 | PRead Source() const { 74 | return PRead(file_.get()); 75 | } 76 | 77 | uint64_t Size() const { 78 | return SizeOrThrow(file_.get()); 79 | } 80 | 81 | private: 82 | scoped_fd file_; 83 | }; 84 | 85 | } // namespace stream 86 | } // namespace util 87 | #endif // UTIL_STREAM_IO_H 88 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/line_input.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_LINE_INPUT_H 2 | #define UTIL_STREAM_LINE_INPUT_H 3 | namespace util {namespace stream { 4 | 5 | class ChainPosition; 6 | 7 | /* Worker that reads input into blocks, ensuring that blocks contain whole 8 | * lines. Assumes that the maximum size of a line is less than the block size. 9 | */ 10 | class LineInput { 11 | public: 12 | // Takes ownership upon thread execution. 13 | explicit LineInput(int fd); 14 | 15 | void Run(const ChainPosition &position); 16 | 17 | private: 18 | int fd_; 19 | }; 20 | 21 | }} // namespaces 22 | #endif // UTIL_STREAM_LINE_INPUT_H 23 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/multi_progress.hh: -------------------------------------------------------------------------------- 1 | /* Progress bar suitable for chains of workers */ 2 | #ifndef UTIL_STREAM_MULTI_PROGRESS_H 3 | #define UTIL_STREAM_MULTI_PROGRESS_H 4 | 5 | #include <boost/thread/mutex.hpp> 6 | 7 | #include <cstddef> 8 | 9 | #include <stdint.h> 10 | 11 | namespace util { namespace stream { 12 | 13 | class WorkerProgress; 14 | 15 | class MultiProgress { 16 | public: 17 | static const unsigned char kWidth = 100; 18 | 19 | MultiProgress(); 20 | 21 | ~MultiProgress(); 22 | 23 | // Turns on showing (requires SetTarget too). 24 | void Activate(); 25 | 26 | void SetTarget(uint64_t complete); 27 | 28 | WorkerProgress Add(); 29 | 30 | void Finished(); 31 | 32 | private: 33 | friend class WorkerProgress; 34 | void Milestone(WorkerProgress &worker); 35 | 36 | bool active_; 37 | 38 | uint64_t complete_; 39 | 40 | boost::mutex mutex_; 41 | 42 | // \0 at the end. 43 | char display_[kWidth + 1]; 44 | 45 | std::size_t character_handout_; 46 | 47 | MultiProgress(const MultiProgress &); 48 | MultiProgress &operator=(const MultiProgress &); 49 | }; 50 | 51 | class WorkerProgress { 52 | public: 53 | // Default constructor; initialize with operator= later. 54 | WorkerProgress() : parent_(NULL) {} 55 | 56 | // Not threadsafe for the same worker by default. 57 | WorkerProgress &operator++() { 58 | if (++current_ >= next_) { 59 | parent_->Milestone(*this); 60 | } 61 | return *this; 62 | } 63 | 64 | WorkerProgress &operator+=(uint64_t amount) { 65 | current_ += amount; 66 | if (current_ >= next_) { 67 | parent_->Milestone(*this); 68 | } 69 | return *this; 70 | } 71 | 72 | private: 73 | friend class MultiProgress; 74 | WorkerProgress(uint64_t next, MultiProgress &parent, char character) 75 | : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {} 76 | 77 | uint64_t current_, next_; 78 | 79 | MultiProgress *parent_; 80 | 81 | // Previous milestone reached. 82 | unsigned char stone_; 83 | 84 | // Character to display in bar.
85 | char character_; 86 | }; 87 | 88 | }} // namespaces 89 | 90 | #endif // UTIL_STREAM_MULTI_PROGRESS_H 91 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_STREAM_H 2 | #define UTIL_STREAM_STREAM_H 3 | 4 | #include "util/stream/chain.hh" 5 | 6 | #include <boost/noncopyable.hpp> 7 | 8 | #include <assert.h> 9 | #include <stdint.h> 10 | 11 | namespace util { 12 | namespace stream { 13 | 14 | class Stream : boost::noncopyable { 15 | public: 16 | Stream() : current_(NULL), end_(NULL) {} 17 | 18 | void Init(const ChainPosition &position) { 19 | entry_size_ = position.GetChain().EntrySize(); 20 | block_size_ = position.GetChain().BlockSize(); 21 | block_it_.Init(position); 22 | StartBlock(); 23 | } 24 | 25 | explicit Stream(const ChainPosition &position) { 26 | Init(position); 27 | } 28 | 29 | operator bool() const { return current_ != NULL; } 30 | bool operator!() const { return current_ == NULL; } 31 | 32 | const void *Get() const { return current_; } 33 | void *Get() { return current_; } 34 | 35 | void Poison() { 36 | block_it_->SetValidSize(current_ - static_cast<uint8_t*>(block_it_->Get())); 37 | ++block_it_; 38 | block_it_.Poison(); 39 | } 40 | 41 | Stream &operator++() { 42 | assert(*this); 43 | assert(current_ < end_); 44 | current_ += entry_size_; 45 | if (current_ == end_) { 46 | ++block_it_; 47 | StartBlock(); 48 | } 49 | return *this; 50 | } 51 | 52 | private: 53 | void StartBlock() { 54 | for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {} 55 | current_ = static_cast<uint8_t*>(block_it_->Get()); 56 | end_ = current_ + block_it_->ValidSize(); 57 | } 58 | 59 | // The following are pointers to raw memory 60 | // current_ is the current record 61 | // end_ is the end of the block (so we know when to move to the next block) 62 | uint8_t *current_, *end_; 63 | 64 | std::size_t entry_size_; 65 | std::size_t block_size_; 66 | 67 | Link block_it_; 68 | }; 69 | 70 | inline Chain &operator>>(Chain &chain, Stream &stream) { 71 | stream.Init(chain.Add()); 72 | return chain; 73 | } 74 | 75 | } // namespace stream 76 | } // namespace util 77 | #endif // UTIL_STREAM_STREAM_H 78 | -------------------------------------------------------------------------------- /kenlm/include/util/stream/timer.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STREAM_TIMER_H 2 | #define UTIL_STREAM_TIMER_H 3 | 4 | // Sorry Jon, this was adding library dependencies in Moses and people complained. 5 | 6 | /*#include <boost/version.hpp> 7 | 8 | #if BOOST_VERSION >= 104800 9 | #include <boost/timer/timer.hpp> 10 | #define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) 11 | #else 12 | //#warning Using Boost older than 1.48.
Timing information will not be available.*/ 13 | #define UTIL_TIMER(str) 14 | //#endif 15 | 16 | #endif // UTIL_STREAM_TIMER_H 17 | -------------------------------------------------------------------------------- /kenlm/include/util/string_piece_hash.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_STRING_PIECE_HASH_H 2 | #define UTIL_STRING_PIECE_HASH_H 3 | 4 | #include "util/string_piece.hh" 5 | 6 | #include <boost/functional/hash.hpp> 7 | #include <boost/version.hpp> 8 | 9 | inline size_t hash_value(const StringPiece &str) { 10 | return boost::hash_range(str.data(), str.data() + str.length()); 11 | } 12 | 13 | /* Support for lookup of StringPiece in boost::unordered_map */ 14 | struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> { 15 | size_t operator()(const StringPiece &str) const { 16 | return hash_value(str); 17 | } 18 | }; 19 | 20 | struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const StringPiece &, bool> { 21 | bool operator()(const StringPiece &first, const StringPiece &second) const { 22 | return first == second; 23 | } 24 | }; 25 | template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { 26 | #if BOOST_VERSION < 104200 27 | std::string temp(key.data(), key.size()); 28 | return t.find(temp); 29 | #else 30 | return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); 31 | #endif 32 | } 33 | 34 | template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) { 35 | #if BOOST_VERSION < 104200 36 | std::string temp(key.data(), key.size()); 37 | return t.find(temp); 38 | #else 39 | return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); 40 | #endif 41 | } 42 | 43 | #endif // UTIL_STRING_PIECE_HASH_H 44 | -------------------------------------------------------------------------------- /kenlm/include/util/unistd.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_UNISTD_H 2 | #define UTIL_UNISTD_H 3 | 4 | #if defined(_WIN32) || defined(_WIN64) 5 | 6 | // Windows doesn't define <unistd.h> 7 | // 8 | // So we define what we need here instead: 9 | // 10 | #define STDIN_FILENO 0 11 | #define STDOUT_FILENO 1 12 | 13 | 14 | #else // Huzzah for POSIX! 15 | 16 | #include <unistd.h> 17 | 18 | #endif 19 | 20 | 21 | 22 | #endif // UTIL_UNISTD_H 23 | -------------------------------------------------------------------------------- /kenlm/include/util/usage.hh: -------------------------------------------------------------------------------- 1 | #ifndef UTIL_USAGE_H 2 | #define UTIL_USAGE_H 3 | #include <cstddef> 4 | #include <iosfwd> 5 | #include <string> 6 | 7 | #include <stdint.h> 8 | 9 | namespace util { 10 | // Time in seconds since process started. Zero on unsupported platforms. 11 | double WallTime(); 12 | 13 | void PrintUsage(std::ostream &to); 14 | 15 | // Determine how much physical memory there is. Return 0 on failure. 16 | uint64_t GuessPhysicalMemory(); 17 | 18 | // Parse a size like unix sort. Sadly, this means the default multiplier is K.
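Illustrative values for the parser declared below, assuming the sort(1) conventions the comment refers to; the percentage form scaling GuessPhysicalMemory() is an assumption based on how lmplz passes defaults like "80%":

```cpp
// util::ParseSize("10k")  == 10 * 1024ULL
// util::ParseSize("1G")   == 1ULL << 30
// util::ParseSize("500")  == 500 * 1024ULL   // bare numbers default to K
// util::ParseSize("80%")  // 80% of physical memory, as in GNU sort -S
```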
19 | uint64_t ParseSize(const std::string &arg); 20 | } // namespace util 21 | #endif // UTIL_USAGE_H 22 | -------------------------------------------------------------------------------- /kenlm/lm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Explicitly list the source files for this subdirectory 2 | # 3 | # If you add any source files to this subdirectory 4 | # that should be included in the kenlm library, 5 | # (this excludes any unit test files) 6 | # you should add them to the following list: 7 | set(KENLM_LM_SOURCE 8 | bhiksha.cc 9 | binary_format.cc 10 | config.cc 11 | lm_exception.cc 12 | model.cc 13 | quantize.cc 14 | read_arpa.cc 15 | search_hashed.cc 16 | search_trie.cc 17 | sizes.cc 18 | trie.cc 19 | trie_sort.cc 20 | value_build.cc 21 | virtual_interface.cc 22 | vocab.cc 23 | ) 24 | 25 | 26 | # Group these objects together for later use. 27 | # 28 | # Given add_library(foo OBJECT ${my_foo_sources}), 29 | # refer to these objects as $<TARGET_OBJECTS:foo> 30 | # 31 | add_subdirectory(common) 32 | 33 | if (NOT MSVC) 34 | set(THREADS pthread) 35 | endif() 36 | 37 | add_library(kenlm ${KENLM_LM_SOURCE} ${KENLM_LM_COMMON_SOURCE}) 38 | target_link_libraries(kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 39 | 40 | set(KENLM_MAX_ORDER 6 CACHE STRING "Maximum supported ngram order") 41 | target_compile_definitions(kenlm PUBLIC -DKENLM_MAX_ORDER=${KENLM_MAX_ORDER}) 42 | 43 | # This directory has children that need to be processed 44 | add_subdirectory(builder) 45 | add_subdirectory(filter) 46 | add_subdirectory(interpolate) 47 | 48 | # Explicitly list the executable files to be compiled 49 | set(EXE_LIST 50 | query 51 | fragment 52 | build_binary 53 | kenlm_benchmark 54 | ) 55 | 56 | set(LM_LIBS kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 57 | 58 | AddExes(EXES ${EXE_LIST} 59 | LIBRARIES ${LM_LIBS}) 60 | 61 | if(BUILD_TESTING) 62 | 63 | set(KENLM_BOOST_TESTS_LIST left_test partial_test) 64 | AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} 65 | LIBRARIES ${LM_LIBS} 66 | TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa) 67 | 68 | # model_test requires an extra command line parameter 69 | KenLMAddTest(TEST model_test 70 | LIBRARIES ${LM_LIBS} 71 | TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test.arpa 72 | ${CMAKE_CURRENT_SOURCE_DIR}/test_nounk.arpa) 73 | endif() 74 | -------------------------------------------------------------------------------- /kenlm/lm/blank.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BLANK_H 2 | #define LM_BLANK_H 3 | 4 | #include <limits.h> 5 | #include <math.h> 6 | #include <stdint.h> 7 | 8 | namespace lm { 9 | namespace ngram { 10 | 11 | /* Suppose "foo bar" appears with zero backoff but there is no trigram 12 | * beginning with these words. Then, when scoring "foo bar", the model could 13 | * return out_state containing "bar" or even null context if "bar" also has no 14 | * backoff and is never followed by another word. Then the backoff is set to 15 | * kNoExtensionBackoff. If the n-gram might be extended, then out_state must 16 | * contain the full n-gram, in which case kExtensionBackoff is set. In any 17 | * case, if an n-gram has non-zero backoff, the full state is returned so 18 | * backoff can be properly charged. 19 | * These differ only in sign bit because the backoff is in fact zero in either 20 | * case.
21 | */ 22 | const float kNoExtensionBackoff = -0.0; 23 | const float kExtensionBackoff = 0.0; 24 | const uint64_t kNoExtensionQuant = 0; 25 | const uint64_t kExtensionQuant = 1; 26 | 27 | inline void SetExtension(float &backoff) { 28 | if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; 29 | } 30 | 31 | // This compiles down nicely. 32 | inline bool HasExtension(const float &backoff) { 33 | typedef union { float f; uint32_t i; } UnionValue; 34 | UnionValue compare, interpret; 35 | compare.f = kNoExtensionBackoff; 36 | interpret.f = backoff; 37 | return compare.i != interpret.i; 38 | } 39 | 40 | } // namespace ngram 41 | } // namespace lm 42 | #endif // LM_BLANK_H 43 | -------------------------------------------------------------------------------- /kenlm/lm/builder/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This CMake file was created by Lane Schwartz 2 | 3 | # Explicitly list the source files for this subdirectory 4 | # 5 | # If you add any source files to this subdirectory 6 | # that should be included in the kenlm library, 7 | # (this excludes any unit test files) 8 | # you should add them to the following list: 9 | # 10 | # In order to set correct paths to these files 11 | # in case this variable is referenced by CMake files in the parent directory, 12 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 13 | # 14 | set(KENLM_BUILDER_SOURCE 15 | ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc 16 | ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc 17 | ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc 18 | ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc 19 | ${CMAKE_CURRENT_SOURCE_DIR}/output.cc 20 | ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc 21 | ) 22 | 23 | 24 | # Group these objects together for later use. 25 | # 26 | # Given add_library(foo OBJECT ${my_foo_sources}), 27 | # refer to these objects as $<TARGET_OBJECTS:foo> 28 | # 29 | add_library(kenlm_builder ${KENLM_BUILDER_SOURCE}) 30 | 31 | if (NOT MSVC) 32 | set(THREADS pthread) 33 | endif() 34 | 35 | AddExes(EXES lmplz 36 | LIBRARIES kenlm_builder kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 37 | AddExes(EXES count_ngrams 38 | LIBRARIES kenlm_builder kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 39 | 40 | if(BUILD_TESTING) 41 | 42 | # Explicitly list the Boost test files to be compiled 43 | set(KENLM_BOOST_TESTS_LIST 44 | adjust_counts_test 45 | corpus_count_test 46 | ) 47 | 48 | AddTests(TESTS ${KENLM_BOOST_TESTS_LIST} 49 | LIBRARIES kenlm_builder kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS}) 50 | endif() 51 | -------------------------------------------------------------------------------- /kenlm/lm/builder/README.md: -------------------------------------------------------------------------------- 1 | Dependencies 2 | ============ 3 | 4 | Boost >= 1.42.0 is required. 5 | 6 | For Ubuntu, 7 | ```bash 8 | sudo apt-get install libboost1.48-all-dev 9 | ``` 10 | 11 | Alternatively, you can download, compile, and install it yourself: 12 | 13 | ```bash 14 | wget http://sourceforge.net/projects/boost/files/boost/1.52.0/boost_1_52_0.tar.gz/download -O boost_1_52_0.tar.gz 15 | tar -xvzf boost_1_52_0.tar.gz 16 | cd boost_1_52_0 17 | ./bootstrap.sh 18 | ./b2 19 | sudo ./b2 install 20 | ``` 21 | 22 | Local install options (in a user-space prefix directory) are also possible. See http://www.boost.org/doc/libs/1_52_0/doc/html/bbv2/installation.html.
23 | 24 | 25 | Building 26 | ======== 27 | 28 | ```bash 29 | bjam 30 | ``` 31 | Your distribution might package bjam and boost-build separately from Boost. Both are required. 32 | 33 | Usage 34 | ===== 35 | 36 | Run 37 | ```bash 38 | $ bin/lmplz 39 | ``` 40 | to see command line arguments. 41 | 42 | Running 43 | ======= 44 | 45 | ```bash 46 | bin/lmplz -o 5 <text >text.arpa 47 | ``` 48 | -------------------------------------------------------------------------------- /kenlm/lm/builder/TODO: -------------------------------------------------------------------------------- 1 | More tests! 2 | Sharding. 3 | Some way to manage all the crazy config options. 4 | Option to build the binary file directly. 5 | Interpolation of different orders. 6 | -------------------------------------------------------------------------------- /kenlm/lm/builder/adjust_counts.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_ADJUST_COUNTS_H 2 | #define LM_BUILDER_ADJUST_COUNTS_H 3 | 4 | #include "lm/builder/discount.hh" 5 | #include "lm/lm_exception.hh" 6 | #include "util/exception.hh" 7 | 8 | #include <vector> 9 | 10 | #include <stdint.h> 11 | 12 | namespace util { namespace stream { class ChainPositions; } } 13 | 14 | namespace lm { 15 | namespace builder { 16 | 17 | class BadDiscountException : public util::Exception { 18 | public: 19 | BadDiscountException() throw(); 20 | ~BadDiscountException() throw(); 21 | }; 22 | 23 | struct DiscountConfig { 24 | // Overrides discounts for orders [1,discount_override.size()]. 25 | std::vector<Discount> overwrite; 26 | // If discounting fails for an order, copy them from here. 27 | Discount fallback; 28 | // What to do when discounts are out of range or would trigger division by 29 | // zero. If it does something other than THROW_UP, use fallback_discount. 30 | WarningAction bad_action; 31 | }; 32 | 33 | /* Compute adjusted counts. 34 | * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts. 35 | * Output: [1,N]-grams with adjusted counts. 36 | * [1,N)-grams are in suffix order 37 | * N-grams are in undefined order (they're going to be sorted anyway). 38 | */ 39 | class AdjustCounts { 40 | public: 41 | // counts: output 42 | // counts_pruned: output 43 | // discounts: mostly output. If the input already has entries, they will be kept. 44 | // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
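Stepping back to the DiscountConfig above, a hypothetical setup for concreteness; the numeric values are invented, and the WarningAction enum comes from lm/lm_exception.hh:

```cpp
lm::builder::DiscountConfig config;
config.overwrite.resize(1);            // explicit discounts for order 1 only
config.overwrite[0].amount[0] = 0.0f;  // slot for adjusted count 0 (unused)
config.overwrite[0].amount[1] = 0.5f;  // subtracted from adjusted count 1
config.overwrite[0].amount[2] = 1.0f;  // subtracted from adjusted count 2
config.overwrite[0].amount[3] = 1.5f;  // subtracted from counts 3 and above
config.fallback = config.overwrite[0]; // used where estimation fails
config.bad_action = lm::COMPLAIN;      // warn instead of throwing
```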
45 | AdjustCounts( 46 | const std::vector<uint64_t> &prune_thresholds, 47 | std::vector<uint64_t> &counts, 48 | std::vector<uint64_t> &counts_pruned, 49 | const std::vector<bool> &prune_words, 50 | const DiscountConfig &discount_config, 51 | std::vector<Discount> &discounts) 52 | : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned), 53 | prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts) 54 | {} 55 | 56 | void Run(const util::stream::ChainPositions &positions); 57 | 58 | private: 59 | const std::vector<uint64_t> &prune_thresholds_; 60 | std::vector<uint64_t> &counts_; 61 | std::vector<uint64_t> &counts_pruned_; 62 | const std::vector<bool> &prune_words_; 63 | 64 | DiscountConfig discount_config_; 65 | std::vector<Discount> &discounts_; 66 | }; 67 | 68 | } // namespace builder 69 | } // namespace lm 70 | 71 | #endif // LM_BUILDER_ADJUST_COUNTS_H 72 | 73 | -------------------------------------------------------------------------------- /kenlm/lm/builder/combine_counts.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_COMBINE_COUNTS_H 2 | #define LM_BUILDER_COMBINE_COUNTS_H 3 | 4 | #include "lm/builder/payload.hh" 5 | #include "lm/common/ngram.hh" 6 | #include "lm/common/compare.hh" 7 | #include "lm/word_index.hh" 8 | #include "util/stream/sort.hh" 9 | 10 | #include <functional> 11 | #include <string.h> 12 | 13 | namespace lm { 14 | namespace builder { 15 | 16 | // Sum counts for the same n-gram. 17 | struct CombineCounts { 18 | bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const { 19 | NGram<BuildingPayload> first(first_void, compare.Order()); 20 | // There isn't a const version of NGram. 21 | NGram<BuildingPayload> second(const_cast<void *>(second_void), compare.Order()); 22 | if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false; 23 | first.Value().count += second.Value().count; 24 | return true; 25 | } 26 | }; 27 | 28 | } // namespace builder 29 | } // namespace lm 30 | 31 | #endif // LM_BUILDER_COMBINE_COUNTS_H 32 | -------------------------------------------------------------------------------- /kenlm/lm/builder/corpus_count.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_CORPUS_COUNT_H 2 | #define LM_BUILDER_CORPUS_COUNT_H 3 | 4 | #include "lm/lm_exception.hh" 5 | #include "lm/word_index.hh" 6 | #include "util/scoped.hh" 7 | 8 | #include <cstddef> 9 | #include <string> 10 | #include <stdint.h> 11 | #include <vector> 12 | 13 | namespace util { 14 | class FilePiece; 15 | namespace stream { 16 | class ChainPosition; 17 | } // namespace stream 18 | } // namespace util 19 | 20 | namespace lm { 21 | namespace builder { 22 | 23 | class CorpusCount { 24 | public: 25 | // Memory usage will be DedupeMultiplier(order) * block_size + total_chain_size + unknown vocab_hash_size 26 | static float DedupeMultiplier(std::size_t order); 27 | 28 | // How much memory vocabulary will use based on estimated size of the vocab. 29 | static std::size_t VocabUsage(std::size_t vocab_estimate); 30 | 31 | // token_count: out. 32 | // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
33 | CorpusCount(util::FilePiece &from, int vocab_write, bool dynamic_vocab, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol); 34 | 35 | void Run(const util::stream::ChainPosition &position); 36 | 37 | private: 38 | template <class Vocab> void RunWithVocab(const util::stream::ChainPosition &position, Vocab &vocab); 39 | 40 | util::FilePiece &from_; 41 | int vocab_write_; 42 | bool dynamic_vocab_; 43 | uint64_t &token_count_; 44 | WordIndex &type_count_; 45 | std::vector<bool>& prune_words_; 46 | const std::string& prune_vocab_filename_; 47 | 48 | std::size_t dedupe_mem_size_; 49 | util::scoped_malloc dedupe_mem_; 50 | 51 | WarningAction disallowed_symbol_action_; 52 | }; 53 | 54 | } // namespace builder 55 | } // namespace lm 56 | #endif // LM_BUILDER_CORPUS_COUNT_H 57 | -------------------------------------------------------------------------------- /kenlm/lm/builder/discount.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_DISCOUNT_H 2 | #define LM_BUILDER_DISCOUNT_H 3 | 4 | #include <algorithm> 5 | 6 | #include <stdint.h> 7 | 8 | namespace lm { 9 | namespace builder { 10 | 11 | struct Discount { 12 | float amount[4]; 13 | 14 | float Get(uint64_t count) const { 15 | return amount[std::min<uint64_t>(count, 3)]; 16 | } 17 | 18 | float Apply(uint64_t count) const { 19 | return static_cast<float>(count) - Get(count); 20 | } 21 | }; 22 | 23 | } // namespace builder 24 | } // namespace lm 25 | 26 | #endif // LM_BUILDER_DISCOUNT_H 27 | -------------------------------------------------------------------------------- /kenlm/lm/builder/dump_counts_main.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/print.hh" 2 | #include "lm/word_index.hh" 3 | #include "util/file.hh" 4 | #include "util/read_compressed.hh" 5 | 6 | #include <boost/lexical_cast.hpp> 7 | 8 | #include <iostream> 9 | #include <vector> 10 | 11 | int main(int argc, char *argv[]) { 12 | if (argc != 4) { 13 | std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n" 14 | "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n" 15 | "counts. Each record has order many vocabulary ids.\n" 16 | "The vocabulary file contains the words delimited by NULL in order of id.\n" 17 | "The vocabulary file may not be compressed because it is mmapped but the counts\n" 18 | "file can be compressed.\n"; 19 | return 1; 20 | } 21 | util::ReadCompressed counts(util::OpenReadOrThrow(argv[1])); 22 | util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2])); 23 | lm::VocabReconstitute vocab(vocab_file.get()); 24 | unsigned int order = boost::lexical_cast<unsigned int>(argv[3]); 25 | std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t)); 26 | while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) { 27 | UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size()); 28 | const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin()); 29 | for (const lm::WordIndex *i = words; i != words + order; ++i) { 30 | UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?"); 31 | std::cout << vocab.Lookup(*i) << ' '; 32 | } 33 | // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
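// (Each record is `order` 4-byte vocabulary ids followed by one 8-byte count,
// so the count sits immediately after words + order.)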
34 | std::cout << *reinterpret_cast<const uint64_t *>(words + order) << '\n'; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /kenlm/lm/builder/hash_gamma.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HASH_GAMMA__ 2 | #define LM_BUILDER_HASH_GAMMA__ 3 | 4 | #include <stdint.h> 5 | 6 | namespace lm { namespace builder { 7 | 8 | #pragma pack(push) 9 | #pragma pack(4) 10 | 11 | struct HashGamma { 12 | uint64_t hash_value; 13 | float gamma; 14 | }; 15 | 16 | #pragma pack(pop) 17 | 18 | }} // namespaces 19 | #endif // LM_BUILDER_HASH_GAMMA__ 20 | -------------------------------------------------------------------------------- /kenlm/lm/builder/header_info.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_HEADER_INFO_H 2 | #define LM_BUILDER_HEADER_INFO_H 3 | 4 | #include <string> 5 | #include <vector> 6 | #include <stdint.h> 7 | 8 | namespace lm { namespace builder { 9 | 10 | // Some configuration info that is used to add 11 | // comments to the beginning of an ARPA file 12 | struct HeaderInfo { 13 | std::string input_file; 14 | uint64_t token_count; 15 | std::vector<uint64_t> counts_pruned; 16 | 17 | HeaderInfo() {} 18 | 19 | HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector<uint64_t> &counts_pruned_in) 20 | : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {} 21 | 22 | // TODO: Add smoothing type 23 | // TODO: More info if multiple models were interpolated 24 | }; 25 | 26 | }} // namespaces 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /kenlm/lm/builder/initial_probabilities.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INITIAL_PROBABILITIES_H 2 | #define LM_BUILDER_INITIAL_PROBABILITIES_H 3 | 4 | #include "lm/builder/discount.hh" 5 | #include "lm/word_index.hh" 6 | #include "util/stream/config.hh" 7 | 8 | #include <vector> 9 | 10 | namespace util { namespace stream { class Chains; } } 11 | 12 | namespace lm { 13 | class SpecialVocab; 14 | namespace builder { 15 | 16 | struct InitialProbabilitiesConfig { 17 | // These should be small buffers to keep the adder from getting too far ahead 18 | util::stream::ChainConfig adder_in; 19 | util::stream::ChainConfig adder_out; 20 | // SRILM doesn't normally interpolate unigrams. 21 | bool interpolate_unigrams; 22 | }; 23 | 24 | /* Compute initial (uninterpolated) probabilities 25 | * primary: the normal chain of n-grams. Incoming is context sorted adjusted 26 | * counts. Outgoing has uninterpolated probabilities for use by Interpolate. 27 | * second_in: a second copy of the primary input. Discard the output. 28 | * gamma_out: Computed gamma values are output on these chains in suffix order. 29 | * The values are bare floats and should be buffered for interpolation to 30 | * use.
31 | */ 32 | void InitialProbabilities( 33 | const InitialProbabilitiesConfig &config, 34 | const std::vector<Discount> &discounts, 35 | util::stream::Chains &primary, 36 | util::stream::Chains &second_in, 37 | util::stream::Chains &gamma_out, 38 | const std::vector<uint64_t> &prune_thresholds, 39 | bool prune_vocab, 40 | const SpecialVocab &vocab); 41 | 42 | } // namespace builder 43 | } // namespace lm 44 | 45 | #endif // LM_BUILDER_INITIAL_PROBABILITIES_H 46 | -------------------------------------------------------------------------------- /kenlm/lm/builder/interpolate.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_INTERPOLATE_H 2 | #define LM_BUILDER_INTERPOLATE_H 3 | 4 | #include "lm/common/special.hh" 5 | #include "lm/word_index.hh" 6 | #include "util/stream/multi_stream.hh" 7 | 8 | #include <vector> 9 | 10 | #include <stdint.h> 11 | 12 | namespace lm { namespace builder { 13 | 14 | /* Interpolate step. 15 | * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from 16 | * InitialProbabilities. 17 | * Output: suffix sorted n-grams with complete probability 18 | */ 19 | class Interpolate { 20 | public: 21 | // Normally vocab_size is the unigram count-1 (since p(<unk>) = 0) but might 22 | // be larger when the user specifies a consistent vocabulary size. 23 | explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab, bool output_q, const SpecialVocab &specials); 24 | 25 | void Run(const util::stream::ChainPositions &positions); 26 | 27 | private: 28 | float uniform_prob_; 29 | util::stream::ChainPositions backoffs_; 30 | const std::vector<uint64_t> prune_thresholds_; 31 | bool prune_vocab_; 32 | bool output_q_; 33 | const SpecialVocab specials_; 34 | }; 35 | 36 | }} // namespaces 37 | #endif // LM_BUILDER_INTERPOLATE_H 38 | -------------------------------------------------------------------------------- /kenlm/lm/builder/output.cc: -------------------------------------------------------------------------------- 1 | #include "lm/builder/output.hh" 2 | 3 | #include "lm/common/model_buffer.hh" 4 | #include "lm/common/print.hh" 5 | #include "util/file_stream.hh" 6 | #include "util/stream/multi_stream.hh" 7 | 8 | #include <iostream> 9 | 10 | namespace lm { namespace builder { 11 | 12 | OutputHook::~OutputHook() {} 13 | 14 | Output::Output(StringPiece file_base, bool keep_buffer, bool output_q) 15 | : buffer_(file_base, keep_buffer, output_q) {} 16 | 17 | void Output::SinkProbs(util::stream::Chains &chains) { 18 | Apply(PROB_PARALLEL_HOOK, chains); 19 | if (!buffer_.Keep() && !Have(PROB_SEQUENTIAL_HOOK)) { 20 | chains >> util::stream::kRecycle; 21 | chains.Wait(true); 22 | return; 23 | } 24 | buffer_.Sink(chains, header_.counts_pruned); 25 | chains >> util::stream::kRecycle; 26 | chains.Wait(false); 27 | if (Have(PROB_SEQUENTIAL_HOOK)) { 28 | std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl; 29 | buffer_.Source(chains); 30 | Apply(PROB_SEQUENTIAL_HOOK, chains); 31 | chains >> util::stream::kRecycle; 32 | chains.Wait(true); 33 | } 34 | } 35 | 36 | void Output::Apply(HookType hook_type, util::stream::Chains &chains) { 37 | for (boost::ptr_vector<OutputHook>::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) { 38 | entry->Sink(header_, VocabFile(), chains); 39 | } 40 | } 41 | 42 | void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) { 43 | if (verbose_header_) { 44 | util::FileStream out(file_.get(), 50); 45 | out << "# Input file: "
<< info.input_file << '\n'; 46 | out << "# Token count: " << info.token_count << '\n'; 47 | out << "# Smoothing: Modified Kneser-Ney" << '\n'; 48 | } 49 | chains >> PrintARPA(vocab_file, file_.get(), info.counts_pruned); 50 | } 51 | 52 | }} // namespaces 53 | -------------------------------------------------------------------------------- /kenlm/lm/builder/payload.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_PAYLOAD_H 2 | #define LM_BUILDER_PAYLOAD_H 3 | 4 | #include "lm/weights.hh" 5 | #include "lm/word_index.hh" 6 | #include <stdint.h> 7 | 8 | namespace lm { namespace builder { 9 | 10 | struct Uninterpolated { 11 | float prob; // Uninterpolated probability. 12 | float gamma; // Interpolation weight for lower order. 13 | }; 14 | 15 | union BuildingPayload { 16 | uint64_t count; 17 | Uninterpolated uninterp; 18 | ProbBackoff complete; 19 | 20 | /*mjd**********************************************************************/ 21 | bool IsMarked() const { 22 | return count >> (sizeof(count) * 8 - 1); 23 | } 24 | 25 | void Mark() { 26 | count |= (1ULL << (sizeof(count) * 8 - 1)); 27 | } 28 | 29 | void Unmark() { 30 | count &= ~(1ULL << (sizeof(count) * 8 - 1)); 31 | } 32 | 33 | uint64_t UnmarkedCount() const { 34 | return count & ~(1ULL << (sizeof(count) * 8 - 1)); 35 | } 36 | 37 | uint64_t CutoffCount() const { 38 | return IsMarked() ? 0 : UnmarkedCount(); 39 | } 40 | /*mjd**********************************************************************/ 41 | }; 42 | 43 | const WordIndex kBOS = 1; 44 | const WordIndex kEOS = 2; 45 | 46 | }} // namespaces 47 | 48 | #endif // LM_BUILDER_PAYLOAD_H 49 | -------------------------------------------------------------------------------- /kenlm/lm/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This CMake file was created by Lane Schwartz 2 | 3 | # Explicitly list the source files for this subdirectory 4 | # 5 | # If you add any source files to this subdirectory 6 | # that should be included in the kenlm library, 7 | # (this excludes any unit test files) 8 | # you should add them to the following list: 9 | # 10 | # In order to set correct paths to these files 11 | # in case this variable is referenced by CMake files in the parent directory, 12 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}. 13 | # 14 | set(KENLM_LM_COMMON_SOURCE 15 | ${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc 16 | ${CMAKE_CURRENT_SOURCE_DIR}/print.cc 17 | ${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc 18 | ${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc 19 | PARENT_SCOPE) 20 | 21 | if(BUILD_TESTING) 22 | KenLMAddTest(TEST model_buffer_test 23 | LIBRARIES kenlm 24 | TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/test_data) 25 | endif() 26 | -------------------------------------------------------------------------------- /kenlm/lm/common/joint_order.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_JOINT_ORDER_H 2 | #define LM_COMMON_JOINT_ORDER_H 3 | 4 | #include "lm/common/ngram_stream.hh" 5 | #include "lm/lm_exception.hh" 6 | 7 | #ifdef DEBUG 8 | #include "util/fixed_array.hh" 9 | #include <iostream> 10 | #endif 11 | 12 | #include <string.h> 13 | 14 | namespace lm { 15 | 16 | template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) { 17 | // Allow matching to reference streams[-1]. 18 | util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1); 19 | // A bogus stream for [-1].
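// (When current == 0, the memcmp below compares zero bytes against this
// dummy, which trivially matches, so every unigram enters the callback.)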
20 | streams_with_dummy.push_back(); 21 | for (std::size_t i = 0; i < positions.size(); ++i) { 22 | streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1)); 23 | } 24 | ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1; 25 | 26 | std::size_t order; 27 | for (order = 0; order < positions.size() && streams[order]; ++order) {} 28 | assert(order); // should always have <unk>. 29 | 30 | // Debugging only: call comparison function to sanity check order. 31 | #ifdef DEBUG 32 | util::FixedArray<Compare> less_compare(order); 33 | for (unsigned i = 0; i < order; ++i) 34 | less_compare.push_back(i + 1); 35 | #endif // DEBUG 36 | 37 | std::size_t current = 0; 38 | while (true) { 39 | // Does the context match the lower one? 40 | if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) { 41 | callback.Enter(current, streams[current].Get()); 42 | // Transition to looking for extensions. 43 | if (++current < order) continue; 44 | } 45 | #ifdef DEBUG 46 | // match_check[current - 1] matches current-grams 47 | // The lower-order stream (which skips fewer current-grams) should always be <= the higher order-stream (which can skip current-grams). 48 | else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) { 49 | std::cerr << "Stream out of order detected" << std::endl; 50 | abort(); 51 | } 52 | #endif // DEBUG 53 | // No extension left. 54 | while(true) { 55 | assert(current > 0); 56 | --current; 57 | callback.Exit(current, streams[current].Get()); 58 | 59 | if (++streams[current]) break; 60 | 61 | UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix"); 62 | 63 | order = current; 64 | if (!order) return; 65 | } 66 | } 67 | } 68 | 69 | } // namespaces 70 | 71 | #endif // LM_COMMON_JOINT_ORDER_H 72 | -------------------------------------------------------------------------------- /kenlm/lm/common/model_buffer.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_MODEL_BUFFER_H 2 | #define LM_COMMON_MODEL_BUFFER_H 3 | 4 | /* Format with separate files in suffix order. Each file contains 5 | * n-grams of the same order. 6 | */ 7 | #include "lm/word_index.hh" 8 | #include "util/file.hh" 9 | #include "util/fixed_array.hh" 10 | #include "util/string_piece.hh" 11 | 12 | #include <string> 13 | #include <vector> 14 | 15 | namespace util { namespace stream { 16 | class Chains; 17 | class Chain; 18 | }} // namespaces 19 | 20 | namespace lm { 21 | 22 | namespace ngram { class State; } 23 | 24 | class ModelBuffer { 25 | public: 26 | // Construct for writing. Must call VocabFile() and fill it with null-delimited vocab words. 27 | ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q); 28 | 29 | // Load from file. 30 | explicit ModelBuffer(StringPiece file_base); 31 | 32 | // Must call VocabFile and populate before calling this function. 33 | void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts); 34 | 35 | // Read files and write to the given chains. If fewer chains are provided, 36 | // only do the lower orders. 37 | void Source(util::stream::Chains &chains); 38 | 39 | void Source(std::size_t order_minus_1, util::stream::Chain &chain); 40 | 41 | // The order of the n-gram model that is associated with the model buffer. 42 | std::size_t Order() const { return counts_.size(); } 43 | // Requires Sink or load from file.
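// (The assert below fires if neither has happened yet.)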
44 | const std::vector<uint64_t> &Counts() const { 45 | assert(!counts_.empty()); 46 | return counts_; 47 | } 48 | 49 | int VocabFile() const { return vocab_file_.get(); } 50 | 51 | int RawFile(std::size_t order_minus_1) const { 52 | return files_[order_minus_1].get(); 53 | } 54 | 55 | bool Keep() const { return keep_buffer_; } 56 | 57 | // Slowly execute a language model query with binary search. 58 | // This is used by interpolation to gather tuning probabilities rather than 59 | // scanning the files. 60 | float SlowQuery(const ngram::State &context, WordIndex word, ngram::State &out) const; 61 | 62 | private: 63 | const std::string file_base_; 64 | const bool keep_buffer_; 65 | bool output_q_; 66 | std::vector<uint64_t> counts_; 67 | 68 | util::scoped_fd vocab_file_; 69 | util::FixedArray<util::scoped_fd> files_; 70 | }; 71 | 72 | } // namespace lm 73 | 74 | #endif // LM_COMMON_MODEL_BUFFER_H 75 | -------------------------------------------------------------------------------- /kenlm/lm/common/model_buffer_test.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/model_buffer.hh" 2 | #include "lm/model.hh" 3 | #include "lm/state.hh" 4 | 5 | #define BOOST_TEST_MODULE ModelBufferTest 6 | #include <boost/test/unit_test.hpp> 7 | 8 | namespace lm { namespace { 9 | 10 | BOOST_AUTO_TEST_CASE(Query) { 11 | std::string dir("test_data/"); 12 | if (boost::unit_test::framework::master_test_suite().argc == 2) { 13 | dir = boost::unit_test::framework::master_test_suite().argv[1]; 14 | } 15 | ngram::Model ref((dir + "/toy0.arpa").c_str()); 16 | ModelBuffer test(dir + "/toy0"); 17 | ngram::State ref_state, test_state; 18 | WordIndex a = ref.GetVocabulary().Index("a"); 19 | BOOST_CHECK_CLOSE( 20 | ref.FullScore(ref.BeginSentenceState(), a, ref_state).prob, 21 | test.SlowQuery(ref.BeginSentenceState(), a, test_state), 22 | 0.001); 23 | BOOST_CHECK_EQUAL((unsigned)ref_state.length, (unsigned)test_state.length); 24 | BOOST_CHECK_EQUAL(ref_state.words[0], test_state.words[0]); 25 | BOOST_CHECK_EQUAL(ref_state.backoff[0], test_state.backoff[0]); 26 | BOOST_CHECK(ref_state == test_state); 27 | 28 | ngram::State ref_state2, test_state2; 29 | WordIndex b = ref.GetVocabulary().Index("b"); 30 | BOOST_CHECK_CLOSE( 31 | ref.FullScore(ref_state, b, ref_state2).prob, 32 | test.SlowQuery(test_state, b, test_state2), 33 | 0.001); 34 | BOOST_CHECK(ref_state2 == test_state2); 35 | BOOST_CHECK_EQUAL(ref_state2.backoff[0], test_state2.backoff[0]); 36 | 37 | BOOST_CHECK_CLOSE( 38 | ref.FullScore(ref_state2, 0, ref_state).prob, 39 | test.SlowQuery(test_state2, 0, test_state), 40 | 0.001); 41 | // The reference does state minimization but this doesn't.
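// (So only the probability is compared for this final <unk> query; the out
// states may legitimately differ.)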
42 | } 43 | 44 | }} // namespaces 45 | -------------------------------------------------------------------------------- /kenlm/lm/common/ngram.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_NGRAM_H 2 | #define LM_COMMON_NGRAM_H 3 | 4 | #include "lm/weights.hh" 5 | #include "lm/word_index.hh" 6 | 7 | #include <cassert> 8 | #include <cstddef> 9 | #include <cstring> 10 | #include <stdint.h> 11 | 12 | namespace lm { 13 | 14 | class NGramHeader { 15 | public: 16 | NGramHeader(void *begin, std::size_t order) 17 | : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {} 18 | 19 | NGramHeader() : begin_(NULL), end_(NULL) {} 20 | 21 | const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); } 22 | uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); } 23 | 24 | void ReBase(void *to) { 25 | std::size_t difference = end_ - begin_; 26 | begin_ = reinterpret_cast<WordIndex*>(to); 27 | end_ = begin_ + difference; 28 | } 29 | 30 | // These are for the vocab index. 31 | // Lower-case in deference to STL. 32 | const WordIndex *begin() const { return begin_; } 33 | WordIndex *begin() { return begin_; } 34 | const WordIndex *end() const { return end_; } 35 | WordIndex *end() { return end_; } 36 | 37 | std::size_t size() const { return end_ - begin_; } 38 | std::size_t Order() const { return end_ - begin_; } 39 | 40 | private: 41 | WordIndex *begin_, *end_; 42 | }; 43 | 44 | template <class PayloadT> class NGram : public NGramHeader { 45 | public: 46 | typedef PayloadT Payload; 47 | 48 | NGram() : NGramHeader(NULL, 0) {} 49 | 50 | NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {} 51 | 52 | // Would do operator++ but that can get confusing for a stream. 53 | void NextInMemory() { 54 | ReBase(&Value() + 1); 55 | } 56 | 57 | static std::size_t TotalSize(std::size_t order) { 58 | return order * sizeof(WordIndex) + sizeof(Payload); 59 | } 60 | std::size_t TotalSize() const { 61 | // Compiler should optimize this. 62 | return TotalSize(Order()); 63 | } 64 | 65 | static std::size_t OrderFromSize(std::size_t size) { 66 | std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex); 67 | assert(size == TotalSize(ret)); 68 | return ret; 69 | } 70 | 71 | const Payload &Value() const { return *reinterpret_cast<const Payload *>(end()); } 72 | Payload &Value() { return *reinterpret_cast<Payload *>(end()); } 73 | }; 74 | 75 | } // namespace lm 76 | 77 | #endif // LM_COMMON_NGRAM_H 78 | -------------------------------------------------------------------------------- /kenlm/lm/common/ngram_stream.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_BUILDER_NGRAM_STREAM_H 2 | #define LM_BUILDER_NGRAM_STREAM_H 3 | 4 | #include "lm/common/ngram.hh" 5 | #include "util/stream/chain.hh" 6 | #include "util/stream/multi_stream.hh" 7 | #include "util/stream/stream.hh" 8 | 9 | #include <cstddef> 10 | 11 | namespace lm { 12 | 13 | template <class Proxy> class ProxyStream { 14 | public: 15 | // Make an invalid stream.
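// (It converts to false; assign a valid stream to it before use.)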
16 | ProxyStream() {} 17 | 18 | explicit ProxyStream(const util::stream::ChainPosition &position, const Proxy &proxy = Proxy()) 19 | : proxy_(proxy), stream_(position) { 20 | proxy_.ReBase(stream_.Get()); 21 | } 22 | 23 | Proxy &operator*() { return proxy_; } 24 | const Proxy &operator*() const { return proxy_; } 25 | 26 | Proxy *operator->() { return &proxy_; } 27 | const Proxy *operator->() const { return &proxy_; } 28 | 29 | void *Get() { return stream_.Get(); } 30 | const void *Get() const { return stream_.Get(); } 31 | 32 | operator bool() const { return stream_; } 33 | bool operator!() const { return !stream_; } 34 | void Poison() { stream_.Poison(); } 35 | 36 | ProxyStream &operator++() { 37 | ++stream_; 38 | proxy_.ReBase(stream_.Get()); 39 | return *this; 40 | } 41 | 42 | private: 43 | Proxy proxy_; 44 | util::stream::Stream stream_; 45 | }; 46 | 47 | template <class Payload> class NGramStream : public ProxyStream<NGram<Payload> > { 48 | public: 49 | // Make an invalid stream. 50 | NGramStream() {} 51 | 52 | explicit NGramStream(const util::stream::ChainPosition &position) : 53 | ProxyStream<NGram<Payload> >(position, NGram<Payload>(NULL, NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()))) {} 54 | }; 55 | 56 | template <class Payload> class NGramStreams : public util::stream::GenericStreams<NGramStream<Payload> > { 57 | private: 58 | typedef util::stream::GenericStreams<NGramStream<Payload> > P; 59 | public: 60 | NGramStreams() : P() {} 61 | NGramStreams(const util::stream::ChainPositions &positions) : P(positions) {} 62 | }; 63 | 64 | } // namespace 65 | #endif // LM_BUILDER_NGRAM_STREAM_H 66 | -------------------------------------------------------------------------------- /kenlm/lm/common/print.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/print.hh" 2 | 3 | #include "lm/common/ngram_stream.hh" 4 | #include "util/file_stream.hh" 5 | #include "util/file.hh" 6 | #include "util/mmap.hh" 7 | #include "util/scoped.hh" 8 | 9 | #include <sstream> 10 | #include <cstring> 11 | 12 | namespace lm { 13 | 14 | VocabReconstitute::VocabReconstitute(int fd) { 15 | uint64_t size = util::SizeOrThrow(fd); 16 | util::MapRead(util::POPULATE_OR_READ, fd, 0, size, memory_); 17 | const char *const start = static_cast<const char*>(memory_.get()); 18 | const char *i; 19 | for (i = start; i != start + size; i += strlen(i) + 1) { 20 | map_.push_back(i); 21 | } 22 | // Last one for LookupPiece.
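// (This sentinel lets LookupPiece compute the final word's length as
// map_[index + 1] - 1 - map_[index], the same as every other entry.)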
23 | map_.push_back(i); 24 | } 25 | 26 | namespace { 27 | template <class V> void PrintLead(const VocabReconstitute &vocab, ProxyStream<V> &stream, util::FileStream &out) { 28 | out << stream->Value().prob << '\t' << vocab.Lookup(*stream->begin()); 29 | for (const WordIndex *i = stream->begin() + 1; i != stream->end(); ++i) { 30 | out << ' ' << vocab.Lookup(*i); 31 | } 32 | } 33 | } // namespace 34 | 35 | void PrintARPA::Run(const util::stream::ChainPositions &positions) { 36 | VocabReconstitute vocab(vocab_fd_); 37 | util::FileStream out(out_fd_); 38 | out << "\\data\\\n"; 39 | for (size_t i = 0; i < positions.size(); ++i) { 40 | out << "ngram " << (i+1) << '=' << counts_[i] << '\n'; 41 | } 42 | out << '\n'; 43 | 44 | for (unsigned order = 1; order < positions.size(); ++order) { 45 | out << "\\" << order << "-grams:" << '\n'; 46 | for (ProxyStream<NGram<ProbBackoff> > stream(positions[order - 1], NGram<ProbBackoff>(NULL, order)); stream; ++stream) { 47 | PrintLead(vocab, stream, out); 48 | out << '\t' << stream->Value().backoff << '\n'; 49 | } 50 | out << '\n'; 51 | } 52 | 53 | out << "\\" << positions.size() << "-grams:" << '\n'; 54 | for (ProxyStream<NGram<Prob> > stream(positions.back(), NGram<Prob>(NULL, positions.size())); stream; ++stream) { 55 | PrintLead(vocab, stream, out); 56 | out << '\n'; 57 | } 58 | out << '\n'; 59 | out << "\\end\\\n"; 60 | } 61 | 62 | } // namespace lm 63 | -------------------------------------------------------------------------------- /kenlm/lm/common/print.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_PRINT_H 2 | #define LM_COMMON_PRINT_H 3 | 4 | #include "lm/word_index.hh" 5 | #include "util/mmap.hh" 6 | #include "util/string_piece.hh" 7 | 8 | #include <cassert> 9 | #include <vector> 10 | 11 | namespace util { namespace stream { class ChainPositions; }} 12 | 13 | // Warning: PrintARPA routines read all unigrams before all bigrams before all 14 | // trigrams etc. So if other parts of the chain move jointly, you'll have to 15 | // buffer. 16 | 17 | namespace lm { 18 | 19 | class VocabReconstitute { 20 | public: 21 | // fd must be alive for life of this object; does not take ownership. 22 | explicit VocabReconstitute(int fd); 23 | 24 | const char *Lookup(WordIndex index) const { 25 | assert(index < map_.size() - 1); 26 | return map_[index]; 27 | } 28 | 29 | StringPiece LookupPiece(WordIndex index) const { 30 | return StringPiece(map_[index], map_[index + 1] - 1 - map_[index]); 31 | } 32 | 33 | std::size_t Size() const { 34 | // There's an extra entry to support StringPiece lengths. 35 | return map_.size() - 1; 36 | } 37 | 38 | private: 39 | util::scoped_memory memory_; 40 | std::vector<const char *> map_; 41 | }; 42 | 43 | class PrintARPA { 44 | public: 45 | // Does not take ownership of vocab_fd or out_fd.
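// Both descriptors must remain open for the duration of Run().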
46 | explicit PrintARPA(int vocab_fd, int out_fd, const std::vector<uint64_t> &counts) 47 | : vocab_fd_(vocab_fd), out_fd_(out_fd), counts_(counts) {} 48 | 49 | void Run(const util::stream::ChainPositions &positions); 50 | 51 | private: 52 | int vocab_fd_; 53 | int out_fd_; 54 | std::vector<uint64_t> counts_; 55 | }; 56 | 57 | } // namespace lm 58 | #endif // LM_COMMON_PRINT_H 59 | -------------------------------------------------------------------------------- /kenlm/lm/common/renumber.cc: -------------------------------------------------------------------------------- 1 | #include "lm/common/renumber.hh" 2 | #include "lm/common/ngram.hh" 3 | 4 | #include "util/stream/stream.hh" 5 | 6 | namespace lm { 7 | 8 | void Renumber::Run(const util::stream::ChainPosition &position) { 9 | for (util::stream::Stream stream(position); stream; ++stream) { 10 | NGramHeader gram(stream.Get(), order_); 11 | for (WordIndex *w = gram.begin(); w != gram.end(); ++w) { 12 | *w = new_numbers_[*w]; 13 | } 14 | } 15 | } 16 | 17 | } // namespace lm 18 | -------------------------------------------------------------------------------- /kenlm/lm/common/renumber.hh: -------------------------------------------------------------------------------- 1 | /* Map vocab ids. This is useful to merge independently collected counts or 2 | * change the vocab ids to the order used by the trie. 3 | */ 4 | #ifndef LM_COMMON_RENUMBER_H 5 | #define LM_COMMON_RENUMBER_H 6 | 7 | #include "lm/word_index.hh" 8 | 9 | #include <cstddef> 10 | 11 | namespace util { namespace stream { class ChainPosition; }} 12 | 13 | namespace lm { 14 | 15 | class Renumber { 16 | public: 17 | // Assumes the array is large enough to map all words and stays alive while 18 | // the thread is active. 19 | Renumber(const WordIndex *new_numbers, std::size_t order) 20 | : new_numbers_(new_numbers), order_(order) {} 21 | 22 | void Run(const util::stream::ChainPosition &position); 23 | 24 | private: 25 | const WordIndex *new_numbers_; 26 | std::size_t order_; 27 | }; 28 | 29 | } // namespace lm 30 | #endif // LM_COMMON_RENUMBER_H 31 | -------------------------------------------------------------------------------- /kenlm/lm/common/size_option.cc: -------------------------------------------------------------------------------- 1 | #include <boost/program_options.hpp> 2 | #include "util/usage.hh" 3 | 4 | namespace lm { 5 | 6 | namespace { 7 | class SizeNotify { 8 | public: 9 | explicit SizeNotify(std::size_t &out) : behind_(out) {} 10 | 11 | void operator()(const std::string &from) { 12 | behind_ = util::ParseSize(from); 13 | } 14 | 15 | private: 16 | std::size_t &behind_; 17 | }; 18 | } 19 | 20 | boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value) { 21 | return boost::program_options::value<std::string>()->notifier(SizeNotify(to))->default_value(default_value); 22 | } 23 | 24 | } // namespace lm 25 | -------------------------------------------------------------------------------- /kenlm/lm/common/size_option.hh: -------------------------------------------------------------------------------- 1 | #include <boost/program_options.hpp> 2 | 3 | #include <cstddef> 4 | #include <string> 5 | 6 | namespace lm { 7 | 8 | // Create a boost program option for data sizes. This parses sizes like 1T and 10k.
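A sketch of registering such an option, modeled on how lmplz wires it up; the option name and default shown here are illustrative:

```cpp
#include <boost/program_options.hpp>

std::size_t sort_memory;
boost::program_options::options_description options("Memory");
options.add_options()
  ("memory,S", lm::SizeOption(sort_memory, "80%"),
   "Sorting memory: accepts 1T, 10k, 500M, or a percentage of RAM");
// After parsing and notification, sort_memory holds the size in bytes.
```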
9 | boost::program_options::typed_value<std::string> *SizeOption(std::size_t &to, const char *default_value); 10 | 11 | } // namespace lm 12 | -------------------------------------------------------------------------------- /kenlm/lm/common/special.hh: -------------------------------------------------------------------------------- 1 | #ifndef LM_COMMON_SPECIAL_H 2 | #define LM_COMMON_SPECIAL_H 3 | 4 | #include "lm/word_index.hh" 5 | 6 | namespace lm { 7 | 8 | class SpecialVocab { 9 | public: 10 | SpecialVocab(WordIndex bos, WordIndex eos) : bos_(bos), eos_(eos) {} 11 | 12 | bool IsSpecial(WordIndex word) const { 13 | return word == kUNK || word == bos_ || word == eos_; 14 | } 15 | 16 | WordIndex UNK() const { return kUNK; } 17 | WordIndex BOS() const { return bos_; } 18 | WordIndex EOS() const { return eos_; } 19 | 20 | private: 21 | WordIndex bos_; 22 | WordIndex eos_; 23 | }; 24 | 25 | } // namespace lm 26 | 27 | #endif // LM_COMMON_SPECIAL_H 28 | -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ../../../bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy0 --arpa toy0.arpa < -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy0.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=5 3 | ngram 2=7 4 | ngram 3=7 5 | 6 | \1-grams: 7 | -1 <unk> 0 8 | 0 <s> -0.30103 9 | -0.46943438 a -0.30103 10 | -0.5720968 </s> 0 11 | -0.5720968 b -0.30103 12 | 13 | \2-grams: 14 | -0.37712017 <s> a -0.30103 15 | -0.37712017 a a -0.30103 16 | -0.2984526 b a -0.30103 17 | -0.58682007 a </s> 0 18 | -0.52201796 b </s> 0 19 | -0.41574955 <s> b -0.30103 20 | -0.58682007 a b -0.30103 21 | 22 | \3-grams: 23 | -0.14885087 <s> a a 24 | -0.33741078 b a a 25 | -0.124077894 <s> b a 26 | -0.2997394 a b a 27 | -0.42082912 b a </s> 28 | -0.397617 a b </s> 29 | -0.20102891 a a b 30 | 31 | \end\ 32 | -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy0.kenlm_intermediate: -------------------------------------------------------------------------------- 1 | KenLM intermediate binary file 2 | Counts 5 7 7 3 | Payload pb 4 | -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy0.vocab: -------------------------------------------------------------------------------- 1 | ab -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenlm/lm/common/test_data/toy1.1 -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenlm/lm/common/test_data/toy1.2 -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenlm/lm/common/test_data/toy1.3 -------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.arpa: -------------------------------------------------------------------------------- 1 | \data\ 2 | ngram 1=6 3 | ngram 2=7 4 | ngram 3=6 5 | 6 | \1-grams: 7 | -1 <unk> 0 8 | 0 <s> -0.30103 9 | -0.6146491 a -0.30103 10 |
-0.6146491 </s> 0
-0.7659168 c -0.30103
-0.6146491 b -0.30103

\2-grams:
-0.4301247 <s> a -0.30103
-0.4301247 a a -0.30103
-0.20660876 c </s> 0
-0.5404639 b </s> 0
-0.4740302 <s> c -0.30103
-0.4301247 a b -0.30103
-0.3422159 b b -0.47712123

\3-grams:
-0.1638568 <s> a a
-0.09113217 <s> c </s>
-0.7462621 b b </s>
-0.1638568 a a b
-0.13823806 a b b
-0.13375957 b b b

\end\
-------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.kenlm_intermediate: --------------------------------------------------------------------------------
KenLM intermediate binary file
Counts 6 7 6
Payload pb
-------------------------------------------------------------------------------- /kenlm/lm/common/test_data/toy1.vocab: --------------------------------------------------------------------------------
<unk><s></s>acb
-------------------------------------------------------------------------------- /kenlm/lm/config.cc: --------------------------------------------------------------------------------
#include "lm/config.hh"

#include <iostream>

namespace lm {
namespace ngram {

Config::Config() :
  show_progress(true),
  messages(&std::cerr),
  enumerate_vocab(NULL),
  unknown_missing(COMPLAIN),
  sentence_marker_missing(THROW_UP),
  positive_log_probability(THROW_UP),
  unknown_missing_logprob(-100.0),
  probing_multiplier(1.5),
  building_memory(1073741824ULL), // 1 GB
  temporary_directory_prefix(""),
  arpa_complain(ALL),
  write_mmap(NULL),
  write_method(WRITE_AFTER),
  include_vocab(true),
  rest_function(REST_MAX),
  prob_bits(8),
  backoff_bits(8),
  pointer_bhiksha_bits(22),
  load_method(util::POPULATE_OR_READ) {}

} // namespace ngram
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/enumerate_vocab.hh: --------------------------------------------------------------------------------
#ifndef LM_ENUMERATE_VOCAB_H
#define LM_ENUMERATE_VOCAB_H

#include "lm/word_index.hh"
#include "util/string_piece.hh"

namespace lm {

/* If you need the actual strings in the vocabulary, inherit from this class
 * and implement Add. Then put a pointer in Config.enumerate_vocab; it does
 * not take ownership. Add is called once per vocab word. index starts at 0
 * and increases by 1 each time. This is only used by the Model constructor;
 * the pointer is not retained by the class.
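 *
 * A minimal sketch (hypothetical subclass, not part of KenLM):
 *
 *   class PrintVocab : public lm::EnumerateVocab {
 *     public:
 *       void Add(lm::WordIndex index, const StringPiece &str) {
 *         std::cout << index << ' ' << str << '\n';
 *       }
 *   };
 *
 *   // PrintVocab printer; config.enumerate_vocab = &printer;
 *   // then construct the Model with that config.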
 */
class EnumerateVocab {
  public:
    virtual ~EnumerateVocab() {}

    virtual void Add(WordIndex index, const StringPiece &str) = 0;

  protected:
    EnumerateVocab() {}
};

} // namespace lm

#endif // LM_ENUMERATE_VOCAB_H

-------------------------------------------------------------------------------- /kenlm/lm/filter/CMakeLists.txt: --------------------------------------------------------------------------------
# This CMake file was created by Lane Schwartz

# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to set correct paths to these files
# in case this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_FILTER_SOURCE
  ${CMAKE_CURRENT_SOURCE_DIR}/arpa_io.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/phrase.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/vocab.cc
)

# Group these objects together for later use.
#
# Given add_library(foo OBJECT ${my_foo_sources}),
# refer to these objects as $<TARGET_OBJECTS:foo>
#
add_library(kenlm_filter ${KENLM_FILTER_SOURCE})

if (NOT MSVC)
  set(THREADS pthread)
endif()

AddExes(EXES filter phrase_table_vocab
  LIBRARIES kenlm_filter kenlm kenlm_util ${Boost_LIBRARIES} ${THREADS})

-------------------------------------------------------------------------------- /kenlm/lm/filter/arpa_io.cc: --------------------------------------------------------------------------------
#include "lm/filter/arpa_io.hh"
#include "util/file_piece.hh"
#include "util/string_stream.hh"

#include <iostream>
#include <ostream>
#include <string>
#include <vector>

#include <ctype.h>
#include <errno.h>
#include <string.h>

namespace lm {

ARPAInputException::ARPAInputException(const StringPiece &message) throw() {
  *this << message;
}

ARPAInputException::ARPAInputException(const StringPiece &message, const StringPiece &line) throw() {
  *this << message << " in line " << line;
}

ARPAInputException::~ARPAInputException() throw() {}

// Seeking is the responsibility of the caller.
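// For example, number = {5, 7, 7} (the toy0 counts above) would emit:
//
//   \data\
//   ngram 1=5
//   ngram 2=7
//   ngram 3=7
//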
template <class Stream> void WriteCounts(Stream &out, const std::vector<uint64_t> &number) {
  out << "\n\\data\\\n";
  for (unsigned int i = 0; i < number.size(); ++i) {
    out << "ngram " << i+1 << "=" << number[i] << '\n';
  }
  out << '\n';
}

size_t SizeNeededForCounts(const std::vector<uint64_t> &number) {
  util::StringStream stream;
  WriteCounts(stream, number);
  return stream.str().size();
}

bool IsEntirelyWhiteSpace(const StringPiece &line) {
  for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) {
    if (!isspace(line.data()[i])) return false;
  }
  return true;
}

ARPAOutput::ARPAOutput(const char *name, size_t buffer_size)
  : file_backing_(util::CreateOrThrow(name)), file_(file_backing_.get(), buffer_size) {}

void ARPAOutput::ReserveForCounts(std::streampos reserve) {
  for (std::streampos i = 0; i < reserve; i += std::streampos(1)) {
    file_ << '\n';
  }
}

void ARPAOutput::BeginLength(unsigned int length) {
  file_ << '\\' << length << "-grams:" << '\n';
  fast_counter_ = 0;
}

void ARPAOutput::EndLength(unsigned int length) {
  file_ << '\n';
  if (length > counts_.size()) {
    counts_.resize(length);
  }
  counts_[length - 1] = fast_counter_;
}

void ARPAOutput::Finish() {
  file_ << "\\end\\\n";
  file_.seekp(0);
  WriteCounts(file_, counts_);
  file_.flush();
}

} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/filter/vocab.cc: --------------------------------------------------------------------------------
#include "lm/filter/vocab.hh"

#include <istream>
#include <iostream>

#include <ctype.h>

namespace lm {
namespace vocab {

void ReadSingle(std::istream &in, boost::unordered_set<std::string> &out) {
  in.exceptions(std::istream::badbit);
  std::string word;
  while (in >> word) {
    out.insert(word);
  }
}

namespace {
bool IsLineEnd(std::istream &in) {
  int got;
  do {
    got = in.get();
    if (!in) return true;
    if (got == '\n') return true;
  } while (isspace(got));
  in.unget();
  return false;
}
}// namespace

// Read space separated words in enter separated lines. These lines can be
// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
  in.exceptions(std::istream::badbit);
  unsigned int sentence = 0;
  bool used_id = false;
  std::string word;
  while (in >> word) {
    used_id = true;
    std::vector<unsigned int> &posting = out[word];
    if (posting.empty() || (posting.back() != sentence))
      posting.push_back(sentence);
    if (IsLineEnd(in)) {
      ++sentence;
      used_id = false;
    }
  }
  return sentence + used_id;
}

} // namespace vocab
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/filter/wrapper.hh: --------------------------------------------------------------------------------
#ifndef LM_FILTER_WRAPPER_H
#define LM_FILTER_WRAPPER_H

#include "util/string_piece.hh"

#include <algorithm>
#include <string>
#include <vector>

namespace lm {

// Provide a single-output filter with the same interface as a
// multiple-output filter so clients code against one interface.
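// Sketch of the Binary concept this wrapper expects (hypothetical filter, for
// illustration only): a vocabulary filter whose PassNGram(begin, end) returns
// true iff every token is in a keep-set; BinaryFilter<VocabKeep> then forwards
// each matching ARPA line unchanged to the output.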
template <class Binary> class BinaryFilter {
  public:
    // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
    explicit BinaryFilter(Binary binary) : binary_(binary) {}

    template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
      if (binary_.PassNGram(begin, end))
        output.AddNGram(line);
    }

    template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
      AddNGram(util::TokenIter<util::SingleCharacter, true>(ngram, ' '), util::TokenIter<util::SingleCharacter, true>::end(), line, output);
    }

    void Flush() const {}

  private:
    Binary binary_;
};

// Wrap another filter to pay attention only to context words
template <class FilterT> class ContextFilter {
  public:
    typedef FilterT Filter;

    explicit ContextFilter(Filter &backend) : backend_(backend) {}

    template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
      // Find beginning of string or last space.
      const char *last_space;
      for (last_space = ngram.data() + ngram.size() - 1; last_space > ngram.data() && *last_space != ' '; --last_space) {}
      backend_.AddNGram(StringPiece(ngram.data(), last_space - ngram.data()), line, output);
    }

    void Flush() const {}

  private:
    Filter backend_;
};

} // namespace lm

#endif // LM_FILTER_WRAPPER_H
-------------------------------------------------------------------------------- /kenlm/lm/fragment_main.cc: --------------------------------------------------------------------------------
#include "lm/binary_format.hh"
#include "lm/model.hh"
#include "lm/left.hh"
#include "util/tokenize_piece.hh"

template <class Model> void Query(const char *name) {
  Model model(name);
  std::string line;
  lm::ngram::ChartState ignored;
  while (getline(std::cin, line)) {
    lm::ngram::RuleScore<Model> scorer(model, ignored);
    for (util::TokenIter<util::SingleCharacter, true> i(line, ' '); i; ++i) {
      scorer.Terminal(model.GetVocabulary().Index(*i));
    }
    std::cout << scorer.Finish() << '\n';
  }
}

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Expected model file name." << std::endl;
    return 1;
  }
  const char *name = argv[1];
  lm::ngram::ModelType model_type = lm::ngram::PROBING;
  lm::ngram::RecognizeBinary(name, model_type);
  switch (model_type) {
    case lm::ngram::PROBING:
      Query<lm::ngram::ProbingModel>(name);
      break;
    case lm::ngram::REST_PROBING:
      Query<lm::ngram::RestProbingModel>(name);
      break;
    default:
      std::cerr << "Model type not supported yet." << std::endl;
  }
}
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/backoff_matrix.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_BACKOFF_MATRIX_H
#define LM_INTERPOLATE_BACKOFF_MATRIX_H

#include <cstddef>
#include <vector>

namespace lm { namespace interpolate {

class BackoffMatrix {
  public:
    BackoffMatrix(std::size_t num_models, std::size_t max_order)
      : max_order_(max_order), backing_(num_models * max_order) {}

    float &Backoff(std::size_t model, std::size_t order_minus_1) {
      return backing_[model * max_order_ + order_minus_1];
    }

    float Backoff(std::size_t model, std::size_t order_minus_1) const {
      return backing_[model * max_order_ + order_minus_1];
    }

  private:
    const std::size_t max_order_;
    std::vector<float> backing_;
};

}} // namespaces

#endif // LM_INTERPOLATE_BACKOFF_MATRIX_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/backoff_reunification.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/backoff_reunification.hh"
#include "lm/common/model_buffer.hh"
#include "lm/common/ngram_stream.hh"
#include "lm/common/ngram.hh"
#include "lm/common/compare.hh"

#include <algorithm>
#include <cassert>

namespace lm {
namespace interpolate {

namespace {
class MergeWorker {
  public:
    MergeWorker(std::size_t order, const util::stream::ChainPosition &prob_pos,
                const util::stream::ChainPosition &boff_pos)
      : order_(order), prob_pos_(prob_pos), boff_pos_(boff_pos) {
      // nothing
    }

    void Run(const util::stream::ChainPosition &position) {
      lm::NGramStream<ProbBackoff> stream(position);

      lm::NGramStream<float> prob_input(prob_pos_);
      util::stream::Stream boff_input(boff_pos_);
      for (; prob_input && boff_input; ++prob_input, ++boff_input, ++stream) {
        std::copy(prob_input->begin(), prob_input->end(), stream->begin());
        stream->Value().prob = std::min(0.0f, prob_input->Value());
        stream->Value().backoff = *reinterpret_cast<float *>(boff_input.Get());
      }
      UTIL_THROW_IF2(prob_input || boff_input,
                     "Streams were not the same size during merging");
      stream.Poison();
    }

  private:
    std::size_t order_;
    util::stream::ChainPosition prob_pos_;
    util::stream::ChainPosition boff_pos_;
};
}

// Since we are *adding* something to the output chain here, we pass in the
// chain itself so that we can safely add a new step to the chain without
// creating a deadlock situation (since creating a new ChainPosition will
// make a new input/output pair---we want that position to be created
// *here*, not before).
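// Per-order dataflow (informal): prob_pos[i] supplies suffix-ordered
// (ngram ids, probability) records, boff_pos[i] supplies bare backoff floats,
// and MergeWorker zips them into complete NGram<ProbBackoff> records on
// output_chains[i].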
void ReunifyBackoff(util::stream::ChainPositions &prob_pos,
                    util::stream::ChainPositions &boff_pos,
                    util::stream::Chains &output_chains) {
  assert(prob_pos.size() == boff_pos.size());

  for (size_t i = 0; i < prob_pos.size(); ++i)
    output_chains[i] >> MergeWorker(i + 1, prob_pos[i], boff_pos[i]);
}
}
}
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/backoff_reunification.hh: --------------------------------------------------------------------------------
#ifndef KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_
#define KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_

#include "util/stream/stream.hh"
#include "util/stream/multi_stream.hh"

namespace lm {
namespace interpolate {

/**
 * The third pass for the offline log-linear interpolation algorithm. This
 * reads **suffix-ordered** probability values (ngram-id, float) and
 * **suffix-ordered** backoff values (float) and writes the merged contents
 * to the output.
 *
 * @param prob_pos The chain position for each order from which to read
 *    the probability values
 * @param boff_pos The chain position for each order from which to read
 *    the backoff values
 * @param output_chains The output chains for each order
 */
void ReunifyBackoff(util::stream::ChainPositions &prob_pos,
                    util::stream::ChainPositions &boff_pos,
                    util::stream::Chains &output_chains);
}
}
#endif
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/bounded_sequence_encoding.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/bounded_sequence_encoding.hh"

#include <algorithm>

namespace lm { namespace interpolate {

BoundedSequenceEncoding::BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end)
  : entries_(bound_end - bound_begin) {
  std::size_t full = 0;
  Entry entry;
  entry.shift = 0;
  for (const unsigned char *i = bound_begin; i != bound_end; ++i) {
    uint8_t length;
    if (*i <= 1) {
      length = 0;
    } else {
      length = sizeof(unsigned int) * 8 - __builtin_clz((unsigned int)*i);
    }
    entry.mask = (1ULL << length) - 1ULL;
    if (entry.shift + length > 64) {
      entry.shift = 0;
      entry.next = true;
      ++full;
    } else {
      entry.next = false;
    }
    entries_.push_back(entry);
    entry.shift += length;
  }
  byte_length_ = full * sizeof(uint64_t) + (entry.shift + 7) / 8;
  first_copy_ = std::min(byte_length_, sizeof(uint64_t));
  // Size of last uint64_t. Zero if empty, otherwise [1,8] depending on mod.
  overhang_ = byte_length_ == 0 ? 0 : ((byte_length_ - 1) % 8 + 1);
}

}} // namespaces
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/bounded_sequence_encoding.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
#define LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H

/* Encodes fixed-length sequences of integers with known bounds on each entry.
 * This is used to encode how far each model has backed off.
 * TODO: make this class efficient. Bit-level packing or multiply by bound and
 * add.
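 *
 * Worked example, following the width rule in the .cc file: bounds {1, 5, 2}
 * get field widths {0, 3, 2} bits, so the sequence (0, 4, 1) packs into the
 * single byte 0b01100 (4 in the low three bits, then 1 in the next two), and
 * EncodedLength() == 1.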
 */

#include "util/exception.hh"
#include "util/fixed_array.hh"

#if BYTE_ORDER != LITTLE_ENDIAN
#warning The interpolation code assumes little endian for now.
#endif

#include <algorithm>
#include <cstring>

namespace lm {
namespace interpolate {

class BoundedSequenceEncoding {
  public:
    // Encode [0, bound_begin[0]) x [0, bound_begin[1]) x [0, bound_begin[2]) x ... x [0, *(bound_end - 1)) for entries in the sequence
    BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end);

    std::size_t Entries() const { return entries_.size(); }

    std::size_t EncodedLength() const { return byte_length_; }

    void Encode(const unsigned char *from, void *to_void) const {
      uint8_t *to = static_cast<uint8_t*>(to_void);
      uint64_t cur = 0;
      for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++from) {
        if (UTIL_UNLIKELY(i->next)) {
          std::memcpy(to, &cur, sizeof(uint64_t));
          to += sizeof(uint64_t);
          cur = 0;
        }
        cur |= static_cast<uint64_t>(*from) << i->shift;
      }
      memcpy(to, &cur, overhang_);
    }

    void Decode(const void *from_void, unsigned char *to) const {
      const uint8_t *from = static_cast<const uint8_t*>(from_void);
      uint64_t cur = 0;
      memcpy(&cur, from, first_copy_);
      for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++to) {
        if (UTIL_UNLIKELY(i->next)) {
          from += sizeof(uint64_t);
          cur = 0;
          std::memcpy(&cur, from,
                      std::min<std::size_t>(sizeof(uint64_t), static_cast<const uint8_t*>(from_void) + byte_length_ - from));
        }
        *to = (cur >> i->shift) & i->mask;
      }
    }

  private:
    struct Entry {
      bool next;
      uint8_t shift;
      uint64_t mask;
    };
    util::FixedArray<Entry> entries_;
    std::size_t byte_length_;
    std::size_t first_copy_;
    std::size_t overhang_;
};


}} // namespaces

#endif // LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/interpolate_info.hh: --------------------------------------------------------------------------------
#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H
#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H

#include <cstddef>
#include <stdint.h>
#include <vector>

namespace lm {
namespace interpolate {

/**
 * Stores relevant info for interpolating several language models, for use
 * during the three-pass offline log-linear interpolation algorithm.
 */
struct InterpolateInfo {
  /**
   * @return the number of models being interpolated
   */
  std::size_t Models() const {
    return orders.size();
  }

  /**
   * The lambda (interpolation weight) for each model.
   */
  std::vector<float> lambdas;

  /**
   * The maximum ngram order for each model.
   */
  std::vector<uint8_t> orders;
};
}
}
#endif
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test1: --------------------------------------------------------------------------------
athiscutisfirst
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test2: --------------------------------------------------------------------------------
is thisthis afirst cuta first
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test3: --------------------------------------------------------------------------------
isisecd
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test_bad_order: --------------------------------------------------------------------------------
secdis
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_test/test_no_unk: --------------------------------------------------------------------------------
toto
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/merge_vocab.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_MERGE_VOCAB_H
#define LM_INTERPOLATE_MERGE_VOCAB_H

#include "lm/word_index.hh"
#include "util/file.hh"
#include "util/fixed_array.hh"

namespace lm {

class EnumerateVocab;

namespace interpolate {

class UniversalVocab;

// The combined vocabulary is enumerated with enumerate.
// Returns the size of the combined vocabulary.
// Does not take ownership of vocab_files.
WordIndex MergeVocab(util::FixedArray<int> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate);

}} // namespaces

#endif // LM_INTERPOLATE_MERGE_VOCAB_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/normalize.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_NORMALIZE_H
#define LM_INTERPOLATE_NORMALIZE_H

#include "util/fixed_array.hh"

/* Pass 2:
 * - Multiply backoff weights by the backed off probabilities from pass 1.
 * - Compute the normalization factor Z.
 * - Send Z to the next highest order.
 * - Rewind and divide by Z.
 */

namespace util { namespace stream {
class ChainPositions;
class Chains;
}} // namespaces

namespace lm { namespace interpolate {

struct InterpolateInfo;

void Normalize(
    const InterpolateInfo &info,
    // Input full models for backoffs. Assumes that renumbering has been done. Suffix order.
    util::FixedArray<util::stream::ChainPositions> &models_by_order,
    // Input PartialProbGamma from MergeProbabilities. Context order.
    util::stream::Chains &merged_probabilities,
    // Output NGram with normalized probabilities. Context order.
    util::stream::Chains &probabilities_out,
    // Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order.
    util::stream::Chains &backoffs_out);

}} // namespaces

#endif // LM_INTERPOLATE_NORMALIZE_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/pipeline.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_PIPELINE_H
#define LM_INTERPOLATE_PIPELINE_H

#include "lm/common/model_buffer.hh"
#include "util/fixed_array.hh"
#include "util/stream/config.hh"

#include <cstddef>
#include <string>

namespace lm { namespace interpolate {

struct Config {
  std::vector<float> lambdas;
  util::stream::SortConfig sort;
  std::size_t BufferSize() const { return sort.buffer_size; }
};

void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file);

}} // namespaces
#endif // LM_INTERPOLATE_PIPELINE_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/split_worker.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/split_worker.hh"
#include "lm/common/ngram.hh"

namespace lm {
namespace interpolate {

SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
                         util::stream::Chain &sort_chain)
  : order_(order) {
  backoff_chain >> backoff_input_;
  sort_chain >> sort_input_;
}

void SplitWorker::Run(const util::stream::ChainPosition &position) {
  // input: ngram record (id, prob, and backoff)
  // output: a float to the backoff_input stream
  //         an ngram id and a float to the sort_input stream
  for (util::stream::Stream stream(position); stream; ++stream) {
    NGram<ProbBackoff> ngram(stream.Get(), order_);

    // write id and prob to the sort stream
    float prob = ngram.Value().prob;
    lm::WordIndex *out = reinterpret_cast<lm::WordIndex *>(sort_input_.Get());
    for (const lm::WordIndex *it = ngram.begin(); it != ngram.end(); ++it) {
      *out++ = *it;
    }
    *reinterpret_cast<float *>(out) = prob;
    ++sort_input_;

    // write backoff to the backoff output stream
    float boff = ngram.Value().backoff;
    *reinterpret_cast<float *>(backoff_input_.Get()) = boff;
    ++backoff_input_;
  }
  sort_input_.Poison();
  backoff_input_.Poison();
}

}
}
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/split_worker.hh: --------------------------------------------------------------------------------
#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_
#define KENLM_INTERPOLATE_SPLIT_WORKER_H_

#include "util/stream/chain.hh"
#include "util/stream/stream.hh"

namespace lm {
namespace interpolate {

class SplitWorker {
  public:
    /**
     * Constructs a split worker for a particular order. It writes the
     * split-off backoff values to the backoff chain and the ngram id and
     * probability to the sort chain for each ngram in the input.
     */
    SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
                util::stream::Chain &sort_chain);

    /**
     * The callback invoked to handle the input from the ngram intermediate
     * files.
     */
    void Run(const util::stream::ChainPosition& position);

  private:
    /**
     * The ngram order we are reading/writing for.
     */
    std::size_t order_;

    /**
     * The stream to write to for the backoff values.
     */
    util::stream::Stream backoff_input_;

    /**
     * The stream to write to for the ngram id + probability values.
     */
    util::stream::Stream sort_input_;
};
}
}
#endif
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_derivatives.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H
#define LM_INTERPOLATE_TUNE_DERIVATIVES_H

#include "lm/interpolate/tune_matrix.hh"

#include <cmath>
#include <cstddef>

namespace lm { namespace interpolate {

class Instances;

// Given tuning instances and model weights, computes the objective function (log probability), gradient, and Hessian.
// Returns log probability / number of instances.
Accum Derivatives(Instances &instances /* Doesn't modify but ReadExtensions is lazy */, const Vector &weights, Vector &gradient, Matrix &hessian);

}} // namespaces

#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H

-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_matrix.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_TUNE_MATRIX_H
#define LM_INTERPOLATE_TUNE_MATRIX_H

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains.
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#include <Eigen/Core>
#pragma GCC diagnostic pop

namespace lm { namespace interpolate {

typedef Eigen::MatrixXf Matrix;
typedef Eigen::VectorXf Vector;

typedef Matrix::Scalar Accum;

}} // namespaces
#endif // LM_INTERPOLATE_TUNE_MATRIX_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_weights.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/tune_weights.hh"

#include "lm/interpolate/tune_derivatives.hh"
#include "lm/interpolate/tune_instances.hh"

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpragmas" // Older gcc doesn't have "-Wunused-local-typedefs" and complains.
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#include <Eigen/Dense>
#pragma GCC diagnostic pop
#include <boost/program_options.hpp>

#include <iostream>

namespace lm { namespace interpolate {
void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config, std::vector<float> &weights_out) {
  Instances instances(tune_file, model_names, config);
  Vector weights = Vector::Constant(model_names.size(), 1.0 / model_names.size());
  Vector gradient;
  Matrix hessian;
  for (std::size_t iteration = 0; iteration < 10 /*TODO fancy stopping criteria */; ++iteration) {
    std::cerr << "Iteration " << iteration << ": weights =";
    for (Vector::Index i = 0; i < weights.rows(); ++i) {
      std::cerr << ' ' << weights(i);
    }
    std::cerr << std::endl;
    std::cerr << "Perplexity = " << Derivatives(instances, weights, gradient, hessian) << std::endl;
    // TODO: 1.0 step size was too big and it kept getting unstable. More math.
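    // Damped Newton update: weights <- weights - 0.7 * H^{-1} * g, with the
    // gradient g and Hessian H of the log-probability objective computed by
    // Derivatives() above.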
    weights -= 0.7 * hessian.inverse() * gradient;
  }
  weights_out.assign(weights.data(), weights.data() + weights.size());
}
}} // namespaces
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/tune_weights.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_TUNE_WEIGHTS_H
#define LM_INTERPOLATE_TUNE_WEIGHTS_H

#include "util/string_piece.hh"

#include <vector>

namespace lm { namespace interpolate {
struct InstancesConfig;

// Run a tuning loop, producing weights as output.
void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, const InstancesConfig &config, std::vector<float> &weights);

}} // namespaces
#endif // LM_INTERPOLATE_TUNE_WEIGHTS_H
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/universal_vocab.cc: --------------------------------------------------------------------------------
#include "lm/interpolate/universal_vocab.hh"

namespace lm {
namespace interpolate {

UniversalVocab::UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes) {
  model_index_map_.resize(model_vocab_sizes.size());
  for (size_t i = 0; i < model_vocab_sizes.size(); ++i) {
    model_index_map_[i].resize(model_vocab_sizes[i]);
  }
}

}} // namespaces
-------------------------------------------------------------------------------- /kenlm/lm/interpolate/universal_vocab.hh: --------------------------------------------------------------------------------
#ifndef LM_INTERPOLATE_UNIVERSAL_VOCAB_H
#define LM_INTERPOLATE_UNIVERSAL_VOCAB_H

#include "lm/word_index.hh"

#include <cstddef>
#include <vector>

namespace lm {
namespace interpolate {

class UniversalVocab {
  public:
    explicit UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes);

    // GetUniversalIndex takes the model number and index for the specific
    // model and returns the universal model number
    WordIndex GetUniversalIdx(std::size_t model_num, WordIndex model_word_index) const {
      return model_index_map_[model_num][model_word_index];
    }

    const WordIndex *Mapping(std::size_t model) const {
      return &*model_index_map_[model].begin();
    }

    WordIndex SlowConvertToModel(std::size_t model, WordIndex index) const {
      std::vector<WordIndex>::const_iterator i = lower_bound(model_index_map_[model].begin(), model_index_map_[model].end(), index);
      if (i == model_index_map_[model].end() || *i != index) return 0;
      return i - model_index_map_[model].begin();
    }

    void InsertUniversalIdx(std::size_t model_num, WordIndex word_index,
                            WordIndex universal_word_index) {
      model_index_map_[model_num][word_index] = universal_word_index;
    }

  private:
    std::vector<std::vector<WordIndex> > model_index_map_;
};

} // namespace interpolate
} // namespace lm

#endif // LM_INTERPOLATE_UNIVERSAL_VOCAB_H
-------------------------------------------------------------------------------- /kenlm/lm/lm_exception.cc: --------------------------------------------------------------------------------
#include "lm/lm_exception.hh"

#include <cerrno>
#include <cstdio>

namespace lm {

ConfigException::ConfigException() throw() {}
ConfigException::~ConfigException() throw() {}

LoadException::LoadException() throw() {}
LoadException::~LoadException() throw() {}

FormatLoadException::FormatLoadException() throw() {}
FormatLoadException::~FormatLoadException() throw() {}

VocabLoadException::VocabLoadException() throw() {}
VocabLoadException::~VocabLoadException() throw() {}

SpecialWordMissingException::SpecialWordMissingException() throw() {}
SpecialWordMissingException::~SpecialWordMissingException() throw() {}

} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/lm_exception.hh: --------------------------------------------------------------------------------
#ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H

// Named to avoid conflict with util/exception.hh.

#include "util/exception.hh"
#include "util/string_piece.hh"

#include <exception>
#include <string>

namespace lm {

typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction;

class ConfigException : public util::Exception {
  public:
    ConfigException() throw();
    ~ConfigException() throw();
};

class LoadException : public util::Exception {
  public:
    virtual ~LoadException() throw();

  protected:
    LoadException() throw();
};

class FormatLoadException : public LoadException {
  public:
    FormatLoadException() throw();
    ~FormatLoadException() throw();
};

class VocabLoadException : public LoadException {
  public:
    virtual ~VocabLoadException() throw();
    VocabLoadException() throw();
};

class SpecialWordMissingException : public VocabLoadException {
  public:
    explicit SpecialWordMissingException() throw();
    ~SpecialWordMissingException() throw();
};

} // namespace lm

#endif // LM_LM_EXCEPTION
-------------------------------------------------------------------------------- /kenlm/lm/max_order.hh: --------------------------------------------------------------------------------
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
 * If not, this is the default maximum order.
 * Having this limit means that State can be
 * (kMaxOrder - 1) * sizeof(float) bytes instead of
 * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
 */
#ifndef KENLM_ORDER_MESSAGE
#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
#endif

#endif // LM_MAX_ORDER_H
-------------------------------------------------------------------------------- /kenlm/lm/model_type.hh: --------------------------------------------------------------------------------
#ifndef LM_MODEL_TYPE_H
#define LM_MODEL_TYPE_H

namespace lm {
namespace ngram {

/* Not the best numbering system, but it grew this way for historical reasons
 * and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;

// Historical names.
const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;
const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE;
const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE;

const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE - TRIE);
const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE - TRIE);

} // namespace ngram
} // namespace lm
#endif // LM_MODEL_TYPE_H
-------------------------------------------------------------------------------- /kenlm/lm/return.hh: --------------------------------------------------------------------------------
#ifndef LM_RETURN_H
#define LM_RETURN_H

#include <stdint.h>

namespace lm {
/* Structure returned by scoring routines. */
struct FullScoreReturn {
  // log10 probability
  float prob;

  /* The length of n-gram matched. Do not use this for recombination.
   * Consider a model containing only the following n-grams:
   * -1 foo
   * -3.14 bar
   * -2.718 baz -5
   * -6 foo bar
   *
   * If you score ``bar'' then ngram_length is 1 and recombination state is the
   * empty string because bar has zero backoff and does not extend to the
   * right.
   * If you score ``foo'' then ngram_length is 1 and recombination state is
   * ``foo''.
   *
   * Ideally, keep output states around and compare them. Failing that,
   * get out_state.ValidLength() and use that length for recombination.
   */
  unsigned char ngram_length;

  /* Left extension information. If independent_left is set, then prob is
   * independent of words to the left (up to additional backoff). Otherwise,
   * extend_left indicates how to efficiently extend further to the left.
   */
  bool independent_left;
  uint64_t extend_left; // Defined only if independent_left

  // Rest cost for extension to the left.
  float rest;
};

} // namespace lm
#endif // LM_RETURN_H
-------------------------------------------------------------------------------- /kenlm/lm/sizes.hh: --------------------------------------------------------------------------------
#ifndef LM_SIZES_H
#define LM_SIZES_H

#include <vector>

#include <stdint.h>

namespace lm { namespace ngram {

struct Config;

void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &config);
void ShowSizes(const std::vector<uint64_t> &counts);
void ShowSizes(const char *file, const lm::ngram::Config &config);

}} // namespaces
#endif // LM_SIZES_H
-------------------------------------------------------------------------------- /kenlm/lm/value_build.cc: --------------------------------------------------------------------------------
#include "lm/value_build.hh"

#include "lm/model.hh"
#include "lm/read_arpa.hh"

namespace lm {
namespace ngram {

template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) {
  UTIL_THROW_IF(config.rest_lower_files.size() != order - 1, ConfigException, "This model has order " << order << " so there should be " << (order - 1) << " lower-order models for rest cost purposes.");
  Config for_lower = config;
  for_lower.write_mmap = NULL;
  for_lower.rest_lower_files.clear();

  // Unigram models aren't supported, so this is a custom loader.
  // TODO: optimize the unigram loading?
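  // The block below reads only the unigram section of the first rest-cost
  // file: parse the \data\ counts, require order 1, then record each word's
  // probability in unigrams_, indexed by this model's vocabulary ids.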
  {
    util::FilePiece uni(config.rest_lower_files[0].c_str());
    std::vector<uint64_t> number;
    ReadARPACounts(uni, number);
    UTIL_THROW_IF(number.size() != 1, FormatLoadException, "Expected the unigram model to have order 1, not " << number.size());
    ReadNGramHeader(uni, 1);
    unigrams_.resize(number[0]);
    unigrams_[0] = config.unknown_missing_logprob;
    PositiveProbWarn warn;
    for (uint64_t i = 0; i < number[0]; ++i) {
      WordIndex w;
      Prob entry;
      ReadNGram(uni, 1, vocab, &w, entry, warn);
      unigrams_[w] = entry.prob;
    }
  }

  try {
    for (unsigned int i = 2; i < order; ++i) {
      models_.push_back(new Model(config.rest_lower_files[i - 1].c_str(), for_lower));
      UTIL_THROW_IF(models_.back()->Order() != i, FormatLoadException, "Lower order file " << config.rest_lower_files[i-1] << " should have order " << i);
    }
  } catch (...) {
    for (typename std::vector<const Model*>::const_iterator i = models_.begin(); i != models_.end(); ++i) {
      delete *i;
    }
    models_.clear();
    throw;
  }

  // TODO: force/check same vocab.
}

template <class Model> LowerRestBuild<Model>::~LowerRestBuild() {
  for (typename std::vector<const Model*>::const_iterator i = models_.begin(); i != models_.end(); ++i) {
    delete *i;
  }
}

template class LowerRestBuild<ProbingModel>;

} // namespace ngram
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/virtual_interface.cc: --------------------------------------------------------------------------------
#include "lm/virtual_interface.hh"

#include "lm/lm_exception.hh"

namespace lm {
namespace base {

Vocabulary::~Vocabulary() {}

void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) {
  begin_sentence_ = begin_sentence;
  end_sentence_ = end_sentence;
  not_found_ = not_found;
}

Model::~Model() {}

} // namespace base
} // namespace lm
-------------------------------------------------------------------------------- /kenlm/lm/weights.hh: --------------------------------------------------------------------------------
#ifndef LM_WEIGHTS_H
#define LM_WEIGHTS_H

// Weights for n-grams. Probability and possibly a backoff.

namespace lm {
struct Prob {
  float prob;
};
// No inheritance so this will be a POD.
struct ProbBackoff {
  float prob;
  float backoff;
};
struct RestWeights {
  float prob;
  float backoff;
  float rest;
};

} // namespace lm
#endif // LM_WEIGHTS_H
-------------------------------------------------------------------------------- /kenlm/lm/word_index.hh: --------------------------------------------------------------------------------
// Separate header because this is used often.
#ifndef LM_WORD_INDEX_H
#define LM_WORD_INDEX_H

#include <limits.h>

namespace lm {
typedef unsigned int WordIndex;
const WordIndex kMaxWordIndex = UINT_MAX;
const WordIndex kUNK = 0;
} // namespace lm

typedef lm::WordIndex LMWordIndex;

#endif
-------------------------------------------------------------------------------- /kenlm/lm/wrappers/README: --------------------------------------------------------------------------------
This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's. You will need to have their LM installed.

NPLM is a work in progress.
-------------------------------------------------------------------------------- /kenlm/lm/wrappers/nplm.hh: --------------------------------------------------------------------------------
#ifndef LM_WRAPPERS_NPLM_H
#define LM_WRAPPERS_NPLM_H

#include "lm/facade.hh"
#include "lm/max_order.hh"
#include "util/string_piece.hh"

#include <boost/thread/tss.hpp>
#include <boost/scoped_ptr.hpp>

/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
 * and Victoria Fossum."
 * http://nlg.isi.edu/software/nplm/
 */

namespace nplm {
class vocabulary;
class neuralLM;
} // namespace nplm

namespace lm {
namespace np {

class Vocabulary : public base::Vocabulary {
  public:
    Vocabulary(const nplm::vocabulary &vocab);

    ~Vocabulary();

    WordIndex Index(const std::string &str) const;

    // TODO: lobby them to support StringPiece
    WordIndex Index(const StringPiece &str) const {
      return Index(std::string(str.data(), str.size()));
    }

    lm::WordIndex NullWord() const { return null_word_; }

  private:
    const nplm::vocabulary &vocab_;

    const lm::WordIndex null_word_;
};

// Sorry for imposing my limitations on your code.
#define NPLM_MAX_ORDER 7

struct State {
  WordIndex words[NPLM_MAX_ORDER - 1];
};

class Backend;

class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
  private:
    typedef lm::base::ModelFacade<Model, State, Vocabulary> P;

  public:
    // Does this look like an NPLM?
    static bool Recognize(const std::string &file);

    explicit Model(const std::string &file, std::size_t cache_size = 1 << 20);

    ~Model();

    FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;

    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;

  private:
    boost::scoped_ptr<nplm::neuralLM> base_instance_;

    mutable boost::thread_specific_ptr<Backend> backend_;

    Vocabulary vocab_;

    lm::WordIndex null_word_;

    const std::size_t cache_size_;
};

} // namespace np
} // namespace lm

#endif // LM_WRAPPERS_NPLM_H
-------------------------------------------------------------------------------- /kenlm/python/_kenlm.pxd: --------------------------------------------------------------------------------
cdef extern from "lm/word_index.hh" namespace "lm":
    ctypedef unsigned WordIndex

cdef extern from "lm/return.hh" namespace "lm":
    cdef struct FullScoreReturn:
        float prob
        unsigned char ngram_length

cdef extern from "lm/state.hh" namespace "lm::ngram":
    cdef cppclass State :
        int Compare(const State &other) const

    int hash_value(const State &state)

cdef extern from "lm/virtual_interface.hh" namespace "lm::base":
    cdef cppclass Vocabulary:
        WordIndex Index(char*)
        WordIndex BeginSentence()
        WordIndex EndSentence()
        WordIndex NotFound()

    ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary"

    cdef cppclass Model:
        void BeginSentenceWrite(void *)
        void NullContextWrite(void *)
        unsigned int Order()
        const_Vocabulary& BaseVocabulary()
        float BaseScore(void *in_state, WordIndex new_word, void *out_state)
        FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state)

cdef extern from "util/mmap.hh" namespace "util":
    cdef enum LoadMethod:
        LAZY
        POPULATE_OR_LAZY
        POPULATE_OR_READ
        READ
        PARALLEL_READ

cdef extern from "lm/config.hh" namespace "lm::ngram":
    cdef cppclass Config:
        Config()
        float probing_multiplier
        LoadMethod load_method

cdef extern from "lm/model.hh" namespace "lm::ngram":
    cdef Model *LoadVirtual(char *, Config &config) except +
    #default constructor
    cdef Model *LoadVirtual(char *) except +

-------------------------------------------------------------------------------- /kenlm/python/example.py: --------------------------------------------------------------------------------
#!/usr/bin/env python
import os
import kenlm

LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
model = kenlm.Model(LM)
print('{0}-gram model'.format(model.order))

sentence = 'language modeling is fun .'
print(sentence)
print(model.score(sentence))

# Check that total full score = direct score
def score(s):
    return sum(prob for prob, _, _ in model.full_scores(s))

assert (abs(score(sentence) - model.score(sentence)) < 1e-3)

# Show scores and n-gram matches
words = ['<s>'] + sentence.split() + ['</s>']
for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
    print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))
    if oov:
        print('\t"{0}" is an OOV'.format(words[i+1]))

# Find out-of-vocabulary words
for w in words:
    if not w in model:
        print('"{0}" is an OOV'.format(w))

#Stateful query
state = kenlm.State()
state2 = kenlm.State()
#Use <s> as context. If you don't want <s>, use model.NullContextWrite(state).
model.BeginSentenceWrite(state)
accum = 0.0
accum += model.BaseScore(state, "a", state2)
accum += model.BaseScore(state2, "sentence", state)
#score defaults to bos = True and eos = True. Here we'll check without the end
#of sentence marker.
assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3)
accum += model.BaseScore(state, "</s>", state2)
assert (abs(accum - model.score("a sentence")) < 1e-3)
-------------------------------------------------------------------------------- /kenlm/setup.py: --------------------------------------------------------------------------------
from setuptools import setup, Extension
import glob
import platform
import os

#Does gcc compile with this header and library?
def compile_test(header, library):
    dummy_path = os.path.join(os.path.dirname(__file__), "dummy")
    command = "bash -c \"g++ -include " + header + " -l" + library + " -x c++ - <<<'int main() {}' -o " + dummy_path + " >/dev/null 2>/dev/null && rm " + dummy_path + " 2>/dev/null\""
    return os.system(command) == 0


FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc')
FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))]

LIBS = ['stdc++']
if platform.system() != 'Darwin':
    LIBS.append('rt')

#We don't need -std=c++11 but python seems to be compiled with it now. https://github.com/kpu/kenlm/issues/86
ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6', '-std=c++11']

if compile_test('zlib.h', 'z'):
    ARGS.append('-DHAVE_ZLIB')
    LIBS.append('z')

if compile_test('bzlib.h', 'bz2'):
    ARGS.append('-DHAVE_BZLIB')
    LIBS.append('bz2')

if compile_test('lzma.h', 'lzma'):
    ARGS.append('-DHAVE_XZLIB')
    LIBS.append('lzma')

ext_modules = [
    Extension(name='kenlm',
        sources=FILES + ['python/kenlm.cpp'],
        language='C++',
        include_dirs=['.'],
        libraries=LIBS,
        extra_compile_args=ARGS)
]

setup(
    name='kenlm',
    ext_modules=ext_modules,
    include_package_data=True,
)
-------------------------------------------------------------------------------- /kenlm/util/bit_packing.cc: --------------------------------------------------------------------------------
#include "util/bit_packing.hh"
#include "util/exception.hh"

#include <cstring>

namespace util {

namespace {
template <bool> struct StaticCheck {};
template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };

// If your float isn't 4 bytes, we're hosed.
typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;

} // namespace

uint8_t RequiredBits(uint64_t max_value) {
  if (!max_value) return 0;
  uint8_t ret = 1;
  while (max_value >>= 1) ++ret;
  return ret;
}

void BitPackingSanity() {
  const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
  if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
  char mem[57+8];
  memset(mem, 0, sizeof(mem));
  const uint64_t test57 = 0x123456789abcdefULL;
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    WriteInt57(mem, b, 57, test57);
  }
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1))
      UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler.");
  }
  // TODO: more checks.
}

} // namespace util
-------------------------------------------------------------------------------- /kenlm/util/bit_packing_test.cc: --------------------------------------------------------------------------------
#include "util/bit_packing.hh"

#define BOOST_TEST_MODULE BitPackingTest
#include <boost/test/unit_test.hpp>

#include <cstring>

namespace util {
namespace {

const uint64_t test57 = 0x123456789abcdefULL;
const uint32_t test25 = 0x1234567;

BOOST_AUTO_TEST_CASE(ZeroBit57) {
  char mem[16];
  memset(mem, 0, sizeof(mem));
  WriteInt57(mem, 0, 57, test57);
  BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
}

BOOST_AUTO_TEST_CASE(EachBit57) {
  char mem[16];
  for (uint8_t b = 0; b < 8; ++b) {
    memset(mem, 0, sizeof(mem));
    WriteInt57(mem, b, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
}

BOOST_AUTO_TEST_CASE(Consecutive57) {
  char mem[57+8];
  memset(mem, 0, sizeof(mem));
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    WriteInt57(mem, b, 57, test57);
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
  for (uint64_t b = 0; b < 57 * 8; b += 57) {
    BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
  }
}

BOOST_AUTO_TEST_CASE(Consecutive25) {
  char mem[25+8];
  memset(mem, 0, sizeof(mem));
  for (uint64_t b = 0; b < 25 * 8; b += 25) {
    WriteInt25(mem, b, 25, test25);
    BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
  }
  for (uint64_t b = 0; b < 25 * 8; b += 25) {
    BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
  }
}

BOOST_AUTO_TEST_CASE(Sanity) {
  BitPackingSanity();
}

} // namespace
} // namespace util
-------------------------------------------------------------------------------- /kenlm/util/cat_compressed_main.cc: --------------------------------------------------------------------------------
// Like cat but interprets compressed files.
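// Usage sketch: cat_compressed file1.gz file2.bz2 > out.txt, or run with no
// arguments to decompress stdin.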
#include "util/file.hh"
#include "util/read_compressed.hh"

#include <cstring>
#include <iostream>

namespace {
const std::size_t kBufSize = 16384;
void Copy(util::ReadCompressed &from, int to) {
  util::scoped_malloc buffer(util::MallocOrThrow(kBufSize));
  while (std::size_t amount = from.Read(buffer.get(), kBufSize)) {
    util::WriteOrThrow(to, buffer.get(), amount);
  }
}
} // namespace

int main(int argc, char *argv[]) {
  // Lane Schwartz likes -h and --help
  for (int i = 1; i < argc; ++i) {
    char *arg = argv[i];
    if (!strcmp(arg, "--")) break;
    if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
      std::cerr <<
        "A cat implementation that interprets compressed files.\n"
        "Usage: " << argv[0] << " [file1] [file2] ...\n"
        "If no file is provided, then stdin is read.\n";
      return 1;
    }
  }

  try {
    if (argc == 1) {
      util::ReadCompressed in(0);
      Copy(in, 1);
    } else {
      for (int i = 1; i < argc; ++i) {
        util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
        Copy(in, 1);
      }
    }
  } catch (const std::exception &e) {
    std::cerr << e.what() << std::endl;
    return 2;
  }
  return 0;
}
-------------------------------------------------------------------------------- /kenlm/util/double-conversion/CMakeLists.txt: --------------------------------------------------------------------------------
# This CMake file was created by Lane Schwartz

# Explicitly list the source files for this subdirectory
#
# If you add any source files to this subdirectory
# that should be included in the kenlm library,
# (this excludes any unit test files)
# you should add them to the following list:
#
# In order to allow CMake files in the parent directory
# to see this variable definition, we set PARENT_SCOPE.
#
# In order to set correct paths to these files
# when this variable is referenced by CMake files in the parent directory,
# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
#
set(KENLM_UTIL_DOUBLECONVERSION_SOURCE
  ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc
  ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc
  PARENT_SCOPE)

-------------------------------------------------------------------------------- /kenlm/util/double-conversion/LICENSE: --------------------------------------------------------------------------------
Copyright 2006-2011, the V8 project authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the following
      disclaimer in the documentation and/or other materials provided
      with the distribution.
    * Neither the name of Google Inc. nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
--------------------------------------------------------------------------------
/kenlm/util/double-conversion/strtod.h:
--------------------------------------------------------------------------------
1 | // Copyright 2010 the V8 project authors. All rights reserved.
2 | // Redistribution and use in source and binary forms, with or without
3 | // modification, are permitted provided that the following conditions are
4 | // met:
5 | //
6 | //     * Redistributions of source code must retain the above copyright
7 | //       notice, this list of conditions and the following disclaimer.
8 | //     * Redistributions in binary form must reproduce the above
9 | //       copyright notice, this list of conditions and the following
10 | //       disclaimer in the documentation and/or other materials provided
11 | //       with the distribution.
12 | //     * Neither the name of Google Inc. nor the names of its
13 | //       contributors may be used to endorse or promote products derived
14 | //       from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 
28 | #ifndef DOUBLE_CONVERSION_STRTOD_H_
29 | #define DOUBLE_CONVERSION_STRTOD_H_
30 | 
31 | #include "utils.h"
32 | 
33 | namespace double_conversion {
34 | 
35 | // The buffer must only contain digits in the range [0-9]. It must not
36 | // contain a dot or a sign. It must not start with '0', and must not be empty.
37 | double Strtod(Vector<const char> buffer, int exponent);
38 | 
39 | // The buffer must only contain digits in the range [0-9]. It must not
40 | // contain a dot or a sign. It must not start with '0', and must not be empty.
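// e.g. (editor's sketch of the digits-times-exponent convention shared by
// Strtod and Strtof, assuming the usual double-conversion semantics):
//   Strtod(Vector<const char>("12345", 5), -2) should yield 12345 * 10^-2 = 123.45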
41 | float Strtof(Vector<const char> buffer, int exponent);
42 | 
43 | } // namespace double_conversion
44 | 
45 | #endif // DOUBLE_CONVERSION_STRTOD_H_
46 | 
--------------------------------------------------------------------------------
/kenlm/util/ersatz_progress.cc:
--------------------------------------------------------------------------------
1 | #include "util/ersatz_progress.hh"
2 | 
3 | #include <algorithm>
4 | #include <iostream>
5 | #include <limits>
6 | #include <string>
7 | 
8 | namespace util {
9 | 
10 | namespace { const unsigned char kWidth = 100; }
11 | 
12 | const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
13 | 
14 | ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), out_(NULL) {}
15 | 
16 | ErsatzProgress::~ErsatzProgress() {
17 |   if (out_) Finished();
18 | }
19 | 
20 | ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
21 |   : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
22 |   if (!out_) {
23 |     next_ = std::numeric_limits<uint64_t>::max();
24 |     return;
25 |   }
26 |   if (!message.empty()) *out_ << message << '\n';
27 |   *out_ << kProgressBanner;
28 | }
29 | 
30 | void ErsatzProgress::Milestone() {
31 |   if (!out_) { current_ = 0; return; }
32 |   if (!complete_) return;
33 |   unsigned char stone = std::min(static_cast<uint64_t>(kWidth), (current_ * kWidth) / complete_);
34 | 
35 |   for (; stones_written_ < stone; ++stones_written_) {
36 |     (*out_) << '*';
37 |   }
38 |   if (stone == kWidth) {
39 |     (*out_) << std::endl;
40 |     next_ = std::numeric_limits<uint64_t>::max();
41 |     out_ = NULL;
42 |   } else {
43 |     next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
44 |   }
45 | }
46 | 
47 | } // namespace util
48 | 
--------------------------------------------------------------------------------
/kenlm/util/ersatz_progress.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_ERSATZ_PROGRESS_H
2 | #define UTIL_ERSATZ_PROGRESS_H
3 | 
4 | #include <iostream>
5 | #include <string>
6 | #include <stdint.h>
7 | 
8 | // Ersatz version of boost::progress so core language model doesn't depend on
9 | // boost.  Also adds option to print nothing.
10 | 
11 | namespace util {
12 | 
13 | extern const char kProgressBanner[];
14 | 
15 | class ErsatzProgress {
16 |   public:
17 |     // No output.
18 |     ErsatzProgress();
19 | 
20 |     // Null means no output.  The null value is useful for passing along the ostream pointer from another caller.
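    // A hypothetical call site (editor's sketch; total_records is assumed):
    //   ErsatzProgress bar(total_records, &std::cerr, "Scanning corpus");
    //   for (uint64_t i = 0; i < total_records; ++i, ++bar) { /* work */ }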
21 |     explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
22 | 
23 | #if __cplusplus >= 201103L
24 |     ErsatzProgress(ErsatzProgress &&from) noexcept : current_(from.current_), next_(from.next_), complete_(from.complete_), stones_written_(from.stones_written_), out_(from.out_) {
25 |       from.out_ = nullptr;
26 |       from.next_ = (uint64_t)-1;
27 |     }
28 | #endif
29 | 
30 |     ~ErsatzProgress();
31 | 
32 |     ErsatzProgress &operator++() {
33 |       if (++current_ >= next_) Milestone();
34 |       return *this;
35 |     }
36 | 
37 |     ErsatzProgress &operator+=(uint64_t amount) {
38 |       if ((current_ += amount) >= next_) Milestone();
39 |       return *this;
40 |     }
41 | 
42 |     void Set(uint64_t to) {
43 |       if ((current_ = to) >= next_) Milestone();
44 |     }
45 | 
46 |     void Finished() {
47 |       Set(complete_);
48 |     }
49 | 
50 |   private:
51 |     void Milestone();
52 | 
53 |     uint64_t current_, next_, complete_;
54 |     unsigned char stones_written_;
55 |     std::ostream *out_;
56 | 
57 |     // noncopyable
58 |     ErsatzProgress(const ErsatzProgress &other);
59 |     ErsatzProgress &operator=(const ErsatzProgress &other);
60 | };
61 | 
62 | } // namespace util
63 | 
64 | #endif // UTIL_ERSATZ_PROGRESS_H
65 | 
--------------------------------------------------------------------------------
/kenlm/util/float_to_string.cc:
--------------------------------------------------------------------------------
1 | #include "util/float_to_string.hh"
2 | 
3 | #include "util/double-conversion/double-conversion.h"
4 | #include "util/double-conversion/utils.h"
5 | 
6 | namespace util {
7 | namespace {
8 | const double_conversion::DoubleToStringConverter kConverter(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0);
9 | } // namespace
10 | 
11 | char *ToString(double value, char *to) {
12 |   double_conversion::StringBuilder builder(to, ToStringBuf<double>::kBytes);
13 |   kConverter.ToShortest(value, &builder);
14 |   return &to[builder.position()];
15 | }
16 | 
17 | char *ToString(float value, char *to) {
18 |   double_conversion::StringBuilder builder(to, ToStringBuf<float>::kBytes);
19 |   kConverter.ToShortestSingle(value, &builder);
20 |   return &to[builder.position()];
21 | }
22 | 
23 | } // namespace util
24 | 
--------------------------------------------------------------------------------
/kenlm/util/float_to_string.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_FLOAT_TO_STRING_H
2 | #define UTIL_FLOAT_TO_STRING_H
3 | 
4 | // Just for ToStringBuf
5 | #include "util/integer_to_string.hh"
6 | 
7 | namespace util {
8 | 
9 | template <> struct ToStringBuf<double> {
10 |   // DoubleToStringConverter::kBase10MaximalLength + 1 for null paranoia.
11 |   static const unsigned kBytes = 19;
12 | };
13 | 
14 | // Single wasn't documented in double conversion, so be conservative and
15 | // say the same as double.
16 | template <> struct ToStringBuf<float> {
17 |   static const unsigned kBytes = 19;
18 | };
19 | 
20 | char *ToString(double value, char *to);
21 | char *ToString(float value, char *to);
22 | 
23 | } // namespace util
24 | 
25 | #endif // UTIL_FLOAT_TO_STRING_H
26 | 
--------------------------------------------------------------------------------
/kenlm/util/getopt.c:
--------------------------------------------------------------------------------
1 | /*
2 | POSIX getopt for Windows
3 | 
4 | AT&T Public License
5 | 
6 | Code given out at the 1985 UNIFORUM conference in Dallas.
7 | */
8 | 
9 | #ifndef __GNUC__
10 | 
11 | #include "getopt.hh"
12 | #include <stdio.h>
13 | #include <string.h>
14 | 
15 | #define NULL 0
16 | #define EOF (-1)
17 | #define ERR(s, c) if(opterr){\
18 |     char errbuf[2];\
19 |     errbuf[0] = c; errbuf[1] = '\n';\
20 |     fputs(argv[0], stderr);\
21 |     fputs(s, stderr);\
22 |     fputc(c, stderr);}
23 | //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
24 | //(void) write(2, s, (unsigned)strlen(s));\
25 | //(void) write(2, errbuf, 2);}
26 | 
27 | int opterr = 1;
28 | int optind = 1;
29 | int optopt;
30 | char *optarg;
31 | 
32 | int
33 | getopt(argc, argv, opts)
34 | int argc;
35 | char **argv, *opts;
36 | {
37 |     static int sp = 1;
38 |     register int c;
39 |     register char *cp;
40 | 
41 |     if(sp == 1)
42 |         if(optind >= argc ||
43 |            argv[optind][0] != '-' || argv[optind][1] == '\0')
44 |             return(EOF);
45 |         else if(strcmp(argv[optind], "--") == NULL) {
46 |             optind++;
47 |             return(EOF);
48 |         }
49 |     optopt = c = argv[optind][sp];
50 |     if(c == ':' || (cp=strchr(opts, c)) == NULL) {
51 |         ERR(": illegal option -- ", c);
52 |         if(argv[optind][++sp] == '\0') {
53 |             optind++;
54 |             sp = 1;
55 |         }
56 |         return('?');
57 |     }
58 |     if(*++cp == ':') {
59 |         if(argv[optind][sp+1] != '\0')
60 |             optarg = &argv[optind++][sp+1];
61 |         else if(++optind >= argc) {
62 |             ERR(": option requires an argument -- ", c);
63 |             sp = 1;
64 |             return('?');
65 |         } else
66 |             optarg = argv[optind++];
67 |         sp = 1;
68 |     } else {
69 |         if(argv[optind][++sp] == '\0') {
70 |             sp = 1;
71 |             optind++;
72 |         }
73 |         optarg = NULL;
74 |     }
75 |     return(c);
76 | }
77 | 
78 | #endif  /* __GNUC__ */
79 | 
--------------------------------------------------------------------------------
/kenlm/util/getopt.hh:
--------------------------------------------------------------------------------
1 | /*
2 | POSIX getopt for Windows
3 | 
4 | AT&T Public License
5 | 
6 | Code given out at the 1985 UNIFORUM conference in Dallas.
7 | */
8 | 
9 | #ifdef __GNUC__
10 | #include <getopt.h>
11 | #endif
12 | #ifndef __GNUC__
13 | 
14 | #ifndef UTIL_GETOPT_H
15 | #define UTIL_GETOPT_H
16 | 
17 | #ifdef __cplusplus
18 | extern "C" {
19 | #endif
20 | 
21 | extern int opterr;
22 | extern int optind;
23 | extern int optopt;
24 | extern char *optarg;
25 | extern int getopt(int argc, char **argv, char *opts);
26 | 
27 | #ifdef __cplusplus
28 | }
29 | #endif
30 | 
31 | #endif  /* UTIL_GETOPT_H */
32 | #endif  /* __GNUC__ */
33 | 
34 | 
--------------------------------------------------------------------------------
/kenlm/util/have.hh:
--------------------------------------------------------------------------------
1 | /* Optional packages.  You might want to integrate this with your build system e.g. config.h from ./configure. */
2 | #ifndef UTIL_HAVE_H
3 | #define UTIL_HAVE_H
4 | 
5 | #ifdef HAVE_CONFIG_H
6 | #include "config.h"
7 | #endif
8 | 
9 | #ifndef HAVE_ICU
10 | //#define HAVE_ICU
11 | #endif
12 | 
13 | #endif // UTIL_HAVE_H
14 | 
--------------------------------------------------------------------------------
/kenlm/util/integer_to_string.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_INTEGER_TO_STRING_H
2 | #define UTIL_INTEGER_TO_STRING_H
3 | #include <cstddef>
4 | #include <stdint.h>
5 | 
6 | namespace util {
7 | 
8 | /* These functions convert integers to strings and return the end pointer.
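 * e.g. (editor's sketch, mirroring the unit test further below):
 *   char buf[ToStringBuf<uint32_t>::kBytes];
 *   char *end = ToString(value, buf);  // [buf, end) now holds the digits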
9 |  */
10 | char *ToString(uint32_t value, char *to);
11 | char *ToString(uint64_t value, char *to);
12 | 
13 | // Implemented as wrappers to above
14 | char *ToString(int32_t value, char *to);
15 | char *ToString(int64_t value, char *to);
16 | 
17 | // Calls the 32-bit versions for now.
18 | char *ToString(uint16_t value, char *to);
19 | char *ToString(int16_t value, char *to);
20 | 
21 | char *ToString(const void *value, char *to);
22 | 
23 | inline char *ToString(bool value, char *to) {
24 |   *to++ = '0' + value;
25 |   return to;
26 | }
27 | 
28 | // How many bytes to reserve in the buffer for these strings:
29 | // g++ 4.9.1 doesn't work with this:
30 | // static const std::size_t kBytes = 5;
31 | // So use enum.
32 | template <class T> struct ToStringBuf;
33 | template <> struct ToStringBuf<bool> {
34 |   enum { kBytes = 1 };
35 | };
36 | template <> struct ToStringBuf<uint16_t> {
37 |   enum { kBytes = 5 };
38 | };
39 | template <> struct ToStringBuf<int16_t> {
40 |   enum { kBytes = 6 };
41 | };
42 | template <> struct ToStringBuf<uint32_t> {
43 |   enum { kBytes = 10 };
44 | };
45 | template <> struct ToStringBuf<int32_t> {
46 |   enum { kBytes = 11 };
47 | };
48 | template <> struct ToStringBuf<uint64_t> {
49 |   enum { kBytes = 20 };
50 | };
51 | template <> struct ToStringBuf<int64_t> {
52 |   // Not a typo.  2^63 has 19 digits.
53 |   enum { kBytes = 20 };
54 | };
55 | 
56 | template <> struct ToStringBuf<const void*> {
57 |   // Either 18 on 64-bit or 10 on 32-bit.
58 |   enum { kBytes = sizeof(const void*) * 2 + 2 };
59 | };
60 | 
61 | // Maximum over this and float.
62 | enum { kToStringMaxBytes = 20 };
63 | 
64 | } // namespace util
65 | 
66 | #endif // UTIL_INTEGER_TO_STRING_H
67 | 
--------------------------------------------------------------------------------
/kenlm/util/integer_to_string_test.cc:
--------------------------------------------------------------------------------
1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
2 | #include "util/integer_to_string.hh"
3 | #include "util/string_piece.hh"
4 | 
5 | #define BOOST_TEST_MODULE IntegerToStringTest
6 | #include <boost/test/unit_test.hpp>
7 | #include <boost/lexical_cast.hpp>
8 | 
9 | #include <limits>
10 | 
11 | namespace util {
12 | namespace {
13 | 
14 | template <class T> void TestValue(const T value) {
15 |   char buf[ToStringBuf<T>::kBytes];
16 |   StringPiece result(buf, ToString(value, buf) - buf);
17 |   BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
18 |   if (value) {
19 |     BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
20 |   } else {
21 |     // Platforms can do void * as 0x0 or 0.
22 |     BOOST_CHECK(result == "0x0" || result == "0");
23 |   }
24 | }
25 | 
26 | template <class T> void TestCorners() {
27 |   TestValue(std::numeric_limits<T>::min());
28 |   TestValue(std::numeric_limits<T>::max());
29 |   TestValue((T)0);
30 |   TestValue((T)-1);
31 |   TestValue((T)1);
32 | }
33 | 
34 | BOOST_AUTO_TEST_CASE(Corners) {
35 |   TestCorners<char>();
36 |   TestCorners<int16_t>();
37 |   TestCorners<uint16_t>();
38 |   TestCorners<int32_t>();
39 |   TestCorners<uint32_t>();
40 |   TestCorners<int64_t>();
41 |   TestCorners<uint64_t>();
42 | }
43 | 
44 | template <class T> void TestAll() {
45 |   for (T i = std::numeric_limits<T>::min(); i < std::numeric_limits<T>::max(); ++i) {
46 |     TestValue(i);
47 |   }
48 |   TestValue(std::numeric_limits<T>::max());
49 | }
50 | 
51 | BOOST_AUTO_TEST_CASE(Short) {
52 |   TestAll<uint16_t>();
53 |   TestAll<int16_t>();
54 | }
55 | 
56 | template <class T> void Test10s() {
57 |   for (T i = 1; i < std::numeric_limits<T>::max() / 10; i *= 10) {
58 |     TestValue(i);
59 |     TestValue(i - 1);
60 |     TestValue(i + 1);
61 |   }
62 | }
63 | 
64 | BOOST_AUTO_TEST_CASE(Tens) {
65 |   Test10s<uint64_t>();
66 |   Test10s<int64_t>();
67 |   Test10s<uint32_t>();
68 |   Test10s<int32_t>();
69 | }
70 | 
71 | BOOST_AUTO_TEST_CASE(Pointers) {
72 |   for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
73 |     TestValue((const void*)i);
74 |   }
75 |   for (uintptr_t i = 0; i < 256; ++i) {
76 |     TestValue((const void*)i);
77 |     TestValue((const void*)(i + 0xf00));
78 |   }
79 | }
80 | 
81 | }} // namespaces
82 | 
--------------------------------------------------------------------------------
/kenlm/util/joint_sort_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/joint_sort.hh"
2 | 
3 | #define BOOST_TEST_MODULE JointSortTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util { namespace {
7 | 
8 | BOOST_AUTO_TEST_CASE(just_flip) {
9 |   char keys[2];
10 |   int values[2];
11 |   keys[0] = 1; values[0] = 327;
12 |   keys[1] = 0; values[1] = 87897;
13 |   JointSort(keys + 0, keys + 2, values + 0);
14 |   BOOST_CHECK_EQUAL(0, keys[0]);
15 |   BOOST_CHECK_EQUAL(87897, values[0]);
16 |   BOOST_CHECK_EQUAL(1, keys[1]);
17 |   BOOST_CHECK_EQUAL(327, values[1]);
18 | }
19 | 
20 | BOOST_AUTO_TEST_CASE(three) {
21 |   char keys[3];
22 |   int values[3];
23 |   keys[0] = 1; values[0] = 327;
24 |   keys[1] = 2; values[1] = 87897;
25 |   keys[2] = 0; values[2] = 10;
26 |   JointSort(keys + 0, keys + 3, values + 0);
27 |   BOOST_CHECK_EQUAL(0, keys[0]);
28 |   BOOST_CHECK_EQUAL(1, keys[1]);
29 |   BOOST_CHECK_EQUAL(2, keys[2]);
30 | }
31 | 
32 | BOOST_AUTO_TEST_CASE(char_int) {
33 |   char keys[4];
34 |   int values[4];
35 |   keys[0] = 3; values[0] = 327;
36 |   keys[1] = 1; values[1] = 87897;
37 |   keys[2] = 2; values[2] = 10;
38 |   keys[3] = 0; values[3] = 24347;
39 |   JointSort(keys + 0, keys + 4, values + 0);
40 |   BOOST_CHECK_EQUAL(0, keys[0]);
41 |   BOOST_CHECK_EQUAL(24347, values[0]);
42 |   BOOST_CHECK_EQUAL(1, keys[1]);
43 |   BOOST_CHECK_EQUAL(87897, values[1]);
44 |   BOOST_CHECK_EQUAL(2, keys[2]);
45 |   BOOST_CHECK_EQUAL(10, values[2]);
46 |   BOOST_CHECK_EQUAL(3, keys[3]);
47 |   BOOST_CHECK_EQUAL(327, values[3]);
48 | }
49 | 
50 | BOOST_AUTO_TEST_CASE(swap_proxy) {
51 |   char keys[2] = {0, 1};
52 |   int values[2] = {2, 3};
53 |   detail::JointProxy<char*, int*> first(keys, values);
54 |   detail::JointProxy<char*, int*> second(keys + 1, values + 1);
55 |   swap(first, second);
56 |   BOOST_CHECK_EQUAL(1, keys[0]);
57 |   BOOST_CHECK_EQUAL(0, keys[1]);
58 |   BOOST_CHECK_EQUAL(3, values[0]);
59 |   BOOST_CHECK_EQUAL(2, values[1]);
60 | }
61 | 
62 | }} // namespace anonymous util
63 | 
--------------------------------------------------------------------------------
/kenlm/util/multi_intersection_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/multi_intersection.hh"
2 | 
3 | #define BOOST_TEST_MODULE MultiIntersectionTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util {
7 | namespace {
8 | 
9 | BOOST_AUTO_TEST_CASE(Empty) {
10 |   std::vector<boost::iterator_range<const unsigned int*> > sets;
11 | 
12 |   sets.push_back(boost::iterator_range<const unsigned int*>(static_cast<const unsigned int*>(NULL), static_cast<const unsigned int*>(NULL)));
13 |   BOOST_CHECK(!FirstIntersection(sets));
14 | }
15 | 
16 | BOOST_AUTO_TEST_CASE(Single) {
17 |   std::vector<unsigned int> nums;
18 |   nums.push_back(1);
19 |   nums.push_back(4);
20 |   nums.push_back(100);
21 |   std::vector<boost::iterator_range<std::vector<unsigned int>::const_iterator> > sets;
22 |   sets.push_back(nums);
23 | 
24 |   boost::optional<unsigned int> ret(FirstIntersection(sets));
25 | 
26 |   BOOST_REQUIRE(ret);
27 |   BOOST_CHECK_EQUAL(static_cast<unsigned int>(1), *ret);
28 | }
29 | 
30 | template <class T, unsigned int len> boost::iterator_range<const T*> RangeFromArray(const T (&arr)[len]) {
31 |   return boost::iterator_range<const T*>(arr, arr + len);
32 | }
33 | 
34 | BOOST_AUTO_TEST_CASE(MultiNone) {
35 |   unsigned int nums0[] = {1, 3, 4, 22};
36 |   unsigned int nums1[] = {2, 5, 12};
37 |   unsigned int nums2[] = {4, 17};
38 | 
39 |   std::vector<boost::iterator_range<const unsigned int*> > sets;
40 |   sets.push_back(RangeFromArray(nums0));
41 |   sets.push_back(RangeFromArray(nums1));
42 |   sets.push_back(RangeFromArray(nums2));
43 | 
44 |   BOOST_CHECK(!FirstIntersection(sets));
45 | }
46 | 
47 | BOOST_AUTO_TEST_CASE(MultiOne) {
48 |   unsigned int nums0[] = {1, 3, 4, 17, 22};
49 |   unsigned int nums1[] = {2, 5, 12, 17};
50 |   unsigned int nums2[] = {4, 17};
51 | 
52 |   std::vector<boost::iterator_range<const unsigned int*> > sets;
53 |   sets.push_back(RangeFromArray(nums0));
54 |   sets.push_back(RangeFromArray(nums1));
55 |   sets.push_back(RangeFromArray(nums2));
56 | 
57 |   boost::optional<unsigned int> ret(FirstIntersection(sets));
58 |   BOOST_REQUIRE(ret);
59 |   BOOST_CHECK_EQUAL(static_cast<unsigned int>(17), *ret);
60 | }
61 | 
62 | } // namespace
63 | } // namespace util
64 | 
--------------------------------------------------------------------------------
/kenlm/util/murmur_hash.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_MURMUR_HASH_H
2 | #define UTIL_MURMUR_HASH_H
3 | #include <cstddef>
4 | #include <stdint.h>
5 | 
6 | namespace util {
7 | 
8 | // 64-bit machine version
9 | uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0);
10 | // 32-bit machine version (not the same function as above)
11 | uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0);
12 | // Use the version for this arch.  Because the values differ across
13 | // architectures, really only use it for in-memory structures.
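// e.g. (editor's sketch; str is assumed to be a std::string):
//   uint64_t h = util::MurmurHashNative(str.data(), str.size());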
14 | uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0);
15 | 
16 | } // namespace util
17 | 
18 | #endif // UTIL_MURMUR_HASH_H
19 | 
--------------------------------------------------------------------------------
/kenlm/util/parallel_read.cc:
--------------------------------------------------------------------------------
1 | #include "util/parallel_read.hh"
2 | 
3 | #include "util/file.hh"
4 | 
5 | #ifdef WITH_THREADS
6 | #include "util/thread_pool.hh"
7 | 
8 | namespace util {
9 | namespace {
10 | 
11 | class Reader {
12 |   public:
13 |     explicit Reader(int fd) : fd_(fd) {}
14 | 
15 |     struct Request {
16 |       void *to;
17 |       std::size_t size;
18 |       uint64_t offset;
19 | 
20 |       bool operator==(const Request &other) const {
21 |         return (to == other.to) && (size == other.size) && (offset == other.offset);
22 |       }
23 |     };
24 | 
25 |     void operator()(const Request &request) {
26 |       util::ErsatzPRead(fd_, request.to, request.size, request.offset);
27 |     }
28 | 
29 |   private:
30 |     int fd_;
31 | };
32 | 
33 | } // namespace
34 | 
35 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) {
36 |   Reader::Request poison;
37 |   poison.to = NULL;
38 |   poison.size = 0;
39 |   poison.offset = 0;
40 |   unsigned threads = boost::thread::hardware_concurrency();
41 |   if (!threads) threads = 2;
42 |   ThreadPool<Reader> pool(2 /* don't need much of a queue */, threads, fd, poison);
43 |   const std::size_t kBatch = 1ULL << 25; // 32 MB
44 |   Reader::Request request;
45 |   request.to = to;
46 |   request.size = kBatch;
47 |   request.offset = offset;
48 |   for (; amount > kBatch; amount -= kBatch) {
49 |     pool.Produce(request);
50 |     request.to = reinterpret_cast<uint8_t*>(request.to) + kBatch;
51 |     request.offset += kBatch;
52 |   }
53 |   request.size = amount;
54 |   if (request.size) {
55 |     pool.Produce(request);
56 |   }
57 | }
58 | 
59 | } // namespace util
60 | 
61 | #else // WITH_THREADS
62 | 
63 | namespace util {
64 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset) {
65 |   util::ErsatzPRead(fd, to, amount, offset);
66 | }
67 | } // namespace util
68 | 
69 | #endif
70 | 
--------------------------------------------------------------------------------
/kenlm/util/parallel_read.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_PARALLEL_READ__
2 | #define UTIL_PARALLEL_READ__
3 | 
4 | /* Read pieces of a file in parallel.  This has a very specific use case:
5 |  * reading files from Lustre is CPU bound so multiple threads actually
6 |  * increases throughput.  Speed matters when an LM takes a terabyte.
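 *
 * Sketch of a call (editor's example; the fd must support pread, and the
 * buffer must already be at least bytes_wanted long):
 *   util::ParallelRead(fd, buffer, bytes_wanted, start_offset);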
7 |  */
8 | 
9 | #include <cstddef>
10 | #include <stdint.h>
11 | 
12 | namespace util {
13 | void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset);
14 | } // namespace util
15 | 
16 | #endif // UTIL_PARALLEL_READ__
17 | 
--------------------------------------------------------------------------------
/kenlm/util/pcqueue_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/pcqueue.hh"
2 | 
3 | #define BOOST_TEST_MODULE PCQueueTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util {
7 | namespace {
8 | 
9 | BOOST_AUTO_TEST_CASE(SingleThread) {
10 |   PCQueue<int> queue(10);
11 |   for (int i = 0; i < 10; ++i) {
12 |     queue.Produce(i);
13 |   }
14 |   for (int i = 0; i < 10; ++i) {
15 |     BOOST_CHECK_EQUAL(i, queue.Consume());
16 |   }
17 | }
18 | 
19 | }
20 | } // namespace util
21 | 
--------------------------------------------------------------------------------
/kenlm/util/pool.cc:
--------------------------------------------------------------------------------
1 | #include "util/pool.hh"
2 | 
3 | #include "util/scoped.hh"
4 | 
5 | #include <cstdlib>
6 | 
7 | #include <stdint.h>
8 | 
9 | namespace util {
10 | 
11 | Pool::Pool() {
12 |   current_ = NULL;
13 |   current_end_ = NULL;
14 | }
15 | 
16 | Pool::~Pool() {
17 |   FreeAll();
18 | }
19 | 
20 | void Pool::FreeAll() {
21 |   for (std::vector<void *>::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) {
22 |     free(*i);
23 |   }
24 |   free_list_.clear();
25 |   current_ = NULL;
26 |   current_end_ = NULL;
27 | }
28 | 
29 | void *Pool::More(std::size_t size) {
30 |   std::size_t amount = std::max(static_cast<std::size_t>(32) << free_list_.size(), size);
31 |   uint8_t *ret = static_cast<uint8_t*>(MallocOrThrow(amount));
32 |   free_list_.push_back(ret);
33 |   current_ = ret + size;
34 |   current_end_ = ret + amount;
35 |   return ret;
36 | }
37 | 
38 | } // namespace util
39 | 
--------------------------------------------------------------------------------
/kenlm/util/read_compressed.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_READ_COMPRESSED_H
2 | #define UTIL_READ_COMPRESSED_H
3 | 
4 | #include "util/exception.hh"
5 | #include "util/scoped.hh"
6 | 
7 | #include <cstddef>
8 | #include <stdint.h>
9 | 
10 | namespace util {
11 | 
12 | class CompressedException : public Exception {
13 |   public:
14 |     CompressedException() throw();
15 |     virtual ~CompressedException() throw();
16 | };
17 | 
18 | class GZException : public CompressedException {
19 |   public:
20 |     GZException() throw();
21 |     ~GZException() throw();
22 | };
23 | 
24 | class BZException : public CompressedException {
25 |   public:
26 |     BZException() throw();
27 |     ~BZException() throw();
28 | };
29 | 
30 | class XZException : public CompressedException {
31 |   public:
32 |     XZException() throw();
33 |     ~XZException() throw();
34 | };
35 | 
36 | class ReadCompressed;
37 | 
38 | class ReadBase {
39 |   public:
40 |     virtual ~ReadBase() {}
41 | 
42 |     virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0;
43 | 
44 |   protected:
45 |     static void ReplaceThis(ReadBase *with, ReadCompressed &thunk);
46 | 
47 |     ReadBase *Current(ReadCompressed &thunk);
48 | 
49 |     static uint64_t &ReadCount(ReadCompressed &thunk);
50 | };
51 | 
52 | class ReadCompressed {
53 |   public:
54 |     static const std::size_t kMagicSize = 6;
55 |     // Must have at least kMagicSize bytes.
56 |     static bool DetectCompressedMagic(const void *from);
57 | 
58 |     // Takes ownership of fd.
59 |     explicit ReadCompressed(int fd);
60 | 
61 |     // Try to avoid using this.  Use the fd instead.
62 |     // There is no decompression support for istreams.
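    // (Editor's sketch of the preferred fd-based use; the file name is
    //  hypothetical, and OpenReadOrThrow comes from util/file.hh:
    //    util::ReadCompressed in(util::OpenReadOrThrow("corpus.txt.gz"));
    //    while (std::size_t got = in.Read(buf, sizeof(buf))) { /* consume */ } )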
63 |     explicit ReadCompressed(std::istream &in);
64 | 
65 |     // Must call Reset later.
66 |     ReadCompressed();
67 | 
68 |     // Takes ownership of fd.
69 |     void Reset(int fd);
70 | 
71 |     // Same advice as the constructor.
72 |     void Reset(std::istream &in);
73 | 
74 |     std::size_t Read(void *to, std::size_t amount);
75 | 
76 |     // Repeatedly call read to fill a buffer unless EOF is hit.
77 |     // Return number of bytes read.
78 |     std::size_t ReadOrEOF(void *const to, std::size_t amount);
79 | 
80 |     uint64_t RawAmount() const { return raw_amount_; }
81 | 
82 |   private:
83 |     friend class ReadBase;
84 | 
85 |     scoped_ptr<ReadBase> internal_;
86 | 
87 |     uint64_t raw_amount_;
88 | };
89 | 
90 | } // namespace util
91 | 
92 | #endif // UTIL_READ_COMPRESSED_H
93 | 
--------------------------------------------------------------------------------
/kenlm/util/scoped.cc:
--------------------------------------------------------------------------------
1 | #include "util/scoped.hh"
2 | 
3 | #include <cstdlib>
4 | #if !defined(_WIN32) && !defined(_WIN64)
5 | #include <sys/mman.h>
6 | #endif
7 | 
8 | namespace util {
9 | 
10 | // TODO: if we're really under memory pressure, don't allocate memory to
11 | // display the error.
12 | MallocException::MallocException(std::size_t requested) throw() {
13 |   *this << "for " << requested << " bytes ";
14 | }
15 | 
16 | MallocException::~MallocException() throw() {}
17 | 
18 | namespace {
19 | void *InspectAddr(void *addr, std::size_t requested, const char *func_name) {
20 |   UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name);
21 |   return addr;
22 | }
23 | } // namespace
24 | 
25 | void *MallocOrThrow(std::size_t requested) {
26 |   return InspectAddr(std::malloc(requested), requested, "malloc");
27 | }
28 | 
29 | void *CallocOrThrow(std::size_t requested) {
30 |   return InspectAddr(std::calloc(requested, 1), requested, "calloc");
31 | }
32 | 
33 | void scoped_malloc::call_realloc(std::size_t requested) {
34 |   p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc");
35 | }
36 | 
37 | void AdviseHugePages(const void *addr, std::size_t size) {
38 | #if MADV_HUGEPAGE
39 |   madvise((void*)addr, size, MADV_HUGEPAGE);
40 | #endif
41 | }
42 | 
43 | } // namespace util
44 | 
--------------------------------------------------------------------------------
/kenlm/util/sized_iterator_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/sized_iterator.hh"
2 | 
3 | #define BOOST_TEST_MODULE SizedIteratorTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | namespace util { namespace {
7 | 
8 | struct CompareChar {
9 |   bool operator()(const void *first, const void *second) const {
10 |     return *static_cast<const char*>(first) < *static_cast<const char*>(second);
11 |   }
12 | };
13 | 
14 | BOOST_AUTO_TEST_CASE(sort) {
15 |   char items[3] = {1, 2, 0};
16 |   SizedSort(items, items + 3, 1, CompareChar());
17 |   BOOST_CHECK_EQUAL(0, items[0]);
18 |   BOOST_CHECK_EQUAL(1, items[1]);
19 |   BOOST_CHECK_EQUAL(2, items[2]);
20 | }
21 | 
22 | }} // namespace anonymous util
23 | 
--------------------------------------------------------------------------------
/kenlm/util/spaces.cc:
--------------------------------------------------------------------------------
1 | #include "util/spaces.hh"
2 | 
3 | namespace util {
4 | 
5 | // Sigh this is the only way I could come up with to do a _const_ bool.  It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
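// Typical lookup (editor's sketch): true exactly for those six characters, e.g.
//   bool is_space = util::kSpaces[static_cast<unsigned char>(c)];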
6 | const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
7 | 
8 | } // namespace util
9 | 
--------------------------------------------------------------------------------
/kenlm/util/spaces.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_SPACES_H
2 | #define UTIL_SPACES_H
3 | 
4 | // bool array of spaces.
5 | 
6 | namespace util {
7 | 
8 | extern const bool kSpaces[256];
9 | 
10 | } // namespace util
11 | 
12 | #endif // UTIL_SPACES_H
13 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # This CMake file was created by Lane Schwartz
2 | 
3 | # Explicitly list the source files for this subdirectory
4 | #
5 | # If you add any source files to this subdirectory
6 | # that should be included in the kenlm library,
7 | # (this excludes any unit test files)
8 | # you should add them to the following list:
9 | #
10 | # In order to allow CMake files in the parent directory
11 | # to see this variable definition, we set PARENT_SCOPE.
12 | #
13 | # In order to set correct paths to these files
14 | # when this variable is referenced by CMake files in the parent directory,
15 | # we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
16 | #
17 | set(KENLM_UTIL_STREAM_SOURCE
18 |     ${CMAKE_CURRENT_SOURCE_DIR}/chain.cc
19 |     ${CMAKE_CURRENT_SOURCE_DIR}/count_records.cc
20 |     ${CMAKE_CURRENT_SOURCE_DIR}/io.cc
21 |     ${CMAKE_CURRENT_SOURCE_DIR}/line_input.cc
22 |     ${CMAKE_CURRENT_SOURCE_DIR}/multi_progress.cc
23 |     ${CMAKE_CURRENT_SOURCE_DIR}/rewindable_stream.cc
24 |   PARENT_SCOPE)
25 | 
26 | 
27 | 
28 | if(BUILD_TESTING)
29 |   # Explicitly list the Boost test files to be compiled
30 |   set(KENLM_BOOST_TESTS_LIST
31 |     io_test
32 |     sort_test
33 |     stream_test
34 |     rewindable_stream_test
35 |   )
36 | 
37 |   AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
38 |            LIBRARIES kenlm_util ${Boost_LIBRARIES} pthread)
39 | endif()
40 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/config.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_CONFIG_H
2 | #define UTIL_STREAM_CONFIG_H
3 | 
4 | #include <cstddef>
5 | #include <string>
6 | 
7 | namespace util { namespace stream {
8 | 
9 | /**
10 |  * Represents how a chain should be configured.
11 |  */
12 | struct ChainConfig {
13 | 
14 |   /** Constructs a configuration with underspecified (or default) parameters. */
15 |   ChainConfig() {}
16 | 
17 |   /**
18 |    * Constructs a chain configuration object.
19 |    *
20 |    * @param [in] in_entry_size    Number of bytes in each record.
21 |    * @param [in] in_block_count   Number of blocks in the chain.
22 |    * @param [in] in_total_memory  Total number of bytes available to the chain.
23 |    *                              This value will be divided amongst the blocks in the chain.
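   *
   * Worked example (editor's sketch): in_total_memory = 1 MB with
   * in_block_count = 4 leaves roughly 256 KB per block, with total_memory
   * rounded to a multiple of in_entry_size as noted below.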
24 |    */
25 |   ChainConfig(std::size_t in_entry_size, std::size_t in_block_count, std::size_t in_total_memory)
26 |     : entry_size(in_entry_size), block_count(in_block_count), total_memory(in_total_memory) {}
27 | 
28 |   /**
29 |    * Number of bytes in each record.
30 |    */
31 |   std::size_t entry_size;
32 | 
33 |   /**
34 |    * Number of blocks in the chain.
35 |    */
36 |   std::size_t block_count;
37 | 
38 |   /**
39 |    * Total number of bytes available to the chain.
40 |    * This value will be divided amongst the blocks in the chain.
41 |    * Chain's constructor will make this a multiple of entry_size.
42 |    */
43 |   std::size_t total_memory;
44 | };
45 | 
46 | 
47 | /**
48 |  * Represents how a sorter should be configured.
49 |  */
50 | struct SortConfig {
51 | 
52 |   /** Filename prefix where temporary files should be placed. */
53 |   std::string temp_prefix;
54 | 
55 |   /** Size of each input/output buffer. */
56 |   std::size_t buffer_size;
57 | 
58 |   /** Total memory to use when running alone. */
59 |   std::size_t total_memory;
60 | };
61 | 
62 | }} // namespaces
63 | #endif // UTIL_STREAM_CONFIG_H
64 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/count_records.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/count_records.hh"
2 | #include "util/stream/chain.hh"
3 | 
4 | namespace util { namespace stream {
5 | 
6 | void CountRecords::Run(const ChainPosition &position) {
7 |   for (Link link(position); link; ++link) {
8 |     *count_ += link->ValidSize() / position.GetChain().EntrySize();
9 |   }
10 | }
11 | 
12 | }} // namespaces
13 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/count_records.hh:
--------------------------------------------------------------------------------
1 | #include <stdint.h>
2 | 
3 | namespace util { namespace stream {
4 | 
5 | class ChainPosition;
6 | 
7 | class CountRecords {
8 |   public:
9 |     explicit CountRecords(uint64_t *out)
10 |       : count_(out) {
11 |       *count_ = 0;
12 |     }
13 | 
14 |     void Run(const ChainPosition &position);
15 | 
16 |   private:
17 |     uint64_t *count_;
18 | };
19 | 
20 | }} // namespaces
21 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/io.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_IO_H
2 | #define UTIL_STREAM_IO_H
3 | 
4 | #include "util/exception.hh"
5 | #include "util/file.hh"
6 | 
7 | namespace util {
8 | namespace stream {
9 | 
10 | class ChainPosition;
11 | 
12 | class ReadSizeException : public util::Exception {
13 |   public:
14 |     ReadSizeException() throw();
15 |     ~ReadSizeException() throw();
16 | };
17 | 
18 | class Read {
19 |   public:
20 |     explicit Read(int fd) : file_(fd) {}
21 |     void Run(const ChainPosition &position);
22 |   private:
23 |     int file_;
24 | };
25 | 
26 | // Like read but uses pread so that the file can be accessed from multiple threads.
27 | class PRead {
28 |   public:
29 |     explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {}
30 |     void Run(const ChainPosition &position);
31 |   private:
32 |     int file_;
33 |     bool own_;
34 | };
35 | 
36 | class Write {
37 |   public:
38 |     explicit Write(int fd) : file_(fd) {}
39 |     void Run(const ChainPosition &position);
40 |   private:
41 |     int file_;
42 | };
43 | 
44 | // It's a common case that stuff is written and then recycled.  So rather than
45 | // spawn another thread to Recycle, this combines the two roles.
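// (Editor's sketch, modeled on io_test.cc further below; the fds are
//  hypothetical:
//    chain >> Read(in_fd) >> /* workers */ >> WriteAndRecycle(out_fd); )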
46 | class WriteAndRecycle {
47 |   public:
48 |     explicit WriteAndRecycle(int fd) : file_(fd) {}
49 |     void Run(const ChainPosition &position);
50 |   private:
51 |     int file_;
52 | };
53 | 
54 | class PWrite {
55 |   public:
56 |     explicit PWrite(int fd) : file_(fd) {}
57 |     void Run(const ChainPosition &position);
58 |   private:
59 |     int file_;
60 | };
61 | 
62 | 
63 | // Reuse the same file over and over again to buffer output.
64 | class FileBuffer {
65 |   public:
66 |     explicit FileBuffer(int fd) : file_(fd) {}
67 | 
68 |     PWrite Sink() const {
69 |       util::SeekOrThrow(file_.get(), 0);
70 |       return PWrite(file_.get());
71 |     }
72 | 
73 |     PRead Source(bool discard = false) {
74 |       return PRead(discard ? file_.release() : file_.get(), discard);
75 |     }
76 | 
77 |     uint64_t Size() const {
78 |       return SizeOrThrow(file_.get());
79 |     }
80 | 
81 |   private:
82 |     scoped_fd file_;
83 | };
84 | 
85 | } // namespace stream
86 | } // namespace util
87 | #endif // UTIL_STREAM_IO_H
88 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/io_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/io.hh"
2 | 
3 | #include "util/stream/chain.hh"
4 | #include "util/file.hh"
5 | 
6 | #define BOOST_TEST_MODULE IOTest
7 | #include <boost/test/unit_test.hpp>
8 | 
9 | #include <unistd.h>
10 | 
11 | namespace util { namespace stream { namespace {
12 | 
13 | BOOST_AUTO_TEST_CASE(CopyFile) {
14 |   std::string temps("io_test_temp");
15 | 
16 |   scoped_fd in(MakeTemp(temps));
17 |   for (uint64_t i = 0; i < 100000; ++i) {
18 |     WriteOrThrow(in.get(), &i, sizeof(uint64_t));
19 |   }
20 |   SeekOrThrow(in.get(), 0);
21 |   scoped_fd out(MakeTemp(temps));
22 | 
23 |   ChainConfig config;
24 |   config.entry_size = 8;
25 |   config.total_memory = 1024;
26 |   config.block_count = 10;
27 | 
28 |   Chain(config) >> PRead(in.get()) >> Write(out.get());
29 | 
30 |   SeekOrThrow(out.get(), 0);
31 |   for (uint64_t i = 0; i < 100000; ++i) {
32 |     uint64_t got;
33 |     ReadOrThrow(out.get(), &got, sizeof(uint64_t));
34 |     BOOST_CHECK_EQUAL(i, got);
35 |   }
36 | }
37 | 
38 | }}} // namespaces
39 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/line_input.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/line_input.hh"
2 | 
3 | #include "util/exception.hh"
4 | #include "util/file.hh"
5 | #include "util/read_compressed.hh"
6 | #include "util/stream/chain.hh"
7 | 
8 | #include <algorithm>
9 | #include <vector>
10 | 
11 | namespace util { namespace stream {
12 | 
13 | void LineInput::Run(const ChainPosition &position) {
14 |   ReadCompressed reader(fd_);
15 |   // Holding area for beginning of line to be placed in next block.
16 |   std::vector<char> carry;
17 | 
18 |   for (Link block(position); ; ++block) {
19 |     char *to = static_cast<char*>(block->Get());
20 |     char *begin = to;
21 |     char *end = to + position.GetChain().BlockSize();
22 |     std::copy(carry.begin(), carry.end(), to);
23 |     to += carry.size();
24 |     while (to != end) {
25 |       std::size_t got = reader.Read(to, end - to);
26 |       if (!got) {
27 |         // EOF
28 |         block->SetValidSize(to - begin);
29 |         ++block;
30 |         block.Poison();
31 |         return;
32 |       }
33 |       to += got;
34 |     }
35 | 
36 |     // Find the last newline.
37 |     char *newline;
38 |     for (newline = to - 1; ; --newline) {
39 |       UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ".  Is this a text file?");
40 |       if (*newline == '\n') break;
41 |     }
42 | 
43 |     // Copy everything after the last newline to the carry.
44 |     carry.clear();
45 |     carry.resize(to - (newline + 1));
46 |     std::copy(newline + 1, to, &*carry.begin());
47 | 
48 |     block->SetValidSize(newline + 1 - begin);
49 |   }
50 | }
51 | 
52 | }} // namespaces
53 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/line_input.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_LINE_INPUT_H
2 | #define UTIL_STREAM_LINE_INPUT_H
3 | namespace util { namespace stream {
4 | 
5 | class ChainPosition;
6 | 
7 | /* Worker that reads input into blocks, ensuring that blocks contain whole
8 |  * lines.  Assumes that the maximum size of a line is less than the block size.
9 |  */
10 | class LineInput {
11 |   public:
12 |     // Takes ownership upon thread execution.
13 |     explicit LineInput(int fd);
14 | 
15 |     void Run(const ChainPosition &position);
16 | 
17 |   private:
18 |     int fd_;
19 | };
20 | 
21 | }} // namespaces
22 | #endif // UTIL_STREAM_LINE_INPUT_H
23 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/multi_progress.hh:
--------------------------------------------------------------------------------
1 | /* Progress bar suitable for chains of workers */
2 | #ifndef UTIL_STREAM_MULTI_PROGRESS_H
3 | #define UTIL_STREAM_MULTI_PROGRESS_H
4 | 
5 | #include <boost/thread/mutex.hpp>
6 | 
7 | #include <cstddef>
8 | #include <stdint.h>
9 | 
10 | namespace util { namespace stream {
11 | 
12 | class WorkerProgress;
13 | 
14 | class MultiProgress {
15 |   public:
16 |     static const unsigned char kWidth = 100;
17 | 
18 |     MultiProgress();
19 | 
20 |     ~MultiProgress();
21 | 
22 |     // Turns on showing (requires SetTarget too).
23 |     void Activate();
24 | 
25 |     void SetTarget(uint64_t complete);
26 | 
27 |     WorkerProgress Add();
28 | 
29 |     void Finished();
30 | 
31 |   private:
32 |     friend class WorkerProgress;
33 |     void Milestone(WorkerProgress &worker);
34 | 
35 |     bool active_;
36 | 
37 |     uint64_t complete_;
38 | 
39 |     boost::mutex mutex_;
40 | 
41 |     // \0 at the end.
42 |     char display_[kWidth + 1];
43 | 
44 |     std::size_t character_handout_;
45 | 
46 |     MultiProgress(const MultiProgress &);
47 |     MultiProgress &operator=(const MultiProgress &);
48 | };
49 | 
50 | class WorkerProgress {
51 |   public:
52 |     // Default constructor must be initialized with operator= later.
53 |     WorkerProgress() : parent_(NULL) {}
54 | 
55 |     // Not threadsafe for the same worker by default.
56 |     WorkerProgress &operator++() {
57 |       if (++current_ >= next_) {
58 |         parent_->Milestone(*this);
59 |       }
60 |       return *this;
61 |     }
62 | 
63 |     WorkerProgress &operator+=(uint64_t amount) {
64 |       current_ += amount;
65 |       if (current_ >= next_) {
66 |         parent_->Milestone(*this);
67 |       }
68 |       return *this;
69 |     }
70 | 
71 |   private:
72 |     friend class MultiProgress;
73 |     WorkerProgress(uint64_t next, MultiProgress &parent, char character)
74 |       : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {}
75 | 
76 |     uint64_t current_, next_;
77 | 
78 |     MultiProgress *parent_;
79 | 
80 |     // Previous milestone reached.
81 |     unsigned char stone_;
82 | 
83 |     // Character to display in bar.
84 |     char character_;
85 | };
86 | 
87 | }} // namespaces
88 | 
89 | #endif // UTIL_STREAM_MULTI_PROGRESS_H
90 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/rewindable_stream_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/io.hh"
2 | 
3 | #include "util/stream/rewindable_stream.hh"
4 | #include "util/file.hh"
5 | 
6 | #define BOOST_TEST_MODULE RewindableStreamTest
7 | #include <boost/test/unit_test.hpp>
8 | 
9 | namespace util {
10 | namespace stream {
11 | namespace {
12 | 
13 | BOOST_AUTO_TEST_CASE(RewindableStreamTest) {
14 |   scoped_fd in(MakeTemp("io_test_temp"));
15 |   for (uint64_t i = 0; i < 100000; ++i) {
16 |     WriteOrThrow(in.get(), &i, sizeof(uint64_t));
17 |   }
18 |   SeekOrThrow(in.get(), 0);
19 | 
20 |   ChainConfig config;
21 |   config.entry_size = 8;
22 |   config.total_memory = 100;
23 |   config.block_count = 6;
24 | 
25 |   Chain chain(config);
26 |   RewindableStream s;
27 |   chain >> Read(in.get()) >> s >> kRecycle;
28 |   uint64_t i = 0;
29 |   for (; s; ++s, ++i) {
30 |     BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(s.Get()));
31 |     if (100000UL - i == 2)
32 |       s.Mark();
33 |   }
34 |   BOOST_CHECK_EQUAL(100000ULL, i);
35 |   s.Rewind();
36 |   BOOST_CHECK_EQUAL(100000ULL - 2, *static_cast<const uint64_t*>(s.Get()));
37 | }
38 | 
39 | }
40 | }
41 | }
42 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/sort_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/sort.hh"
2 | 
3 | #define BOOST_TEST_MODULE SortTest
4 | #include <boost/test/unit_test.hpp>
5 | 
6 | #include <algorithm>
7 | 
8 | #include <unistd.h>
9 | 
10 | namespace util { namespace stream { namespace {
11 | 
12 | struct CompareUInt64 : public std::binary_function<const void *, const void *, bool> {
13 |   bool operator()(const void *first, const void *second) const {
14 |     return *static_cast<const uint64_t*>(first) < *reinterpret_cast<const uint64_t*>(second);
15 |   }
16 | };
17 | 
18 | const uint64_t kSize = 100000;
19 | 
20 | struct Putter {
21 |   Putter(std::vector<uint64_t> &shuffled) : shuffled_(shuffled) {}
22 | 
23 |   void Run(const ChainPosition &position) {
24 |     Stream put_shuffled(position);
25 |     for (uint64_t i = 0; i < shuffled_.size(); ++i, ++put_shuffled) {
26 |       *static_cast<uint64_t*>(put_shuffled.Get()) = shuffled_[i];
27 |     }
28 |     put_shuffled.Poison();
29 |   }
30 |   std::vector<uint64_t> &shuffled_;
31 | };
32 | 
33 | BOOST_AUTO_TEST_CASE(FromShuffled) {
34 |   std::vector<uint64_t> shuffled;
35 |   shuffled.reserve(kSize);
36 |   for (uint64_t i = 0; i < kSize; ++i) {
37 |     shuffled.push_back(i);
38 |   }
39 |   std::random_shuffle(shuffled.begin(), shuffled.end());
40 | 
41 |   ChainConfig config;
42 |   config.entry_size = 8;
43 |   config.total_memory = 800;
44 |   config.block_count = 3;
45 | 
46 |   SortConfig merge_config;
47 |   merge_config.temp_prefix = "sort_test_temp";
48 |   merge_config.buffer_size = 800;
49 |   merge_config.total_memory = 3300;
50 | 
51 |   Chain chain(config);
52 |   chain >> Putter(shuffled);
53 |   BlockingSort(chain, merge_config, CompareUInt64(), NeverCombine());
54 |   Stream sorted;
55 |   chain >> sorted >> kRecycle;
56 |   for (uint64_t i = 0; i < kSize; ++i, ++sorted) {
57 |     BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(sorted.Get()));
58 |   }
59 |   BOOST_CHECK(!sorted);
60 | }
61 | 
62 | }}} // namespaces
63 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/stream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_STREAM_H
2 | #define UTIL_STREAM_STREAM_H
3 | 
4 | #include "util/stream/chain.hh"
5 | 
6 | #include <boost/noncopyable.hpp>
7 | 
8 | #include <cassert>
9 | #include <stdint.h>
10 | 
11 | namespace util {
12 | namespace stream {
13 | 
14 | class Stream : boost::noncopyable {
15 |   public:
16 |     Stream() : current_(NULL), end_(NULL) {}
17 | 
18 |     void Init(const ChainPosition &position) {
19 |       entry_size_ = position.GetChain().EntrySize();
20 |       block_size_ = position.GetChain().BlockSize();
21 |       block_it_.Init(position);
22 |       StartBlock();
23 |     }
24 | 
25 |     explicit Stream(const ChainPosition &position) {
26 |       Init(position);
27 |     }
28 | 
29 |     operator bool() const { return current_ != NULL; }
30 |     bool operator!() const { return current_ == NULL; }
31 | 
32 |     const void *Get() const { return current_; }
33 |     void *Get() { return current_; }
34 | 
35 |     void Poison() {
36 |       block_it_->SetValidSize(current_ - static_cast<uint8_t*>(block_it_->Get()));
37 |       ++block_it_;
38 |       block_it_.Poison();
39 |     }
40 | 
41 |     Stream &operator++() {
42 |       assert(*this);
43 |       assert(current_ < end_);
44 |       current_ += entry_size_;
45 |       if (current_ == end_) {
46 |         ++block_it_;
47 |         StartBlock();
48 |       }
49 |       return *this;
50 |     }
51 | 
52 |   private:
53 |     void StartBlock() {
54 |       for (; block_it_ && !block_it_->ValidSize(); ++block_it_) {}
55 |       current_ = static_cast<uint8_t*>(block_it_->Get());
56 |       end_ = current_ + block_it_->ValidSize();
57 |     }
58 | 
59 |     // The following are pointers to raw memory
60 |     // current_ is the current record
61 |     // end_ is the end of the block (so we know when to move to the next block)
62 |     uint8_t *current_, *end_;
63 | 
64 |     std::size_t entry_size_;
65 |     std::size_t block_size_;
66 | 
67 |     Link block_it_;
68 | };
69 | 
70 | inline Chain &operator>>(Chain &chain, Stream &stream) {
71 |   stream.Init(chain.Add());
72 |   return chain;
73 | }
74 | 
75 | } // namespace stream
76 | } // namespace util
77 | #endif // UTIL_STREAM_STREAM_H
78 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/stream_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/stream/io.hh"
2 | 
3 | #include "util/stream/stream.hh"
4 | #include "util/file.hh"
5 | 
6 | #define BOOST_TEST_MODULE StreamTest
7 | #include <boost/test/unit_test.hpp>
8 | 
9 | #include <unistd.h>
10 | 
11 | namespace util { namespace stream { namespace {
12 | 
13 | BOOST_AUTO_TEST_CASE(StreamTest) {
14 |   scoped_fd in(MakeTemp("io_test_temp"));
15 |   for (uint64_t i = 0; i < 100000; ++i) {
16 |     WriteOrThrow(in.get(), &i, sizeof(uint64_t));
17 |   }
18 |   SeekOrThrow(in.get(), 0);
19 | 
20 |   ChainConfig config;
21 |   config.entry_size = 8;
22 |   config.total_memory = 100;
23 |   config.block_count = 12;
24 | 
25 |   Stream s;
26 |   Chain chain(config);
27 |   chain >> Read(in.get()) >> s >> kRecycle;
28 |   uint64_t i = 0;
29 |   for (; s; ++s, ++i) {
30 |     BOOST_CHECK_EQUAL(i, *static_cast<const uint64_t*>(s.Get()));
31 |   }
32 |   BOOST_CHECK_EQUAL(100000ULL, i);
33 | }
34 | 
35 | }}} // namespaces
36 | 
--------------------------------------------------------------------------------
/kenlm/util/stream/typed_stream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STREAM_TYPED_STREAM_H
2 | #define UTIL_STREAM_TYPED_STREAM_H
3 | // A typed wrapper to Stream for POD types.
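// e.g. (editor's sketch; position is an assumed ChainPosition):
//   TypedStream<uint64_t> nums(position);
//   for (; nums; ++nums) sum += *nums;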
4 | 
5 | #include "util/stream/stream.hh"
6 | 
7 | namespace util { namespace stream {
8 | 
9 | template <class T> class TypedStream : public Stream {
10 |   public:
11 |     // After using the default constructor, call Init (in the parent class)
12 |     TypedStream() {}
13 | 
14 |     explicit TypedStream(const ChainPosition &position) : Stream(position) {}
15 | 
16 |     const T *operator->() const { return static_cast<const T*>(Get()); }
17 |     T *operator->() { return static_cast<T*>(Get()); }
18 | 
19 |     const T &operator*() const { return *static_cast<const T*>(Get()); }
20 |     T &operator*() { return *static_cast<T*>(Get()); }
21 | };
22 | 
23 | }} // namespaces
24 | 
25 | #endif // UTIL_STREAM_TYPED_STREAM_H
26 | 
--------------------------------------------------------------------------------
/kenlm/util/string_piece_hash.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STRING_PIECE_HASH_H
2 | #define UTIL_STRING_PIECE_HASH_H
3 | 
4 | #include "util/string_piece.hh"
5 | 
6 | #include <boost/functional/hash.hpp>
7 | #include <boost/version.hpp>
8 | 
9 | inline size_t hash_value(const StringPiece &str) {
10 |   return boost::hash_range(str.data(), str.data() + str.length());
11 | }
12 | 
13 | /* Support for lookup of StringPiece in boost::unordered_map */
14 | struct StringPieceCompatibleHash : public std::unary_function<const StringPiece &, size_t> {
15 |   size_t operator()(const StringPiece &str) const {
16 |     return hash_value(str);
17 |   }
18 | };
19 | 
20 | struct StringPieceCompatibleEquals : public std::binary_function<const StringPiece &, const StringPiece &, bool> {
21 |   bool operator()(const StringPiece &first, const StringPiece &second) const {
22 |     return first == second;
23 |   }
24 | };
25 | template <class T> typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) {
26 | #if BOOST_VERSION < 104200
27 |   std::string temp(key.data(), key.size());
28 |   return t.find(temp);
29 | #else
30 |   return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
31 | #endif
32 | }
33 | 
34 | template <class T> typename T::iterator FindStringPiece(T &t, const StringPiece &key) {
35 | #if BOOST_VERSION < 104200
36 |   std::string temp(key.data(), key.size());
37 |   return t.find(temp);
38 | #else
39 |   return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals());
40 | #endif
41 | }
42 | 
43 | #endif // UTIL_STRING_PIECE_HASH_H
44 | 
--------------------------------------------------------------------------------
/kenlm/util/string_stream.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_STRING_STREAM_H
2 | #define UTIL_STRING_STREAM_H
3 | 
4 | #include "util/fake_ostream.hh"
5 | 
6 | #include <cassert>
7 | #include <string>
8 | 
9 | namespace util {
10 | 
11 | class StringStream : public FakeOStream<StringStream> {
12 |   public:
13 |     StringStream() {}
14 | 
15 |     StringStream &flush() { return *this; }
16 | 
17 |     StringStream &write(const void *data, std::size_t length) {
18 |       out_.append(static_cast<const char*>(data), length);
19 |       return *this;
20 |     }
21 | 
22 |     const std::string &str() const { return out_; }
23 | 
24 |     void str(const std::string &val) { out_ = val; }
25 | 
26 |     void swap(std::string &str) { std::swap(out_, str); }
27 | 
28 |   protected:
29 |     friend class FakeOStream<StringStream>;
30 |     char *Ensure(std::size_t amount) {
31 |       std::size_t current = out_.size();
32 |       out_.resize(out_.size() + amount);
33 |       return &out_[current];
34 |     }
35 | 
36 |     void AdvanceTo(char *to) {
37 |       assert(to <= &*out_.end());
38 |       assert(to >= &*out_.begin());
39 |       out_.resize(to - &*out_.begin());
40 |     }
41 | 
42 |   private:
43 |     std::string out_;
44 | };
45 | 
46 | } // namespace
47 | 
48 | #endif // UTIL_STRING_STREAM_H
49 | 
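// (Editor's usage sketch, exercising the operator<< overloads tested in the
//  unit test that follows:
//    util::StringStream s;
//    s << "perplexity = " << 42;
//    const std::string &text = s.str(); )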
--------------------------------------------------------------------------------
/kenlm/util/string_stream_test.cc:
--------------------------------------------------------------------------------
1 | #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
2 | #define BOOST_TEST_MODULE FakeOStreamTest
3 | 
4 | #include "util/string_stream.hh"
5 | #include <boost/test/unit_test.hpp>
6 | #include <boost/lexical_cast.hpp>
7 | 
8 | #include <limits>
9 | #include <string>
10 | 
11 | namespace util { namespace {
12 | 
13 | template <class T> void TestEqual(const T value) {
14 |   StringStream strme;
15 |   strme << value;
16 |   BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), strme.str());
17 | }
18 | 
19 | template <class T> void TestCorners() {
20 |   TestEqual(std::numeric_limits<T>::max());
21 |   TestEqual(std::numeric_limits<T>::min());
22 |   TestEqual(static_cast<T>(0));
23 |   TestEqual(static_cast<T>(-1));
24 |   TestEqual(static_cast<T>(1));
25 | }
26 | 
27 | BOOST_AUTO_TEST_CASE(Integer) {
28 |   TestCorners<char>();
29 |   TestCorners<signed char>();
30 |   TestCorners<unsigned char>();
31 | 
32 |   TestCorners<short>();
33 |   TestCorners<signed short>();
34 |   TestCorners<unsigned short>();
35 | 
36 |   TestCorners<int>();
37 |   TestCorners<signed int>();
38 |   TestCorners<unsigned int>();
39 | 
40 |   TestCorners<long>();
41 |   TestCorners<signed long>();
42 |   TestCorners<unsigned long>();
43 | 
44 |   TestCorners<long long>();
45 |   TestCorners<signed long long>();
46 |   TestCorners<unsigned long long>();
47 | 
48 |   TestCorners<std::size_t>();
49 | }
50 | 
51 | enum TinyEnum { EnumValue };
52 | 
53 | BOOST_AUTO_TEST_CASE(EnumCase) {
54 |   TestEqual(EnumValue);
55 | }
56 | 
57 | BOOST_AUTO_TEST_CASE(Strings) {
58 |   TestEqual("foo");
59 |   const char *a = "bar";
60 |   TestEqual(a);
61 |   StringPiece piece("abcdef");
62 |   TestEqual(piece);
63 |   TestEqual(StringPiece());
64 | 
65 |   char non_const[3];
66 |   non_const[0] = 'b';
67 |   non_const[1] = 'c';
68 |   non_const[2] = 0;
69 | 
70 |   StringStream out;
71 |   out << "a" << non_const << 'c';
72 |   BOOST_CHECK_EQUAL("abcc", out.str());
73 | 
74 |   // Now test as a separate object.
75 |   StringStream stream;
76 |   stream << "a" << non_const << 'c' << piece;
77 |   BOOST_CHECK_EQUAL("abccabcdef", stream.str());
78 | }
79 | 
80 | }} // namespaces
81 | 
--------------------------------------------------------------------------------
/kenlm/util/tokenize_piece_test.cc:
--------------------------------------------------------------------------------
1 | #include "util/tokenize_piece.hh"
2 | #include "util/string_piece.hh"
3 | 
4 | #define BOOST_TEST_MODULE TokenIteratorTest
5 | #include <boost/test/unit_test.hpp>
6 | 
7 | #include <iostream>
8 | 
9 | namespace util {
10 | namespace {
11 | 
12 | BOOST_AUTO_TEST_CASE(pipe_pipe_none) {
13 |   const char str[] = "nodelimit at all";
14 |   TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
15 |   BOOST_REQUIRE(it);
16 |   BOOST_CHECK_EQUAL(StringPiece(str), *it);
17 |   ++it;
18 |   BOOST_CHECK(!it);
19 | }
20 | BOOST_AUTO_TEST_CASE(pipe_pipe_two) {
21 |   const char str[] = "|||";
22 |   TokenIter<MultiCharacter> it(str, MultiCharacter("|||"));
23 |   BOOST_REQUIRE(it);
24 |   BOOST_CHECK_EQUAL(StringPiece(), *it);
25 |   ++it;
26 |   BOOST_REQUIRE(it);
27 |   BOOST_CHECK_EQUAL(StringPiece(), *it);
28 |   ++it;
29 |   BOOST_CHECK(!it);
30 | }
31 | 
32 | BOOST_AUTO_TEST_CASE(remove_empty) {
33 |   const char str[] = "|||";
34 |   TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
35 |   BOOST_CHECK(!it);
36 | }
37 | 
38 | BOOST_AUTO_TEST_CASE(remove_empty_keep) {
39 |   const char str[] = " |||";
40 |   TokenIter<MultiCharacter, true> it(str, MultiCharacter("|||"));
41 |   BOOST_REQUIRE(it);
42 |   BOOST_CHECK_EQUAL(StringPiece(" "), *it);
43 |   ++it;
44 |   BOOST_CHECK(!it);
45 | }
46 | 
47 | } // namespace
48 | } // namespace util
49 | 
--------------------------------------------------------------------------------
/kenlm/util/usage.hh:
--------------------------------------------------------------------------------
1 | #ifndef UTIL_USAGE_H
2 | #define UTIL_USAGE_H
3 | #include <cstddef>
4 | #include <iosfwd>
5 | #include <string>
6 | #include <stdint.h>
7 | 
8 | namespace util {
9 | // Time in seconds since process started.  Zero on unsupported platforms.
10 | double WallTime();
11 | 
12 | // User + system time, process-wide.
13 | double CPUTime();
14 | 
15 | // User + system time, thread-specific.
16 | double ThreadTime();
17 | 
18 | // Resident usage in bytes.
19 | uint64_t RSSMax();
20 | 
21 | void PrintUsage(std::ostream &to);
22 | 
23 | // Determine how much physical memory there is.  Return 0 on failure.
24 | uint64_t GuessPhysicalMemory();
25 | 
26 | // Parse a size like unix sort.  Sadly, this means the default multiplier is K.
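// e.g. (editor's sketch): "500" is presumably treated as 500 * 1024 bytes,
// while sort-style suffixes such as "2G" name the unit explicitly.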
27 | uint64_t ParseSize(const std::string &arg);
28 | 
29 | } // namespace util
30 | #endif // UTIL_USAGE_H
31 | 
--------------------------------------------------------------------------------
/kenmodels/zhwiki_bigram.klm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenmodels/zhwiki_bigram.klm
--------------------------------------------------------------------------------
/kenmodels/zhwiki_trigram.klm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ccheng16/correction/56f0056897a76f2688c5c727d8ba993eb57d7076/kenmodels/zhwiki_trigram.klm
--------------------------------------------------------------------------------
/train_kenlm.sh:
--------------------------------------------------------------------------------
1 | python feed_kenlm.py | ./kenlm/build/bin/lmplz -o 3 > zhwiki_trigram.arpa
2 | ./kenlm/build/bin/build_binary zhwiki_trigram.arpa zhwiki_trigram.klm
3 | 
--------------------------------------------------------------------------------