├── AUTHORS ├── COPYING ├── ChangeLog ├── Makefile.am ├── Makefile.example ├── README ├── autogen.sh ├── configure.ac ├── src ├── InterpolatedNgramLM.cpp ├── InterpolatedNgramLM.h ├── KneserNeySmoothing.cpp ├── KneserNeySmoothing.h ├── Lattice.cpp ├── Lattice.h ├── Mask.h ├── MaxLikelihoodSmoothing.cpp ├── MaxLikelihoodSmoothing.h ├── NgramLM.cpp ├── NgramLM.h ├── NgramModel.cpp ├── NgramModel.h ├── NgramVector.cpp ├── NgramVector.h ├── PerplexityOptimizer.cpp ├── PerplexityOptimizer.h ├── Smoothing.cpp ├── Smoothing.h ├── Types.h ├── Vocab.cpp ├── Vocab.h ├── WordErrorRateOptimizer.cpp ├── WordErrorRateOptimizer.h ├── estimate-ngram.cpp ├── evaluate-ngram.cpp ├── interpolate-ngram.cpp ├── optimize │ ├── LBFGS.h │ ├── LBFGSB.h │ ├── Optimization.h │ ├── Powell.h │ ├── fortran_wrapper.c │ ├── lbfgs.f │ └── lbfgsb.f ├── util │ ├── BitOps.h │ ├── CommandOptions.cpp │ ├── CommandOptions.h │ ├── FastHash.h │ ├── FastIO.h │ ├── Logger.cpp │ ├── Logger.h │ ├── RefCounter.cpp │ ├── RefCounter.h │ ├── SharedPtr.h │ └── ZFile.h └── vector │ ├── DenseVector.h │ ├── DenseVector.tcc │ ├── Operations.h │ ├── Range.h │ ├── Scalar.h │ ├── Traits.h │ ├── Vector.h │ ├── VectorBuilder.h │ ├── VectorClosures.h │ └── VectorOps.h └── tests ├── data ├── small.txt ├── small.vocab ├── test1_ref │ ├── wc.a.hyp │ ├── wc.b.hyp │ ├── wec.a.hyp │ ├── wec.b.hyp │ ├── wl.a.hyp │ ├── wl.b.hyp │ ├── wlc.a.hyp │ ├── wlc.b.hyp │ ├── wrc.a.hyp │ └── wrc.b.hyp └── very_small.txt └── test1.test.in /AUTHORS: -------------------------------------------------------------------------------- 1 | Developers contact: 2 | MIT LM Developers 3 | 4 | Original author: 5 | Bo-June (Paul) Hsu 6 | 7 | Contributors: 8 | Giulio Paci 9 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008, Massachusetts Institute of Technology 2 | Copyright (c) 2008-2009, Bo-June (Paul) Hsu 3 | 
Copyright (c) 2010-2013, Giulio Paci 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | * Neither the name of the Massachusetts Institute of Technology 19 | nor the names of its contributors may be used to endorse or 20 | promote products derived from this software without specific 21 | prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 26 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 27 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 28 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 29 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 31 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 32 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 33 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | Changes in mitlm 0.4.1 2 | ---------------------- 3 | 4 | [giuliopaci@gmail.com] 5 | 6 | - Fix segmentation fault in interpolate-ngram when no argument is provided. 
7 | - Fix opening of very short vocabularies. 8 | - Fix filename escaping in popen call. 9 | - Fix issue 12 about binary mode in popen. 10 | - Add mitlm namespace. 11 | - Improve command line help. 12 | - Improve BitOps.h. 13 | - __fls has been renamed to find_last_bit_set 14 | - the x86 assembly implementation of find_last_bit_set is not selected 15 | by default. 16 | - the generic implementation of find_last_bit_set supports 64bit. 17 | - __builtin_clzl can be used to implement find_last_bit_set. 18 | - Replace hash_map with unordered_map. 19 | - Switch from basic Makefile to autotools. 20 | - Enable out-of-tree compilation. 21 | - Fix compilation with fort77 compiler. 22 | - Enable shared library compilation. 23 | - Add test suite. 24 | - Fix compilation with gcc 4.7.1. 25 | - Fix compilation with MinGW. 26 | 27 | [bojunehsu] 28 | 29 | - Fix LoadEvalCorpus() to allow order = 1. 30 | - Split <s> and </s> into two entries when writing order 1 LMs. 31 | 32 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | ## Process this file with automake to produce Makefile.in 2 | 3 | SUBDIRS = 4 | 5 | EXTRA_DIST = autogen.sh 6 | 7 | 8 | AM_CPPFLAGS = \ 9 | -I$(top_srcdir)/src 10 | 11 | # Enable building files in subdirectories. 
12 | AUTOMAKE_OPTIONS = subdir-objects 13 | 14 | lib_LTLIBRARIES = libmitlm.la 15 | 16 | mitlmincdir = $(includedir)/mitlm 17 | mitlmoptimizeincdir = $(includedir)/mitlm/optimize 18 | mitlmutilincdir = $(includedir)/mitlm/util 19 | mitlmvectorincdir = $(includedir)/mitlm/vector 20 | 21 | mitlmoptimizeinc_HEADERS= \ 22 | src/optimize/Optimization.h \ 23 | src/optimize/LBFGS.h \ 24 | src/optimize/Powell.h \ 25 | src/optimize/LBFGSB.h 26 | 27 | mitlmutilinc_HEADERS= \ 28 | src/util/FastIO.h \ 29 | src/util/RefCounter.h \ 30 | src/util/CommandOptions.h \ 31 | src/util/ZFile.h \ 32 | src/util/Logger.h \ 33 | src/util/SharedPtr.h \ 34 | src/util/BitOps.h \ 35 | src/util/FastHash.h 36 | 37 | mitlmvectorinc_HEADERS= \ 38 | src/vector/Range.h \ 39 | src/vector/Operations.h \ 40 | src/vector/VectorClosures.h \ 41 | src/vector/Traits.h \ 42 | src/vector/VectorBuilder.h \ 43 | src/vector/VectorOps.h \ 44 | src/vector/Scalar.h \ 45 | src/vector/Vector.h \ 46 | src/vector/DenseVector.h \ 47 | src/vector/DenseVector.tcc 48 | 49 | mitlminc_HEADERS= \ 50 | src/MaxLikelihoodSmoothing.h \ 51 | src/Smoothing.h \ 52 | src/Vocab.h \ 53 | src/Mask.h \ 54 | src/KneserNeySmoothing.h \ 55 | src/PerplexityOptimizer.h \ 56 | src/NgramVector.h \ 57 | src/Lattice.h \ 58 | src/WordErrorRateOptimizer.h \ 59 | src/InterpolatedNgramLM.h \ 60 | src/NgramLM.h \ 61 | src/NgramModel.h \ 62 | src/Types.h 63 | 64 | 65 | 66 | libmitlm_la_SOURCES = \ 67 | src/util/CommandOptions.cpp \ 68 | src/util/RefCounter.cpp \ 69 | src/util/Logger.cpp \ 70 | src/NgramLM.cpp \ 71 | src/Vocab.cpp \ 72 | src/PerplexityOptimizer.cpp \ 73 | src/Lattice.cpp \ 74 | src/Smoothing.cpp \ 75 | src/NgramModel.cpp \ 76 | src/NgramVector.cpp \ 77 | src/MaxLikelihoodSmoothing.cpp \ 78 | src/KneserNeySmoothing.cpp \ 79 | src/InterpolatedNgramLM.cpp \ 80 | src/optimize/lbfgs.f \ 81 | src/optimize/lbfgsb.f \ 82 | src/optimize/fortran_wrapper.c \ 83 | src/WordErrorRateOptimizer.cpp 84 | 85 | libmitlm_la_LIBADD = $(FLIBS) 86 | 
libmitlm_la_LDFLAGS = -export-symbols-regex mitlm 87 | 88 | # Programs: 89 | 90 | bin_PROGRAMS = evaluate-ngram estimate-ngram interpolate-ngram 91 | 92 | evaluate_ngram_SOURCES = \ 93 | src/evaluate-ngram.cpp 94 | 95 | evaluate_ngram_LDADD = libmitlm.la $(FLIBS) 96 | evaluate_ngram_CFLAGS = 97 | 98 | estimate_ngram_SOURCES = \ 99 | src/estimate-ngram.cpp 100 | 101 | estimate_ngram_LDADD = libmitlm.la $(FLIBS) 102 | estimate_ngram_CFLAGS = 103 | 104 | interpolate_ngram_SOURCES = \ 105 | src/interpolate-ngram.cpp 106 | 107 | interpolate_ngram_LDADD = libmitlm.la $(FLIBS) 108 | interpolate_ngram_CFLAGS = 109 | TESTS = tests/test1.test 110 | 111 | EXTRA_DIST += \ 112 | tests/data/small.txt \ 113 | tests/data/small.vocab \ 114 | tests/data/test1_ref/wlc.a.hyp \ 115 | tests/data/test1_ref/wlc.b.hyp \ 116 | tests/data/test1_ref/wc.a.hyp \ 117 | tests/data/test1_ref/wc.b.hyp \ 118 | tests/data/test1_ref/wrc.a.hyp \ 119 | tests/data/test1_ref/wrc.b.hyp \ 120 | tests/data/test1_ref/wec.a.hyp \ 121 | tests/data/test1_ref/wec.b.hyp \ 122 | tests/data/test1_ref/wl.a.hyp \ 123 | tests/data/test1_ref/wl.b.hyp 124 | -------------------------------------------------------------------------------- /Makefile.example: -------------------------------------------------------------------------------- 1 | INC = -Isrc 2 | CXXFLAGS = -g -Wall -fPIC -fmessage-length=0 $(INC) 3 | LDFLAGS = -L. 
-lg2c -lmitlm 4 | FFLAGS = -g -fPIC -fmessage-length=0 5 | 6 | ifdef GPROF 7 | CXXFLAGS += -pg 8 | LDFLAGS += -pg 9 | endif 10 | 11 | ifdef DEBUG 12 | CXXFLAGS += -O0 -fno-inline 13 | else 14 | CXXFLAGS += -O3 -DNDEBUG -funroll-loops 15 | FFLAGS += -O3 -DNDEBUG -funroll-loops 16 | LDFLAGS += -O3 -funroll-loops 17 | endif 18 | 19 | UTIL_SOURCES = src/util/RefCounter.cpp src/util/Logger.cpp src/util/CommandOptions.cpp 20 | SOURCES = $(UTIL_SOURCES) src/Vocab.cpp src/NgramVector.cpp \ 21 | src/NgramModel.cpp src/NgramLM.cpp src/InterpolatedNgramLM.cpp \ 22 | src/Smoothing.cpp src/MaxLikelihoodSmoothing.cpp src/KneserNeySmoothing.cpp \ 23 | src/PerplexityOptimizer.cpp src/WordErrorRateOptimizer.cpp \ 24 | src/Lattice.cpp 25 | UTIL_OBJECTS = $(UTIL_SOURCES:.cpp=.o) 26 | OBJECTS = $(SOURCES:.cpp=.o) src/optimize/lbfgsb.o src/optimize/lbfgs.o 27 | 28 | # Core MITLM utilities 29 | all: estimate-ngram interpolate-ngram evaluate-ngram 30 | 31 | libmitlm.a: $(OBJECTS) 32 | ar rcs $@ $(OBJECTS) 33 | 34 | estimate-ngram: libmitlm.a src/estimate-ngram.o 35 | $(CXX) src/estimate-ngram.o -o $@ $(LDFLAGS) 36 | 37 | interpolate-ngram: libmitlm.a src/interpolate-ngram.o 38 | $(CXX) src/interpolate-ngram.o -o $@ $(LDFLAGS) 39 | 40 | evaluate-ngram: libmitlm.a src/evaluate-ngram.o 41 | $(CXX) src/evaluate-ngram.o -o $@ $(LDFLAGS) 42 | 43 | # Build scripts 44 | clean: 45 | rm -f $(OBJECTS) src/*.o test/*.o mitlm.tgz 46 | rm -f estimate-ngram interpolate-ngram evaluate-ngram libmitlm.a 47 | 48 | dist: clean 49 | cd ..; tar czvf mitlm.tgz --exclude=".*" mitlm/; cd mitlm; mv ../mitlm.tgz . 
50 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ============================= 2 | MIT Language Modeling Toolkit 3 | ============================= 4 | 5 | The MIT Language Modeling (MITLM) toolkit is a set of tools designed 6 | for the efficient estimation of statistical n-gram language models 7 | involving iterative parameter estimation. It achieves much of its 8 | efficiency through the use of a compact vector representation of 9 | n-grams. Details of the data structure and associated algorithms can 10 | be found in the following paper. 11 | 12 | * Bo-June (Paul) Hsu and James Glass. Iterative Language Model 13 | Estimation: Efficient Data Structure & Algorithms. In 14 | Proc. Interspeech, 2008. 15 | 16 | Currently, MITLM supports the following features: 17 | 18 | * Smoothing: Modified Kneser-Ney, Kneser-Ney, maximum likelihood 19 | * Interpolation: Linear interpolation, count merging, generalized 20 | linear interpolation 21 | * Evaluation: Perplexity 22 | * File formats: ARPA, binary, gzip, bz2 23 | 24 | MITLM is available for download under the MIT License. It has been 25 | built and tested on 32-bit and 64-bit Intel CPUs running Debian Linux 26 | 7.0. It currently requires the following: 27 | 28 | * ANSI C++/Fortran compiler (GCC 4.7.1+) 29 | 30 | For more information about MITLM, please visit: 31 | 32 | https://code.google.com/p/mitlm/ 33 | 34 | If you find any BUG and/or want to provide a patch, please file an 35 | issue about it at: 36 | 37 | https://code.google.com/p/mitlm/issues/list 38 | 39 | =============== 40 | Acknowledgments 41 | =============== 42 | 43 | The design and implementation of this toolkit benefited significantly 44 | from the SRI Language Modeling Toolkit by Andreas Stolcke. 45 | 46 | The vector library is partially derived from the Flexible Library for 47 | Efficient Numerical Solutions by Michael Lehn. 
48 | 49 | Copyright (c) 2007, Michael Lehn 50 | 51 | All rights reserved. 52 | 53 | Redistribution and use in source and binary forms, with or without 54 | modification, are permitted provided that the following conditions 55 | are met: 56 | 57 | 1) Redistributions of source code must retain the above copyright 58 | notice, this list of conditions and the following disclaimer. 59 | 2) Redistributions in binary form must reproduce the above copyright 60 | notice, this list of conditions and the following disclaimer in 61 | the documentation and/or other materials provided with the 62 | distribution. 63 | 3) Neither the name of the FLENS development group nor the names of 64 | its contributors may be used to endorse or promote products 65 | derived from this software without specific prior written 66 | permission. 67 | 68 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 69 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 70 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 71 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 72 | COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 73 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 74 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 75 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 76 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 77 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 78 | ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 79 | POSSIBILITY OF SUCH DAMAGE. 80 | 81 | The project is supported in part by the T-Party Project, a joint 82 | research program between MIT CSAIL and Quanta Computer Inc. 
83 | 84 | Bo-June (Paul) Hsu 85 | Computer Science and Artificial Intelligence Laboratory 86 | Massachusetts Institute of Technology 87 | (C) 2008 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /autogen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Run this to generate all the initial makefiles, etc. 3 | 4 | srcdir=`dirname $0` 5 | test -z "$srcdir" && srcdir=. 6 | 7 | DIE=0 8 | 9 | if [ -n "$GNOME2_DIR" ]; then 10 | ACLOCAL_FLAGS="-I $GNOME2_DIR/share/aclocal $ACLOCAL_FLAGS" 11 | LD_LIBRARY_PATH="$GNOME2_DIR/lib:$LD_LIBRARY_PATH" 12 | PATH="$GNOME2_DIR/bin:$PATH" 13 | export PATH 14 | export LD_LIBRARY_PATH 15 | fi 16 | 17 | (test -f $srcdir/configure.ac) || { 18 | echo -n "**Error**: Directory "\`$srcdir\'" does not look like the" 19 | echo " top-level package directory" 20 | exit 1 21 | } 22 | 23 | (autoconf --version) < /dev/null > /dev/null 2>&1 || { 24 | echo 25 | echo "**Error**: You must have \`autoconf' installed." 26 | echo "Download the appropriate package for your distribution," 27 | echo "or get the source tarball at ftp://ftp.gnu.org/pub/gnu/" 28 | DIE=1 29 | } 30 | 31 | (grep "^AC_PROG_INTLTOOL" $srcdir/configure.ac >/dev/null) && { 32 | (intltoolize --version) < /dev/null > /dev/null 2>&1 || { 33 | echo 34 | echo "**Error**: You must have \`intltool' installed." 35 | echo "You can get it from:" 36 | echo " ftp://ftp.gnome.org/pub/GNOME/" 37 | DIE=1 38 | } 39 | } 40 | 41 | (grep "^AM_PROG_XML_I18N_TOOLS" $srcdir/configure.ac >/dev/null) && { 42 | (xml-i18n-toolize --version) < /dev/null > /dev/null 2>&1 || { 43 | echo 44 | echo "**Error**: You must have \`xml-i18n-toolize' installed." 
45 | echo "You can get it from:" 46 | echo " ftp://ftp.gnome.org/pub/GNOME/" 47 | DIE=1 48 | } 49 | } 50 | 51 | (grep "^AM_PROG_LIBTOOL" $srcdir/configure.ac >/dev/null) && { 52 | (libtool --version) < /dev/null > /dev/null 2>&1 || { 53 | echo 54 | echo "**Error**: You must have \`libtool' installed." 55 | echo "You can get it from: ftp://ftp.gnu.org/pub/gnu/" 56 | DIE=1 57 | } 58 | } 59 | 60 | (grep "^AM_GLIB_GNU_GETTEXT" $srcdir/configure.ac >/dev/null) && { 61 | (grep "sed.*POTFILES" $srcdir/configure.ac) > /dev/null || \ 62 | (glib-gettextize --version) < /dev/null > /dev/null 2>&1 || { 63 | echo 64 | echo "**Error**: You must have \`glib' installed." 65 | echo "You can get it from: ftp://ftp.gtk.org/pub/gtk" 66 | DIE=1 67 | } 68 | } 69 | 70 | (automake --version) < /dev/null > /dev/null 2>&1 || { 71 | echo 72 | echo "**Error**: You must have \`automake' installed." 73 | echo "You can get it from: ftp://ftp.gnu.org/pub/gnu/" 74 | DIE=1 75 | NO_AUTOMAKE=yes 76 | } 77 | 78 | 79 | # if no automake, don't bother testing for aclocal 80 | test -n "$NO_AUTOMAKE" || (aclocal --version) < /dev/null > /dev/null 2>&1 || { 81 | echo 82 | echo "**Error**: Missing \`aclocal'. The version of \`automake'" 83 | echo "installed doesn't appear recent enough." 84 | echo "You can get automake from ftp://ftp.gnu.org/pub/gnu/" 85 | DIE=1 86 | } 87 | 88 | if test "$DIE" -eq 1; then 89 | exit 1 90 | fi 91 | 92 | if test -z "$*"; then 93 | echo "**Warning**: I am going to run \`configure' with no arguments." 94 | echo "If you wish to pass any to it, please specify them on the" 95 | echo \`$0\'" command line." 
96 | echo 97 | fi 98 | 99 | case $CC in 100 | xlc ) 101 | am_opt=--include-deps;; 102 | esac 103 | 104 | for coin in `find $srcdir -name configure.ac -print` 105 | do 106 | dr=`dirname $coin` 107 | if test -f $dr/NO-AUTO-GEN; then 108 | echo skipping $dr -- flagged as no auto-gen 109 | else 110 | echo processing $dr 111 | ( cd $dr 112 | 113 | aclocalinclude="$ACLOCAL_FLAGS" 114 | if test -d /mingw/share/aclocal ; then 115 | aclocalinclude="$aclocalinclude -I /mingw/share/aclocal" 116 | fi 117 | 118 | if grep "^AM_GLIB_GNU_GETTEXT" configure.ac >/dev/null; then 119 | echo "Creating $dr/aclocal.m4 ..." 120 | test -r $dr/aclocal.m4 || touch $dr/aclocal.m4 121 | echo "Running glib-gettextize... Ignore non-fatal messages." 122 | echo "no" | glib-gettextize --force --copy 123 | echo "Making $dr/aclocal.m4 writable ..." 124 | test -r $dr/aclocal.m4 && chmod u+w $dr/aclocal.m4 125 | fi 126 | if grep "^AC_PROG_INTLTOOL" configure.ac >/dev/null; then 127 | echo "Running intltoolize..." 128 | intltoolize --copy --force --automake 129 | fi 130 | if grep "^AM_PROG_XML_I18N_TOOLS" configure.ac >/dev/null; then 131 | echo "Running xml-i18n-toolize..." 132 | xml-i18n-toolize --copy --force --automake 133 | fi 134 | if grep "^AC_PROG_LIBTOOL" configure.ac >/dev/null; then 135 | if test -z "$NO_LIBTOOLIZE" ; then 136 | echo "Running libtoolize..." 137 | libtoolize --force --copy 138 | fi 139 | fi 140 | echo "Running aclocal $aclocalinclude ..." 141 | aclocal $aclocalinclude 142 | if grep "^AM_CONFIG_HEADER" configure.ac >/dev/null; then 143 | echo "Running autoheader..." 144 | autoheader 145 | fi 146 | echo "Running automake --gnu $am_opt ..." 147 | automake --add-missing --gnu $am_opt 148 | echo "Running autoconf ..." 149 | autoconf 150 | ) 151 | fi 152 | done 153 | 154 | conf_flags="--enable-maintainer-mode" 155 | 156 | if test x$NOCONFIGURE = x; then 157 | echo Running $srcdir/configure $conf_flags "$@" ... 
158 | $srcdir/configure $conf_flags "$@" \ 159 | && echo Now type \`make\' to compile. || exit 1 160 | else 161 | echo Skipping configure process. 162 | fi 163 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | dnl Process this file with autoconf to produce a configure script. 2 | 3 | dnl AC_INIT(package, version, bug-report-address) 4 | AC_INIT([MIT Language Modeling Toolkit], 0.4.1, [mitlm-dev@googlegroups.com], mitlm) 5 | dnl This file (configure.ac) revision. 6 | AC_REVISION([$Revision$]) 7 | dnl The base directory (srcdir) must contain configure.ac. 8 | AC_CONFIG_SRCDIR(configure.ac) 9 | dnl Use build-aux/ to store scripts and files used by configure. 10 | AC_CONFIG_AUX_DIR([build-aux]) 11 | 12 | AM_INIT_AUTOMAKE([foreign -Wall -Werror]) 13 | 14 | AM_MAINTAINER_MODE 15 | 16 | dnl Checks for programs. 17 | AC_PROG_CC 18 | AC_PROG_CXX 19 | AC_PROG_F77 20 | 21 | AC_F77_LIBRARY_LDFLAGS 22 | AC_F77_DUMMY_MAIN 23 | AC_F77_WRAPPERS 24 | 25 | AM_PROG_AR 26 | AC_LIBTOOL_WIN32_DLL 27 | AC_PROG_LIBTOOL 28 | 29 | dnl Checks for header files. 30 | AC_CHECK_HEADERS(string.h math.h) 31 | AC_HEADER_STDC 32 | 33 | dnl Checks for types. 34 | 35 | dnl Checks for structures. 36 | 37 | dnl Checks for compiler characteristics. 38 | AC_C_INLINE 39 | AM_PROG_CC_C_O 40 | 41 | dnl Checks for library functions. 42 | AC_FUNC_MEMCMP 43 | AC_FUNC_MALLOC 44 | AC_FUNC_REALLOC 45 | 46 | 47 | AC_DEFUN([AX_CHECK_BUILTIN], [ 48 | AC_CACHE_CHECK([whether we have $1], [ac_cv_have_$1], 49 | [AC_LINK_IFELSE([AC_LANG_PROGRAM([$2], [$3])], 50 | [ac_cv_have_$1=yes], [ac_cv_have_$1=no])]) 51 | if test $ac_cv_have_$1 = yes; then 52 | AC_DEFINE(AS_TR_CPP([HAVE_$1]), [1], [whether we have $1]) 53 | fi 54 | ]) 55 | 56 | AX_CHECK_BUILTIN([__builtin_clzl], [[unsigned long x = 1;]], 57 | [[int main(void) { return __builtin_clzll(x) == (sizeof(int)*8 - 1) ? 
0 : 1; } 58 | ]]) 59 | 60 | AX_CHECK_BUILTIN([X86_ASM], [[ 61 | #define bool int 62 | #define HAVE_X86_ASM 1 63 | #include "src/util/BitOps.h" 64 | unsigned long x = 2; 65 | ]], 66 | [[int main(void) { return find_last_bit_set(2)-2 ; } 67 | ]]) 68 | 69 | dnl Checks for system services. 70 | 71 | dnl Output. 72 | AC_CONFIG_FILES([ 73 | Makefile 74 | ]) 75 | AC_CONFIG_FILES([tests/test1.test], [chmod +x tests/test1.test]) 76 | 77 | AC_OUTPUT([]) 78 | -------------------------------------------------------------------------------- /src/InterpolatedNgramLM.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef INTERPOLATEDNGRAMLM_H 36 | #define INTERPOLATEDNGRAMLM_H 37 | 38 | #include <vector> 39 | #include "util/SharedPtr.h" 40 | #include "Types.h" 41 | #include "NgramModel.h" 42 | #include "NgramLM.h" 43 | #include "Mask.h" 44 | 45 | using std::vector; 46 | 47 | namespace mitlm { 48 | //////////////////////////////////////////////////////////////////////////////// 49 | 50 | enum Interpolation { 51 | LinearInterpolation = 0, 52 | CountMerging = 1, 53 | GeneralizedLinearInterpolation = 2, 54 | LI = LinearInterpolation, 55 | CM = CountMerging, 56 | GLI = GeneralizedLinearInterpolation 57 | }; 58 | 59 | class InterpolatedNgramLM : public NgramLMBase { 60 | protected: 61 | vector > _lms; 62 | vector > _featureList; 63 | Interpolation _interpolation; 64 | ProbVector _weights; 65 | ProbVector _totWeights; 66 | IntVector _paramStarts; 67 | ParamVector _paramDefaults; 68 | BitVector _paramMask; 69 | bool _tieParamOrder; 70 | bool _tieParamLM; 71 | 72 | public: 73 | InterpolatedNgramLM(size_t order = 3, 74 | bool tieParamOrder = false, 75 | bool tieParamLM = false) 76 | : NgramLMBase(order), _interpolation(LI), 77 | _tieParamOrder(tieParamOrder), _tieParamLM(tieParamLM) { } 78 | void LoadLMs(const vector > &lms); 79 | void SetInterpolation(Interpolation interpolation, 80 | const vector > &featureList); 81 | 
SharedPtr &lms(int l) { return _lms[l]; } 82 | size_t numLMs() { return _lms.size(); } 83 | 84 | virtual Mask *GetMask(vector &probMaskVectors, 85 | vector &bowMaskVectors) const; 86 | virtual bool Estimate(const ParamVector &params, Mask *pMask=NULL); 87 | 88 | private: 89 | void _EstimateProbs(const ParamVector &params); 90 | void _EstimateBows(); 91 | void _EstimateProbsMasked(const ParamVector &params, 92 | InterpolatedNgramLMMask *pMask); 93 | void _EstimateBowsMasked(InterpolatedNgramLMMask *pMask); 94 | }; 95 | 96 | } 97 | 98 | #endif // INTERPOLATEDNGRAMLM_H 99 | -------------------------------------------------------------------------------- /src/KneserNeySmoothing.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. 
// 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef KNESERNEYSMOOTHING_H 36 | #define KNESERNEYSMOOTHING_H 37 | 38 | #include "Types.h" 39 | #include "Smoothing.h" 40 | #include "NgramLM.h" 41 | #include "Mask.h" 42 | 43 | /////////////////////////////////////////////////////////////////////////////// 44 | 45 | namespace mitlm { 46 | 47 | class KneserNeySmoothing : public Smoothing { 48 | protected: 49 | NgramLM *_pLM; 50 | size_t _order; 51 | size_t _discOrder; 52 | bool _tuneParams; 53 | ProbVector _ngramWeights; 54 | ProbVector _invHistCounts; 55 | ParamVector _discParams; 56 | IntVector _paramIndices; 57 | 58 | public: 59 | KneserNeySmoothing(size_t discOrder=3, bool tuneParams=false) 60 | : _discOrder(discOrder), _tuneParams(tuneParams) { } 61 | virtual void Initialize(NgramLM *pLM, size_t order); 62 | virtual void UpdateMask(NgramLMMask &lmMask) const; 63 | virtual bool Estimate(const ParamVector &params, const NgramLMMask *pMask, 64 | ProbVector &probs, ProbVector &bows); 65 | 66 | protected: 67 | void _ComputeWeights(const ParamVector 
&featParams); 68 | void _Estimate(ProbVector &probs, ProbVector &bows); 69 | void _EstimateMasked(const NgramLMMask *pMask, 70 | ProbVector &probs, ProbVector &bows); 71 | void _EstimateWeighted(ProbVector &probs, ProbVector &bows); 72 | void _EstimateWeightedMasked(const NgramLMMask *pMask, 73 | ProbVector &probs, ProbVector &bows); 74 | }; 75 | 76 | } 77 | 78 | #endif // KNESERNEYSMOOTHING_H 79 | -------------------------------------------------------------------------------- /src/Lattice.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef LATTICE_H 36 | #define LATTICE_H 37 | 38 | #include 39 | #include "util/FastHash.h" 40 | #include "util/ZFile.h" 41 | #include "Types.h" 42 | #include "NgramLM.h" 43 | 44 | using std::vector; 45 | using std::string; 46 | 47 | //////////////////////////////////////////////////////////////////////////////// 48 | 49 | namespace mitlm { 50 | 51 | const float INF = std::numeric_limits::infinity(); 52 | 53 | inline float logAdd(float logX, float logY) { 54 | if (logY > logX) 55 | std::swap(logX, logY); 56 | float negDiff = logY - logX; 57 | if (negDiff < -20) 58 | return logX; 59 | return logX + std::log(1.0f + std::exp(negDiff)); 60 | } 61 | 62 | class Lattice { 63 | friend class WordErrorRateOptimizer; 64 | 65 | struct ArcNgramIndex { 66 | ArcNgramIndex(uint a=0, uint o=0, NgramIndex i=0) : 67 | arcIndex(a), order(o), ngramIndex(i) { } 68 | uint arcIndex : 28; 69 | uint order : 4; 70 | NgramIndex ngramIndex; 71 | }; 72 | typedef DenseVector ArcNgramIndexVector; 73 | 74 | struct ArcScore { 75 | ArcScore(uint a, float s) : arc(a), score(s) { } 76 | uint arc; 77 | float score; 78 | }; 79 | typedef DenseVector ArcScoreVector; 80 | 81 | struct WordProb { 82 | WordProb(VocabIndex w, float p) : word(w), prob(p) { } 83 | VocabIndex word; 84 | float prob; 85 | }; 86 | 87 | const NgramLMBase & _lm; 
88 | string _tag; 89 | NodeIndex _finalNode; 90 | NodeVector _arcStarts; 91 | NodeVector _arcEnds; 92 | VocabVector _arcWords; 93 | FloatVector _arcBaseWeights; 94 | FloatVector _arcWeights; 95 | UIntVector _nodeArcs; 96 | VocabVector _ref; 97 | UIntVector _oraclePath; 98 | int _oracleWER; 99 | ArcNgramIndexVector _arcProbs; 100 | ArcNgramIndexVector _arcBows; 101 | bool _skipTags; 102 | 103 | public: 104 | Lattice(const NgramLMBase &lm) : _lm(lm), _skipTags(true) { } 105 | void SetTag(const char *tag) { _tag = tag; } 106 | void LoadLattice(ZFile &latticeFile); 107 | void SaveLattice(ZFile &latticeFile) const; 108 | void UpdateWeights(); 109 | void SetReferenceText(const char *ref); 110 | float ComputeMargin() const; 111 | int ComputeWER() const; 112 | void GetBestPath(vector &bestPath) const; 113 | 114 | void ComputeForwardScores(FloatVector &nodeScores) const; 115 | void ComputeBackwardScores(FloatVector &nodeScores) const; 116 | void ComputePosteriorProbs(const FloatVector &forwardScores, 117 | const FloatVector &backwardScores, 118 | FloatVector &arcProbs) const; 119 | void ComputeForwardSteps(const FloatVector &forwardScores, 120 | FloatVector &nodeSteps) const; 121 | void ComputeBackwardSteps(const FloatVector &backwardScores, 122 | FloatVector &nodeSteps) const; 123 | void EstimateArcPosition(const FloatVector &forwardScores, 124 | const FloatVector &backwardScores, 125 | FloatVector &nodePositions) const; 126 | float BuildConfusionNetwork() const; 127 | 128 | void Serialize(FILE *outFile) const; 129 | void Deserialize(FILE *inFile); 130 | 131 | const char * tag() const { return _tag.c_str(); } 132 | const VocabVector &refWords() const { return _ref; } 133 | const NodeVector &arcStarts() const { return _arcStarts; } 134 | const NodeVector &arcEnds() const { return _arcEnds; } 135 | const VocabVector &arcWords() const { return _arcWords; } 136 | const FloatVector &arcWeights() const { return _arcWeights; } 137 | const UIntVector &oraclePath() const { return 
_oraclePath; } 138 | int oracleWER() const { return _oracleWER; } 139 | NodeIndex numNodes() const { return _finalNode + 1; } 140 | 141 | private: 142 | template 143 | void _Sort(size_t numTrans, const Compare &compare); 144 | void _Reserve(size_t capacity); 145 | void _ComputeArcNgramMapping(); 146 | float _FindOraclePath(); 147 | void _ReverseViterbiSearch(ArcScoreVector &bestArcs) const; 148 | float _FindBestPath(const ArcScoreVector &bestArcs, 149 | vector &bestPath) const; 150 | void _FindNBestPaths(const ArcScoreVector &bestArcs, 151 | size_t n, vector &nbestScores) const; 152 | bool _IsOracleBestPath(const ArcScoreVector &bestArcs) const; 153 | 154 | }; 155 | 156 | } 157 | 158 | #endif // LATTICE_H 159 | -------------------------------------------------------------------------------- /src/Mask.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. 
// 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef FILTER_H 36 | #define FILTER_H 37 | 38 | #include 39 | #include "util/SharedPtr.h" 40 | #include "Types.h" 41 | 42 | using std::vector; 43 | 44 | namespace mitlm { 45 | 46 | //////////////////////////////////////////////////////////////////////////////// 47 | 48 | struct Mask { 49 | virtual ~Mask() { } 50 | }; 51 | 52 | //////////////////////////////////////////////////////////////////////////////// 53 | 54 | struct NgramLMMask : public Mask { 55 | vector ProbMaskVectors; 56 | vector BowMaskVectors; 57 | vector > SmoothingMasks; 58 | }; 59 | 60 | //////////////////////////////////////////////////////////////////////////////// 61 | 62 | struct KneserNeySmoothingMask : public Mask { 63 | BitVector DiscMask; 64 | }; 65 | 66 | //////////////////////////////////////////////////////////////////////////////// 67 | 68 | struct InterpolatedNgramLMMask : public Mask { 69 | vector ProbMaskVectors; 70 | vector BowMaskVectors; 71 | vector WeightMaskVectors; 72 | vector > LMMasks; 73 | }; 74 | 75 | } 76 | 77 
| #endif // FILTER_H 78 | -------------------------------------------------------------------------------- /src/MaxLikelihoodSmoothing.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #include "Types.h" 36 | #include "NgramLM.h" 37 | #include "Mask.h" 38 | #include "Smoothing.h" 39 | #include "MaxLikelihoodSmoothing.h" 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | 43 | namespace mitlm { 44 | 45 | Smoothing::~Smoothing() { } 46 | 47 | //////////////////////////////////////////////////////////////////////////////// 48 | 49 | void 50 | MaxLikelihoodSmoothing::Initialize(NgramLM *pLM, size_t order) { 51 | assert(order != 0); 52 | _pLM = pLM; 53 | _order = order; 54 | _effCounts.attach(_pLM->counts(_order)); 55 | } 56 | 57 | bool 58 | MaxLikelihoodSmoothing::Estimate(const ParamVector ¶ms, 59 | const NgramLMMask *pMask, 60 | ProbVector &probs, 61 | ProbVector &bows) { 62 | if (!_estimated) { 63 | const CountVector &counts(_pLM->counts(_order)); 64 | const IndexVector &hists(_pLM->hists(_order)); 65 | 66 | // Compute inverse of sum of adjusted counts for each history. 67 | CountVector histCounts(_pLM->sizes(_order - 1), 0); 68 | ProbVector invHistCounts(histCounts.length()); 69 | BinCount(hists, histCounts); 70 | invHistCounts = 1.0 / asDouble(histCounts); 71 | 72 | // Compute maximum likelihood probability. 0 backoff. 
73 | probs = counts * invHistCounts[hists]; 74 | bows.set(0); 75 | _estimated = true; 76 | } 77 | return true; 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/MaxLikelihoodSmoothing.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef MAXLIKELIHOODSMOOTHING_H 36 | #define MAXLIKELIHOODSMOOTHING_H 37 | 38 | #include "Smoothing.h" 39 | #include "NgramLM.h" 40 | #include "Mask.h" 41 | 42 | /////////////////////////////////////////////////////////////////////////////// 43 | 44 | namespace mitlm { 45 | 46 | class MaxLikelihoodSmoothing : public Smoothing { 47 | NgramLM *_pLM; 48 | size_t _order; 49 | bool _estimated; 50 | 51 | public: 52 | MaxLikelihoodSmoothing() : _pLM(NULL), _order(0), _estimated(false) { } 53 | virtual void Initialize(NgramLM *pLM, size_t order); 54 | virtual void UpdateMask(NgramLMMask &lmMask) const { } 55 | virtual bool Estimate(const ParamVector ¶ms, 56 | const NgramLMMask *pMask, 57 | ProbVector &probs, 58 | ProbVector &bows); 59 | }; 60 | 61 | } 62 | 63 | #endif // MAXLIKELIHOODSMOOTHING_H 64 | -------------------------------------------------------------------------------- /src/NgramLM.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. 
// 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef NGRAMLM_H 36 | #define NGRAMLM_H 37 | 38 | #include 39 | #include "util/SharedPtr.h" 40 | #include "Types.h" 41 | #include "Vocab.h" 42 | #include "NgramModel.h" 43 | #include "Smoothing.h" 44 | #include "Mask.h" 45 | 46 | using std::vector; 47 | 48 | //////////////////////////////////////////////////////////////////////////////// 49 | 50 | namespace mitlm { 51 | class NgramLMBase { 52 | friend class PerplexityOptimizer; 53 | 54 | protected: 55 | SharedPtr _pModel; 56 | size_t _order; 57 | vector _probVectors; 58 | vector _bowVectors; 59 | ParamVector _defParams; 60 | 61 | public: 62 | NgramLMBase(size_t order = 3); 63 | virtual ~NgramLMBase() { } 64 | void UseUnknown() { _pModel->UseUnknown(); } 65 | void LoadVocab(ZFile &vocabFile); 66 | void SaveVocab(ZFile &vocabFile, bool asBinary=false) const; 67 | void SaveLM(ZFile &lmFile, bool asBinary=false) const; 68 | void Serialize(FILE *outFile) const; 69 | void Deserialize(FILE *inFile); 70 | 71 | virtual void SetOrder(size_t order); 72 | virtual Mask *GetMask(vector &probMaskVectors, 73 | vector &bowMaskVectors) const; 74 | virtual bool Estimate(const ParamVector ¶ms, Mask *pMask=NULL); 75 | virtual void SetModel(const SharedPtr &m, 76 | const VocabVector &vocabMap, 77 | const vector &ngramMap); 78 | 79 | size_t order() const { return _order; } 80 | size_t sizes(size_t o) const { return _pModel->sizes(o); } 81 | const Vocab & vocab() const { return _pModel->vocab(); } 82 | const NgramModel & model() const { return *_pModel; } 83 | const VocabVector &words(size_t o) const { return _pModel->words(o); } 84 | const IndexVector &hists(size_t o) const { return _pModel->hists(o); } 85 | const IndexVector &backoffs(size_t o) const { return _pModel->backoffs(o); } 86 | const ProbVector &probs(size_t o) const { return _probVectors[o]; } 87 | const ProbVector &bows(size_t o) const { return _bowVectors[o]; } 88 | const 
ParamVector &defParams() const { return _defParams; } 89 | }; 90 | 91 | //////////////////////////////////////////////////////////////////////////////// 92 | 93 | class ArpaNgramLM : public NgramLMBase { 94 | public: 95 | ArpaNgramLM(size_t order = 3) : NgramLMBase(order) { } 96 | void LoadLM(ZFile &lmFile); 97 | }; 98 | 99 | //////////////////////////////////////////////////////////////////////////////// 100 | 101 | class NgramLM : public NgramLMBase { 102 | protected: 103 | vector > _smoothings; 104 | vector _countVectors; 105 | vector _featureList; 106 | IntVector _paramStarts; 107 | 108 | public: 109 | NgramLM(size_t order = 3) : NgramLMBase(order), _countVectors(order + 1), 110 | _featureList(order + 1) { } 111 | void Initialize(const char *vocab, bool useUnknown, 112 | const char *text, const char *counts, 113 | const char *smoothing, const char *features); 114 | void LoadCorpus(ZFile &corpusFile, bool reset=false); 115 | void LoadCounts(ZFile &countsFile, bool reset=false); 116 | void SaveCounts(ZFile &countsFile, bool asBinary=false) const; 117 | void SaveEffCounts(ZFile &countsFile, bool asBinary=false) const; 118 | void SetSmoothingAlgs(const vector > &smoothings); 119 | void SetWeighting(const vector &featureList); 120 | 121 | virtual void SetOrder(size_t order); 122 | virtual Mask *GetMask(vector &probMaskVectors, 123 | vector &bowMaskVectors) const; 124 | virtual bool Estimate(const ParamVector ¶ms, Mask *pMask=NULL); 125 | virtual void SetModel(const SharedPtr &m, 126 | const VocabVector &vocabMap, 127 | const vector &ngramMap); 128 | 129 | const CountVector &counts(size_t o) const { return _countVectors[o]; } 130 | const FeatureVectors &features(size_t o) const { return _featureList[o]; } 131 | }; 132 | } 133 | 134 | #endif // NGRAMLM_H 135 | -------------------------------------------------------------------------------- /src/NgramModel.h: -------------------------------------------------------------------------------- 1 | 
//////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2010-2013, Giulio Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. // 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #ifndef NGRAMMODEL_H 37 | #define NGRAMMODEL_H 38 | 39 | #include 40 | #include "util/ZFile.h" 41 | #include "Types.h" 42 | #include "Vocab.h" 43 | #include "NgramVector.h" 44 | 45 | using std::vector; 46 | 47 | namespace mitlm { 48 | 49 | //////////////////////////////////////////////////////////////////////////////// 50 | // NgramModel represents a full n-gram model, consisting of a NgramVector for 51 | // each order of the model. The model consists of a vocab mapping words to 52 | // indices. At the minimum, each n-gram is represented by its history index 53 | // and target word index. Additional fields, such as backoff index and counts, 54 | // can be associated with each n-gram using vectors with aligned indices. 55 | // Various methods are provides for efficiently loading/saving n-gram data. 56 | // 57 | class NgramModel { 58 | protected: 59 | Vocab _vocab; 60 | vector _vectors; 61 | vector _backoffVectors; 62 | 63 | public: 64 | NgramModel(size_t order = 3); 65 | void UseUnknown() { _vocab.UseUnknown(); } 66 | void SetOrder(size_t order); 67 | void LoadVocab(ZFile &vocabFile); 68 | void SaveVocab(ZFile &vocabFile, bool asBinary=false) const; 69 | void LoadCorpus(vector &countVectors, 70 | ZFile &corpusFile, bool reset=false); 71 | void LoadCounts(vector &countVectors, 72 | ZFile &countsFile, bool reset=false); 73 | void SaveCounts(const vector &countVectors, 74 | ZFile &countsFile, bool includeZeroOrder=false) const; 75 | void LoadLM(vector &probVectors, 76 | vector &bowVectors, 77 | ZFile &lmFile); 78 | void SaveLM(const vector &probVectors, 79 | const vector &bowVectors, 80 | ZFile &lmFile) const; 81 | void LoadEvalCorpus(vector &probCountVectors, 82 | vector &bowCountVectors, 83 | BitVector &vocabMask, ZFile &corpusFile, 84 | size_t &outNumOOV, size_t &outNumWords) const; 85 | void LoadFeatures(vector &featureVectors, 86 | ZFile &featureFile, size_t maxOrder=0) 
const; 87 | void LoadComputedFeatures(vector &featureVectors, 88 | const char *featureFile, 89 | size_t maxOrder=0) const; 90 | void SaveFeatures(vector &featureVectors, 91 | ZFile &featureFile) const; 92 | size_t GetNgramWords(size_t order, NgramIndex index, StrVector &wrds) const; 93 | void ExtendModel(const NgramModel &m, VocabVector &vocabMap, 94 | vector &ngramMap); 95 | void SortModel(VocabVector &vocabMap, vector &ngramMap); 96 | void Serialize(FILE *outFile) const; 97 | void Deserialize(FILE *inFile); 98 | 99 | template 100 | static void ApplySort(const IndexVector &ngramMap, DenseVector &data, 101 | size_t length = 0, T defValue = T()) { 102 | assert(data.length() >= ngramMap.length()); 103 | if (length == 0) length = ngramMap.length(); 104 | DenseVector sortedData(length, defValue); 105 | for (size_t i = 0; i < ngramMap.length(); ++i) 106 | sortedData[ngramMap[i]] = data[i]; 107 | data.swap(sortedData); 108 | } 109 | 110 | size_t size() const { return _vectors.size(); } 111 | size_t sizes(size_t o) const { return _vectors[o].size(); } 112 | const Vocab & vocab() const { return _vocab; } 113 | const NgramVector &vectors(size_t o) const { return _vectors[o]; } 114 | const VocabVector &words(size_t o) const { return _vectors[o].words(); } 115 | const IndexVector &hists(size_t o) const { return _vectors[o].hists(); } 116 | const IndexVector &backoffs(size_t o) const { return _backoffVectors[o];} 117 | 118 | protected: 119 | NgramIndex _Find(const VocabIndex *words, size_t wordsLen) const; 120 | void _ComputeBackoffs(); 121 | void _LoadFrequency(vector &freqVectors, 122 | ZFile &corpusFile, size_t maxSize=0) const; 123 | void _LoadEntropy(vector &entropyVectors, 124 | ZFile &corpusFile, size_t maxSize=0) const; 125 | void _LoadTopicProbs(vector &topicProbVectors, 126 | ZFile &hmmldaFile, size_t maxSize, 127 | bool onlyTargetWord=false) const; 128 | void _LoadTopicProbs2(vector &topicProbVectors, 129 | ZFile &hmmldaFile, size_t maxSize) const; 130 | }; 131 | 
132 | } 133 | 134 | #endif // NGRAMMODEL_H 135 | -------------------------------------------------------------------------------- /src/NgramVector.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef NGRAMVECTOR_H 36 | #define NGRAMVECTOR_H 37 | 38 | #include "Types.h" 39 | 40 | namespace mitlm { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // NgramVector represents the n-gram structure within a particular order of the 44 | // n-gram trie. For each n-gram, it stores the index of the history n-gram in 45 | // the lower-order NgramVector and the index corresponding to the target word. 46 | // The n-grams can be accessed by index. Lookup of the n-gram index can be 47 | // performed in constant time. 
48 | // 49 | class NgramVector { 50 | friend class NgramModel; 51 | friend class NgramIndexCompare; 52 | 53 | protected: 54 | size_t _length; 55 | VocabVector _words; 56 | IndexVector _hists; 57 | IndexVector _indices; // Index table mapping value to index 58 | size_t _hashMask; // Hash mask: hashIndex = hash & hashMask 59 | mutable VocabVector _wordsView; 60 | mutable IndexVector _histsView; 61 | 62 | public: 63 | static const NgramIndex Invalid; // = (NgramIndex)-1; 64 | 65 | NgramVector(); 66 | NgramVector(const NgramVector &v); 67 | NgramIndex Find(NgramIndex hist, VocabIndex word) const; 68 | NgramIndex Add(NgramIndex hist, VocabIndex word); 69 | NgramIndex Add(NgramIndex hist, VocabIndex word, bool *outNew); 70 | void Reserve(size_t capacity); 71 | bool Sort(const VocabVector &vocabMap, const IndexVector &boNgramMap, 72 | IndexVector &ngramMap); 73 | void Serialize(FILE *outFile) const; 74 | void Deserialize(FILE *inFile); 75 | 76 | size_t size() const { return _length; } 77 | size_t capacity() const { return _indices.length(); } 78 | const VocabVector &words() const { return _wordsView; } 79 | const IndexVector &hists() const { return _histsView; } 80 | 81 | protected: 82 | NgramIndex *_FindIndex(NgramIndex hist, VocabIndex word); 83 | void _Reindex(size_t indexSize); 84 | }; 85 | 86 | } 87 | 88 | #endif // NGRAMVECTOR_H 89 | -------------------------------------------------------------------------------- /src/PerplexityOptimizer.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. 
// 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #include 36 | #include "util/Logger.h" 37 | #include "PerplexityOptimizer.h" 38 | 39 | //////////////////////////////////////////////////////////////////////////////// 40 | 41 | namespace mitlm { 42 | 43 | void 44 | PerplexityOptimizer::LoadCorpus(ZFile &corpusFile) { 45 | //const CountVector &counts(_lm.counts(1)); 46 | //BitVector vocabMask = (_lm.counts > 0); 47 | BitVector vocabMask(_lm.vocab().size(), 1); 48 | _lm._pModel->LoadEvalCorpus(_probCountVectors, _bowCountVectors, 49 | vocabMask, corpusFile, _numOOV, _numWords); 50 | 51 | vector probMaskVectors(_order + 1); 52 | vector bowMaskVectors(_order); 53 | for (size_t o = 0; o <= _order; o++) 54 | probMaskVectors[o] = (_probCountVectors[o] > 0); 55 | for (size_t o = 0; o < _order; o++) 56 | bowMaskVectors[o] = (_bowCountVectors[o] > 0); 57 | _mask = _lm.GetMask(probMaskVectors, bowMaskVectors); 58 | } 59 | 60 | double 61 | PerplexityOptimizer::ComputeEntropy(const ParamVector ¶ms) { 62 | // Estimate model. 63 | if (!_lm.Estimate(params, _mask)) 64 | return 7; // Out of bounds. Corresponds to perplexity = 1100. 65 | 66 | // Compute total log probability and num zero probs. 
67 | _totLogProb = 0.0; 68 | _numZeroProbs = 0; 69 | for (size_t o = 0; o <= _order; o++) { 70 | // assert(alltrue(counts == 0 || probs > 0)); 71 | // _totLogProb += dot(log(probs), counts, counts > 0); 72 | // _totLogProb += sum((log(probs) * counts)[counts > 0]); 73 | const CountVector &counts(_probCountVectors[o]); 74 | const ProbVector & probs(_lm.probs(o)); 75 | for (size_t i = 0; i < counts.length(); i++) { 76 | if (counts[i] > 0) { 77 | assert(std::isfinite(probs[i])); 78 | if (probs[i] == 0) 79 | _numZeroProbs++; 80 | else 81 | _totLogProb += std::log(probs[i]) * counts[i]; 82 | } 83 | } 84 | } 85 | for (size_t o = 0; o < _order; o++) { 86 | // assert(allTrue(counts == 0 || bows > 0)); 87 | // _totLogProb += dot(log(bows), counts, counts > 0); 88 | const CountVector &counts(_bowCountVectors[o]); 89 | const ProbVector & bows(_lm.bows(o)); 90 | for (size_t i = 0; i < counts.length(); i++) { 91 | if (counts[i] > 0) { 92 | assert(std::isfinite(bows[i])); 93 | assert(bows[i] != 0); 94 | if (bows[i] == 0) 95 | Logger::Warn(1, "Invalid BOW %lu %lu %i\n", o,i,counts[i]); 96 | _totLogProb += std::log(bows[i]) * counts[i]; 97 | } 98 | } 99 | } 100 | 101 | double entropy = -_totLogProb / (_numWords - _numZeroProbs); 102 | if (Logger::GetVerbosity() > 2) 103 | std::cout << std::exp(entropy) << "\t" << params << std::endl; 104 | else 105 | Logger::Log(2, "%f\n", std::exp(entropy)); 106 | return std::isnan(entropy) ? 
7 : entropy; 107 | } 108 | 109 | double 110 | PerplexityOptimizer::Optimize(ParamVector ¶ms, Optimization technique) { 111 | _numCalls = 0; 112 | ComputeEntropyFunc func(*this); 113 | int numIter; 114 | double minEntropy; 115 | clock_t startTime = clock(); 116 | switch (technique) { 117 | case PowellOptimization: 118 | minEntropy = MinimizePowell(func, params, numIter); 119 | break; 120 | case LBFGSOptimization: 121 | minEntropy = MinimizeLBFGS(func, params, numIter); 122 | break; 123 | case LBFGSBOptimization: 124 | minEntropy = MinimizeLBFGSB(func, params, numIter); 125 | break; 126 | default: 127 | throw std::runtime_error("Unsupported optimization technique."); 128 | } 129 | clock_t endTime = clock(); 130 | 131 | Logger::Log(1, "Iterations = %i\n", numIter); 132 | Logger::Log(1, "Elapsed Time = %f\n", 133 | (double)(endTime - startTime) / CLOCKS_PER_SEC); 134 | Logger::Log(1, "Perplexity = %f\n", std::exp(minEntropy)); 135 | Logger::Log(1, "Num OOVs = %lu\n", _numOOV); 136 | Logger::Log(1, "Num ZeroProbs = %lu\n", _numZeroProbs); 137 | Logger::Log(1, "Func Evals = %lu\n", _numCalls); 138 | Logger::Log(1, "OptParams = [ "); 139 | for (size_t i = 0; i < params.length(); i++) 140 | Logger::Log(1, "%f ", params[i]); 141 | Logger::Log(1, "]\n"); 142 | return minEntropy; 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/PerplexityOptimizer.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. 
// 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef PERPLEXITYOPTIMIZER_H 36 | #define PERPLEXITYOPTIMIZER_H 37 | 38 | #include 39 | #include "optimize/Optimization.h" 40 | #include "Types.h" 41 | #include "NgramLM.h" 42 | #include "Mask.h" 43 | 44 | //////////////////////////////////////////////////////////////////////////////// 45 | 46 | namespace mitlm { 47 | 48 | class PerplexityOptimizer { 49 | protected: 50 | NgramLMBase & _lm; 51 | size_t _order; 52 | vector _probCountVectors; 53 | vector _bowCountVectors; 54 | size_t _numOOV; 55 | size_t _numWords; 56 | size_t _numZeroProbs; 57 | size_t _numCalls; 58 | double _totLogProb; 59 | SharedPtr _mask; 60 | 61 | class ComputeEntropyFunc { 62 | PerplexityOptimizer &_obj; 63 | public: 64 | ComputeEntropyFunc(PerplexityOptimizer &obj) : _obj(obj) { } 65 | double operator()(const ParamVector ¶ms) 66 | { _obj._numCalls++; return _obj.ComputeEntropy(params); } 67 | }; 68 | 69 | public: 70 | PerplexityOptimizer(NgramLMBase &lm, size_t order=3) 71 | : _lm(lm), _order(order) { } 72 | 73 | void SetOrder(size_t order) { _order = order; } 74 | void LoadCorpus(ZFile &corpusFile); 75 | double ComputeEntropy(const ParamVector ¶ms); 76 | double ComputePerplexity(const ParamVector ¶ms) 77 | { return std::exp(ComputeEntropy(params)); } 78 | double Optimize(ParamVector ¶ms, 79 | Optimization technique=PowellOptimization); 80 | }; 81 | 82 | } 83 | 84 | #endif // PERPLEXITYOPTIMIZER_H 85 | -------------------------------------------------------------------------------- /src/Smoothing.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. 
// 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #include "MaxLikelihoodSmoothing.h" 36 | #include "KneserNeySmoothing.h" 37 | 38 | namespace mitlm { 39 | 40 | Smoothing * 41 | Smoothing::Create(const char *smoothing) { 42 | if (strcmp(smoothing, "FixKN") == 0) { 43 | return new KneserNeySmoothing(1, false); 44 | } else if (strcmp(smoothing, "FixModKN") == 0) { 45 | return new KneserNeySmoothing(3, false); 46 | } else if (strncmp(smoothing, "FixKN", 5) == 0) { 47 | for (size_t i = 5; i < strlen(smoothing); ++i) 48 | if (!isdigit(smoothing[i])) return NULL; 49 | return new KneserNeySmoothing(atoi(&smoothing[5]), false); 50 | } else if (strcmp(smoothing, "KN") == 0) { 51 | return new KneserNeySmoothing(1, true); 52 | } else if (strcmp(smoothing, "ModKN") == 0) { 53 | return new KneserNeySmoothing(3, true); 54 | } else if (strncmp(smoothing, "KN", 2) == 0) { 55 | for (size_t i = 2; i < strlen(smoothing); ++i) 56 | if (!isdigit(smoothing[i])) return NULL; 57 | return new KneserNeySmoothing(atoi(&smoothing[2]), true); 58 | } else if (strcmp(smoothing, "ML") == 0) { 59 | return new MaxLikelihoodSmoothing(); 60 | } else 61 | return NULL; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/Smoothing.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. 
// 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef SMOOTHING_H 36 | #define SMOOTHING_H 37 | 38 | #include "Types.h" 39 | #include "Mask.h" 40 | 41 | namespace mitlm { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | 45 | // Forward declaration. 
46 | class NgramLM; 47 | 48 | //////////////////////////////////////////////////////////////////////////////// 49 | 50 | class Smoothing { 51 | protected: 52 | ParamVector _defParams; 53 | CountVector _effCounts; 54 | Smoothing() { } 55 | 56 | public: 57 | virtual ~Smoothing(); 58 | virtual void Initialize(NgramLM *pLM, size_t order) = 0; 59 | virtual void UpdateMask(NgramLMMask &lmMask) const = 0; 60 | virtual bool Estimate(const ParamVector ¶ms, const NgramLMMask *pMask, 61 | ProbVector &probs, ProbVector &bows) = 0; 62 | 63 | const ParamVector &defParams() const { return _defParams; } 64 | const CountVector &effCounts() const { return _effCounts; } 65 | 66 | public: 67 | static Smoothing *Create(const char *smoothing); 68 | }; 69 | 70 | } 71 | 72 | #endif // SMOOTHING_H 73 | -------------------------------------------------------------------------------- /src/Types.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. 
// 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 |
#ifndef TYPES_H
#define TYPES_H

// NOTE(review): the header name of the next directive is missing in this copy
// of the file (anything written in angle brackets was stripped).  Restore it
// from the upstream sources; std::vector is used below, so <vector> is a
// likely candidate -- confirm.
#include
#include "vector/DenseVector.h"
#include "vector/VectorBuilder.h"
#include "vector/VectorOps.h"

////////////////////////////////////////////////////////////////////////////////

namespace mitlm {
// Type aliases.
typedef unsigned char  byte;
typedef unsigned short ushort;
typedef unsigned int   uint;

// Defines the size of basic types.
// Index and count types are plain (signed) int; log-probabilities use float
// while probabilities and parameters use double.
typedef int    VocabIndex;
typedef int    NgramIndex;
typedef int    Count;
typedef float  LProb;
typedef double Prob;
typedef double Param;
typedef uint   NodeIndex;

// Vector aliases.
// NOTE(review): every alias below has lost its template argument list in this
// copy (DenseVector and std::vector are written without `<...>`), so the
// element types are not recoverable from this text alone.  The alias names
// suggest them (e.g. VocabVector over VocabIndex, ParamVector over Param),
// but restore the exact arguments from the upstream sources rather than
// guessing.
typedef mitlm::DenseVector StrVector;
typedef mitlm::DenseVector BitVector;
typedef mitlm::DenseVector ByteVector;
typedef mitlm::DenseVector ShortVector;
typedef mitlm::DenseVector IntVector;
typedef mitlm::DenseVector UIntVector;
typedef mitlm::DenseVector SizeVector;
typedef mitlm::DenseVector FloatVector;
typedef mitlm::DenseVector DoubleVector;
typedef mitlm::DenseVector VocabVector;
typedef mitlm::DenseVector CountVector;
typedef mitlm::DenseVector IndexVector;
typedef mitlm::DenseVector ProbVector;
typedef mitlm::DenseVector ParamVector;
typedef mitlm::DenseVector NodeVector;

typedef std::vector FeatureVectors;
}

#endif // TYPES_H
-------------------------------------------------------------------------------- /src/Vocab.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission.
// 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef VOCAB_H 36 | #define VOCAB_H 37 | 38 | #include 39 | #include "util/FastHash.h" 40 | #include "util/ZFile.h" 41 | #include "Types.h" 42 | 43 | namespace mitlm { 44 | 45 | //////////////////////////////////////////////////////////////////////////////// 46 | 47 | struct OffsetLen { 48 | OffsetLen(uint offset = 0, uint len = 0) : Offset(offset), Len(len) { } 49 | uint Offset; // Offset into string buffer 50 | uint Len; // Length of string 51 | }; 52 | 53 | //////////////////////////////////////////////////////////////////////////////// 54 | // Vocab represents a collection of words associated with increasing index. 55 | // Word access by index and index lookup by word can be performed in constant 56 | // time. To support efficient serialization and memory mapping, we store the 57 | // words in a single string buffer. Word lengths are stored along with the 58 | // offsets for convenience and efficient comparison/lookup. 
59 | // 60 | class Vocab { 61 | protected: 62 | typedef DenseVector OffsetLenVector; 63 | 64 | size_t _length; 65 | OffsetLenVector _offsetLens; // Offsets into string buffer and lengths 66 | VocabVector _indices; // Index table mapping string to index 67 | std::string _buffer; // String buffer storing all words 68 | size_t _hashMask; // Hash mask: hashIndex = hash & hashMask 69 | bool _fixedVocab; 70 | VocabIndex _unkIndex; 71 | 72 | public: 73 | static const VocabIndex Invalid; // = (VocabIndex)-1; 74 | static const VocabIndex EndOfSentence; // = (VocabIndex)0; 75 | 76 | Vocab(size_t capacity = 1<<16); 77 | void SetFixedVocab(bool fixedVocab); 78 | void UseUnknown(); 79 | VocabIndex Find(const char *word, size_t len) const; 80 | VocabIndex Find(const char *word) const { return Find(word, strlen(word)); } 81 | VocabIndex Add(const char *word, size_t len); 82 | VocabIndex Add(const char *word) { return Add(word, strlen(word)); } 83 | void Reserve(size_t capacity); 84 | bool Sort(VocabVector &sortMap); 85 | void LoadVocab(ZFile &vocabFile); 86 | void SaveVocab(ZFile &vocabFile, bool asBinary=false) const; 87 | void Serialize(FILE *outFile) const; 88 | void Deserialize(FILE *inFile); 89 | 90 | bool IsFixedVocab() const { return _fixedVocab; } 91 | size_t size() const { return _length; } 92 | size_t wordlen(VocabIndex n) const { return _offsetLens[n].Len; } 93 | const char *operator[](VocabIndex n) const 94 | { return &_buffer[_offsetLens[n].Offset]; } 95 | 96 | protected: 97 | VocabIndex *_FindIndex(const char *word, size_t len); 98 | void _Reindex(size_t indexSize); 99 | }; 100 | 101 | } 102 | 103 | #endif // VOCAB_H 104 | -------------------------------------------------------------------------------- /src/WordErrorRateOptimizer.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // 
All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef WORDERRORRATEOPTIMIZER_H 36 | #define WORDERRORRATEOPTIMIZER_H 37 | 38 | #include 39 | #include "optimize/Optimization.h" 40 | #include "Types.h" 41 | #include "NgramLM.h" 42 | #include "Mask.h" 43 | #include "Lattice.h" 44 | 45 | using std::vector; 46 | 47 | namespace mitlm { 48 | 49 | //////////////////////////////////////////////////////////////////////////////// 50 | 51 | class WordErrorRateOptimizer { 52 | protected: 53 | NgramLMBase & _lm; 54 | size_t _order; 55 | vector _lattices; 56 | size_t _numCalls; 57 | double _worstMargin; 58 | SharedPtr _mask; 59 | 60 | class ComputeMarginFunc { 61 | WordErrorRateOptimizer &_obj; 62 | public: 63 | ComputeMarginFunc(WordErrorRateOptimizer &obj) : _obj(obj) { } 64 | double operator()(const ParamVector ¶ms) 65 | { _obj._numCalls++; return -_obj.ComputeMargin(params); } 66 | }; 67 | 68 | class ComputeWERFunc { 69 | WordErrorRateOptimizer &_obj; 70 | public: 71 | ComputeWERFunc(WordErrorRateOptimizer &obj) : _obj(obj) { } 72 | double operator()(const ParamVector ¶ms) 73 | { _obj._numCalls++; return _obj.ComputeWER(params); } 74 | }; 75 | 76 | public: 77 | WordErrorRateOptimizer(NgramLMBase &lm, size_t order=3) 78 | : _lm(lm), _order(order), _worstMargin(-100) { } 79 | ~WordErrorRateOptimizer(); 80 | 81 | void SetOrder(size_t order) { _order = order; } 82 | void LoadLattices(ZFile &latticesFile); 83 | void SaveLattices(ZFile &latticesFile); 84 | void SaveTranscript(ZFile &transcriptFile); 85 | void SaveUttConfidence(ZFile &confidenceFile); 86 | void SaveWER(ZFile &werFile); 87 | double ComputeMargin(const ParamVector ¶ms); 88 | double ComputeWER(const ParamVector ¶ms); 89 | double ComputeOracleWER() const; 90 | double OptimizeMargin(ParamVector ¶ms, 91 | Optimization technique=PowellOptimization); 92 | double OptimizeWER(ParamVector ¶ms, 93 | Optimization technique=PowellOptimization); 94 | }; 95 | 96 | } 97 | 98 | #endif 
// WORDERRORRATEOPTIMIZER_H 99 | -------------------------------------------------------------------------------- /src/evaluate-ngram.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2010-2013, Giulio Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. // 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #include 37 | #include 38 | #include "util/CommandOptions.h" 39 | #include "util/Logger.h" 40 | #include "util/ZFile.h" 41 | #include "Types.h" 42 | #include "Lattice.h" 43 | #include "PerplexityOptimizer.h" 44 | #include "WordErrorRateOptimizer.h" 45 | 46 | #ifdef F77_DUMMY_MAIN 47 | # ifdef __cplusplus 48 | extern "C" 49 | # endif 50 | int F77_DUMMY_MAIN () { return 1; } 51 | #endif 52 | 53 | using std::vector; 54 | using std::string; 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | 58 | const char *headerDesc = 59 | "Usage: evaluate-ngram [Options]\n" 60 | "\n" 61 | "Evaluates the performance of an n-gram language model. 
It also supports\n" 62 | "various n-gram language model conversions, including changes in order,\n" 63 | "vocabulary, and file format.\n" 64 | "\n" 65 | "Filename argument can be an ASCII file, a compressed file (ending in .Z or .gz),\n" 66 | "or '-' to indicate stdin/stdout.\n"; 67 | 68 | const char *footerDesc_tmpl = 69 | "---------------------------------------------------------------\n" 70 | "| %-59s |\n" 71 | "| Copyright (C) 2009 Bo-June (Paul) Hsu |\n" 72 | "| MIT Computer Science and Artificial Intelligence Laboratory |\n" 73 | "---------------------------------------------------------------\n"; 74 | 75 | //////////////////////////////////////////////////////////////////////////////// 76 | 77 | int main(int argc, char* argv[]) { 78 | // Parse command line options. 79 | char *footerDesc = new char[strlen(footerDesc_tmpl)+strlen(PACKAGE_STRING)+1+59]; 80 | sprintf(footerDesc, footerDesc_tmpl, PACKAGE_STRING); 81 | mitlm::CommandOptions opts(headerDesc, footerDesc); 82 | delete [] footerDesc; 83 | opts.AddOption("h,help", "Print this message."); 84 | opts.AddOption("verbose", "Set verbosity level.", "1", "int"); 85 | opts.AddOption("o,order", "Set the n-gram order of the estimated LM.", "3", "int"); 86 | opts.AddOption("v,vocab", "Fix the vocab to only words from the specified file.", NULL, "file"); 87 | opts.AddOption("l,lm", "Load specified LM.", NULL, "file"); 88 | opts.AddOption("cl,compile-lattices", "[SLS] Compile lattices into a binary format.", NULL, "file"); 89 | opts.AddOption("wb,write-binary", "Write LM/counts files in binary format.", "false", "boolean"); 90 | opts.AddOption("wv,write-vocab", "Write LM vocab to file.", NULL, "file"); 91 | opts.AddOption("wl,write-lm", "Write ARPA backoff LM to file.", NULL, "file"); 92 | opts.AddOption("ep,eval-perp", "Compute test set perplexity.", NULL, "files"); 93 | opts.AddOption("ew,eval-wer", "Compute test set lattice word error rate.", NULL, "files"); 94 | opts.AddOption("em,eval-margin", "Compute test set 
lattice margin.", NULL, "files"); 95 | if (!opts.ParseArguments(argc, (const char **)argv) || 96 | opts["help"] != NULL) { 97 | std::cout << std::endl; 98 | opts.PrintHelp(); 99 | return 1; 100 | } 101 | 102 | // Process basic command line arguments. 103 | size_t order = atoi(opts["order"]); 104 | bool writeBinary = mitlm::AsBoolean(opts["write-binary"]); 105 | mitlm::Logger::SetVerbosity(atoi(opts["verbose"])); 106 | if (!opts["lm"]) { 107 | mitlm::Logger::Error(0, "Language model must be specified using -lm."); 108 | exit(1); 109 | } 110 | 111 | // Load language model. 112 | mitlm::ArpaNgramLM lm(order); 113 | if (opts["vocab"]) { 114 | mitlm::Logger::Log(1, "Loading vocab %s...\n", opts["vocab"]); 115 | mitlm::ZFile vocabZFile(opts["vocab"]); 116 | lm.LoadVocab(vocabZFile); 117 | } 118 | mitlm::Logger::Log(1, "Loading LM %s...\n", opts["lm"]); 119 | mitlm::ZFile lmZFile(opts["lm"], "r"); 120 | lm.LoadLM(lmZFile); 121 | 122 | // Compile lattices. 123 | if (opts["compile-lattices"]) { 124 | mitlm::Logger::Log(0, "Compiling lattices %s:\n", opts["compile-lattices"]); 125 | mitlm::ZFile latticesZFile(opts["compile-lattices"]); 126 | mitlm::WordErrorRateOptimizer eval(lm, order); 127 | eval.LoadLattices(latticesZFile); 128 | string outFile(opts["compile-lattices"]); 129 | outFile += ".bin"; 130 | mitlm::ZFile outZFile(outFile.c_str(), "w"); 131 | eval.SaveLattices(outZFile); 132 | } 133 | 134 | // Evaluate LM. 
135 | mitlm::ParamVector params(lm.defParams()); 136 | if (opts["eval-perp"]) { 137 | mitlm::Logger::Log(0, "Perplexity Evaluations:\n"); 138 | vector evalFiles; 139 | mitlm::trim_split(evalFiles, opts["eval-perp"], ','); 140 | for (size_t i = 0; i < evalFiles.size(); i++) { 141 | mitlm::Logger::Log(1, "Loading eval set %s...\n", evalFiles[i].c_str()); 142 | mitlm::ZFile evalZFile(evalFiles[i].c_str()); 143 | mitlm::PerplexityOptimizer eval(lm, order); 144 | eval.LoadCorpus(evalZFile); 145 | 146 | mitlm::Logger::Log(0, "\t%s\t%.3f\n", evalFiles[i].c_str(), 147 | eval.ComputePerplexity(params)); 148 | } 149 | } 150 | if (opts["eval-margin"]) { 151 | mitlm::Logger::Log(0, "Margin Evaluations:\n"); 152 | vector evalFiles; 153 | mitlm::trim_split(evalFiles, opts["eval-margin"], ','); 154 | for (size_t i = 0; i < evalFiles.size(); i++) { 155 | mitlm::Logger::Log(1, "Loading eval lattices %s...\n", 156 | evalFiles[i].c_str()); 157 | mitlm::ZFile evalZFile(evalFiles[i].c_str()); 158 | mitlm::WordErrorRateOptimizer eval(lm, order); 159 | eval.LoadLattices(evalZFile); 160 | 161 | mitlm::Logger::Log(0, "\t%s\t%.3f\n", evalFiles[i].c_str(), 162 | eval.ComputeMargin(params)); 163 | } 164 | } 165 | if (opts["eval-wer"]) { 166 | mitlm::Logger::Log(0, "WER Evaluations:\n"); 167 | vector evalFiles; 168 | mitlm::trim_split(evalFiles, opts["eval-wer"], ','); 169 | for (size_t i = 0; i < evalFiles.size(); i++) { 170 | mitlm::Logger::Log(1, "Loading eval lattices %s...\n", 171 | evalFiles[i].c_str()); 172 | mitlm::ZFile evalZFile(evalFiles[i].c_str()); 173 | mitlm::WordErrorRateOptimizer eval(lm, order); 174 | eval.LoadLattices(evalZFile); 175 | 176 | mitlm::Logger::Log(0, "\t%s\t%.2f%%\n", evalFiles[i].c_str(), 177 | eval.ComputeWER(params)); 178 | } 179 | } 180 | 181 | // Save results. 
182 | if (opts["write-vocab"]) { 183 | mitlm::Logger::Log(1, "Saving vocabulary to %s...\n", opts["write-vocab"]); 184 | mitlm::ZFile vocabZFile(opts["write-vocab"], "w"); 185 | lm.SaveVocab(vocabZFile); 186 | } 187 | if (opts["write-lm"]) { 188 | mitlm::Logger::Log(1, "Saving LM to %s...\n", opts["write-lm"]); 189 | mitlm::ZFile lmZFile(opts["write-lm"], "w"); 190 | lm.SaveLM(lmZFile, writeBinary); 191 | } 192 | 193 | return 0; 194 | } 195 | -------------------------------------------------------------------------------- /src/optimize/LBFGS.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef LBFGS_H 36 | #define LBFGS_H 37 | 38 | #include "../Types.h" 39 | 40 | namespace mitlm { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | 44 | extern "C" { 45 | void mitlm_lbfgs(int *n, int *m, double *x, double *f, double *g, 46 | int *diagco, double *diag, int *iprint, 47 | double *eps, double *xtol, double *w, int *iflag); 48 | } 49 | 50 | template 51 | double 52 | MinimizeLBFGS(Function &func, DoubleVector &x, int &numIter, double step=1e-8, 53 | double eps=1e-5, double xtol=1e-16, int maxIter=0) { 54 | if (maxIter == 0) maxIter = 15000; 55 | 56 | int n = x.length(); 57 | int m = 10; 58 | double f; 59 | DoubleVector g(n); 60 | int diagco = false; 61 | DoubleVector diag(n, 0); 62 | int iprint[2] = {-1, 0}; 63 | DoubleVector w(n*(2*m+1) + 2*m); 64 | int iflag = 0; 65 | 66 | numIter = 0; 67 | while (true) { 68 | f = func(x); 69 | // Approximate gradient. 
70 | for (int i = 0; i < n; ++i) { 71 | x[i] += step; 72 | g[i] = (func(x) - f) / step; 73 | x[i] -= step; 74 | } 75 | mitlm_lbfgs(&n, &m, x.data(), &f, g.data(), &diagco, diag.data(), iprint, 76 | &eps, &xtol, w.data(), &iflag); 77 | if (iflag <= 0) 78 | break; 79 | if (++numIter > maxIter) break; 80 | } 81 | return f; 82 | } 83 | 84 | } 85 | 86 | #endif // LBFGS_H 87 | -------------------------------------------------------------------------------- /src/optimize/LBFGSB.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef LBFGSB_H 36 | #define LBFGSB_H 37 | 38 | #include "../Types.h" 39 | 40 | namespace mitlm { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | 44 | extern "C" { 45 | void mitlm_setulb(int *n, int *m, double *x, double *l, double *u, int *nbd, 46 | double *f, double *g, double *factr, double *pgtol, 47 | double *wa, int *iwa, char *task, int *iprint, 48 | char *csave, int *lsave, int *isave, double *dsave); 49 | } 50 | 51 | template 52 | double 53 | MinimizeLBFGSB(Function &func, DoubleVector &x, int &numIter, double step=1e-8, 54 | double factr=1e7, double pgtol=1e-5, int maxIter=0) { 55 | if (maxIter == 0) maxIter = 15000; 56 | 57 | int n = x.length(); 58 | int m = 10; 59 | DoubleVector l(n); 60 | DoubleVector u(n); 61 | IntVector nbd(n, 0); 62 | double f; 63 | DoubleVector g(n); 64 | DoubleVector wa(2*m*n + 4*n + 12*m*m + 12*m); 65 | IntVector iwa(3*n); 66 | char task[60]; 67 | int iprint = -1; 68 | char csave[60]; 69 | IntVector lsave(4); 70 | IntVector isave(44); 71 | DoubleVector dsave(29); 72 | 73 | numIter = 0; 74 | memset(task, ' ', 60); 75 | strncpy(task, "START", 5); 76 | while (true) { 77 | mitlm_setulb(&n, &m, x.data(), l.data(), u.data(), nbd.data(), 78 | &f, g.data(), &factr, &pgtol, wa.data(), iwa.data(), &task[0], 79 | 
&iprint, &csave[0], lsave.data(), isave.data(), dsave.data()); 80 | if (strncmp(task, "FG", 2) == 0) { 81 | f = func(x); 82 | // Approximate gradient. 83 | for (int i = 0; i < n; ++i) { 84 | x[i] += step; 85 | g[i] = (func(x) - f) / step; 86 | x[i] -= step; 87 | } 88 | } else if (strncmp(task, "NEW_X", 5) == 0) { 89 | if (++numIter >= maxIter) 90 | strcpy(task, "STOP: TOTAL NO. ITERATIONS EXCEEDS LIMIT"); 91 | } else 92 | break; 93 | } 94 | return f; 95 | } 96 | 97 | } 98 | 99 | #endif // LBFGSB_H 100 | -------------------------------------------------------------------------------- /src/optimize/Optimization.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. 
// 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef OPTIMIZATION_H 36 | #define OPTIMIZATION_H 37 | 38 | #include "Powell.h" 39 | #include "LBFGS.h" 40 | #include "LBFGSB.h" 41 | 42 | namespace mitlm { 43 | 44 | //////////////////////////////////////////////////////////////////////////////// 45 | 46 | enum Optimization { 47 | UnknownOptimization, 48 | PowellOptimization, 49 | LBFGSOptimization, 50 | LBFGSBOptimization 51 | }; 52 | 53 | inline Optimization ToOptimization(const char *optimization) { 54 | if (strcmp(optimization, "Powell") == 0) 55 | return PowellOptimization; 56 | if (strcmp(optimization, "LBFGS") == 0) 57 | return LBFGSOptimization; 58 | if (strcmp(optimization, "LBFGSB") == 0) 59 | return LBFGSBOptimization; 60 | return UnknownOptimization; 61 | } 62 | 63 | } 64 | 65 | #endif // OPTIMIZATION_H 66 | -------------------------------------------------------------------------------- /src/optimize/fortran_wrapper.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013, Giulio 
Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. // 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #ifndef BITOPS_H 37 | #define BITOPS_H 38 | 39 | namespace mitlm { 40 | 41 | /////////////////////////////////////////////////////////////////////////////// 42 | 43 | // Find the last (most-significant) bit set. 44 | // Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. 45 | static __inline__ unsigned long find_last_bit_set(unsigned long x) { 46 | #ifdef HAVE_X86_ASM 47 | #if __WORDSIZE == 64 48 | if (!x) return 0; 49 | __asm__("bsrq %1,%0" 50 | :"=r" (x) 51 | :"rm" (x)); 52 | return x + 1; 53 | #else 54 | int r; 55 | __asm__("bsrl %1,%0\n" 56 | "jnz 1f\n" 57 | "movl $-1,%0\n" 58 | "1:" 59 | :"=r" (r) 60 | :"rm" (x)); 61 | return r + 1; 62 | #endif 63 | #else 64 | #ifdef HAVE___BUILTIN_CLZ 65 | return (x? (((sizeof(unsigned long)*8))- __builtin_clzl(x)): 0); 66 | #else 67 | #if __WORDSIZE == 64 68 | int r = 64; 69 | if (!x) return 0; 70 | if (!(x & 0xffffffff00000000ul)) { x <<= 32; r -= 32; } 71 | if (!(x & 0xffff000000000000ul)) { x <<= 16; r -= 16; } 72 | if (!(x & 0xff00000000000000ul)) { x <<= 8; r -= 8; } 73 | if (!(x & 0xf000000000000000ul)) { x <<= 4; r -= 4; } 74 | if (!(x & 0xc000000000000000ul)) { x <<= 2; r -= 2; } 75 | if (!(x & 0x8000000000000000ul)) { x <<= 1; r -= 1; } 76 | return r; 77 | #else 78 | int r = 32; 79 | if (!x) return 0; 80 | if (!(x & 0xffff0000ul)) { x <<= 16; r -= 16; } 81 | if (!(x & 0xff000000ul)) { x <<= 8; r -= 8; } 82 | if (!(x & 0xf0000000ul)) { x <<= 4; r -= 4; } 83 | if (!(x & 0xc0000000ul)) { x <<= 2; r -= 2; } 84 | if (!(x & 0x80000000ul)) { x <<= 1; r -= 1; } 85 | return r; 86 | #endif 87 | #endif 88 | #endif 89 | } 90 | 91 | // Determine if x is a power of 2 or 0. 92 | static __inline__ bool isPowerOf2(unsigned long x) 93 | { return !(x & (x - 1)); } 94 | 95 | // Returns the smallest power of 2 larger than x. 
96 | static __inline__ unsigned long nextPowerOf2(unsigned long x) 97 | { return 1UL << find_last_bit_set(x); } 98 | 99 | } 100 | 101 | #endif // BITOPS_H 102 | -------------------------------------------------------------------------------- /src/util/CommandOptions.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2013, Giulio Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. // 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #include 37 | #include 38 | #include 39 | #include "CommandOptions.h" 40 | #include "util/Logger.h" 41 | 42 | using namespace std; 43 | 44 | namespace mitlm { 45 | 46 | CommandOptions::CommandOptions(const char *header, const char *footer) { 47 | _header = header; 48 | _footer = footer; 49 | } 50 | 51 | void 52 | CommandOptions::AddOption(const char *name, 53 | const char *desc, 54 | const char *defval, 55 | const char *type) { 56 | vector names; 57 | trim_split(names, name, ','); 58 | for (size_t i = 0; i < names.size(); ++i) 59 | _nameIndexMap[names[i]] = _options.size(); 60 | 61 | _options.push_back(CmdOption(name, desc, defval, type)); 62 | } 63 | 64 | bool 65 | CommandOptions::ParseArguments(int argc, const char **argv) { 66 | if (_header.length() == 0) 67 | _header = string("Usage: ") + argv[0] + " [Options]\n"; 68 | 69 | _values.resize(_options.size(), NULL); 70 | int i = 1; 71 | while (i < argc) { 72 | const char *argName = argv[i++]; 73 | assert(argName != NULL && argName[0] != '\0'); 74 | hash_map_iter iter = _nameIndexMap.find(&argName[1]); 75 | if (iter == _nameIndexMap.end()) { 76 | cerr << "Invalid argument '" << argName << "'.\n"; 77 | return false; 78 | } 79 | if (_values[iter->second] != NULL) { 80 | cerr << "Argument '" << argName << "' specified multiple times.\n"; 81 | 
return false; 82 | } 83 | const char *value = ""; 84 | if (i < argc && argv[i][0] != '-') 85 | value = argv[i++]; 86 | _values[iter->second] = value; 87 | } 88 | 89 | for (size_t i = 0; i < _options.size(); ++i) { 90 | if (_values[i] == NULL) 91 | _values[i] = _options[i].defval; 92 | // Logger::Log(1, "%s = %s\n", 93 | // _options[i].name, (_values[i]==NULL ? "NULL" : _values[i])); 94 | } 95 | return true; 96 | } 97 | 98 | void 99 | CommandOptions::PrintHelp() const { 100 | cout << _header << "\n"; 101 | cout << "Options:\n"; 102 | for (size_t i = 0; i < _options.size(); ++i) { 103 | vector names; 104 | trim_split(names, _options[i].name, ','); 105 | cout << " -" << names[0]; 106 | for (size_t j = 1; j < names.size(); j++) 107 | cout << ", -" << names[j]; 108 | if (_options[i].type != NULL) 109 | cout << " <" << _options[i].type << ">"; 110 | cout << endl; 111 | cout << " " << _options[i].desc << endl; 112 | if (_options[i].defval != NULL) 113 | cout << " Default: " << _options[i].defval << endl; 114 | } 115 | if (_footer.length() > 0 ) 116 | cout << endl << _footer; 117 | } 118 | 119 | const char * 120 | CommandOptions::operator[](const char *name) const { 121 | hash_map_iter iter = _nameIndexMap.find(name); 122 | assert(iter != _nameIndexMap.end()); 123 | return _values[iter->second]; 124 | } 125 | 126 | //////////////////////////////////////////////////////////////////////////////// 127 | 128 | vector & 129 | trim_split(vector &result, const char *str, char delimiter) { 130 | result.resize(0); 131 | if (str != NULL) { 132 | const char *end = str + strlen(str); 133 | while (str <= end) { 134 | while (isspace(*str)) ++str; 135 | const char *token = str; 136 | while (*str != delimiter && str < end) ++str; 137 | const char *tokenEnd = str++ - 1; 138 | while (isspace(*tokenEnd) && tokenEnd > token) --tokenEnd; 139 | result.push_back(string(token, tokenEnd - token + 1)); 140 | } 141 | } 142 | return result; 143 | } 144 | 145 | } 146 | 
-------------------------------------------------------------------------------- /src/util/CommandOptions.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2013, Giulio Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. // 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | using std::string; 42 | using std::vector; 43 | 44 | namespace mitlm { 45 | 46 | class CommandOptions { 47 | protected: 48 | struct CmdOption { 49 | CmdOption(const char *name_, const char *desc_, const char *defval_, const char *type_) 50 | : name(name_), desc(desc_), defval(defval_), type(type_) { } 51 | const char *name; 52 | const char *desc; 53 | const char *defval; 54 | const char *type; 55 | }; 56 | 57 | typedef std::tr1::unordered_map::const_iterator hash_map_iter; 58 | 59 | string _header; 60 | string _footer; 61 | vector _options; 62 | vector _values; 63 | std::tr1::unordered_map _nameIndexMap; 64 | 65 | public: 66 | CommandOptions(const char *header="", const char *footer=""); 67 | void AddOption(const char *name, const char *desc, const char *defval=NULL, const char *type=NULL); 68 | bool ParseArguments(int argc, const char **argv); 69 | void PrintHelp() const; 70 | 71 | const char * operator[](const char *name) const; 72 | }; 73 | 74 | vector & 75 | trim_split(vector &result, const char *str, char delimiter); 76 | 77 | inline const char *GetItem(vector &items, size_t index) { 78 | if (items.size() == 0) return NULL; 79 | if (items.size() == 1) return items[0].c_str(); 80 | return items[index].c_str(); 81 | } 82 | 83 | 
inline string GetBasename(string str) { 84 | size_t extIndex = str.find_last_of('.'); 85 | return extIndex == string::npos ? str : str.substr(0, extIndex); 86 | } 87 | 88 | inline bool AsBoolean(const char *str) { 89 | if (str == NULL) return false; 90 | if (str[0] == 't' || str[0] == 'T' || str[0] == '1') return true; 91 | return false; 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/util/FastHash.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef FASTHASH_H 36 | #define FASTHASH_H 37 | 38 | #include 39 | #include 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | 43 | #undef get16bits 44 | 45 | #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ 46 | || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) 47 | #define get16bits(d) (*((const uint16_t *) (d))) 48 | #else 49 | #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ 50 | +(uint32_t)(((const uint8_t *)(d))[0])) 51 | #endif 52 | 53 | namespace mitlm { 54 | 55 | //////////////////////////////////////////////////////////////////////////////// 56 | // Adopted from http://www.azillionmonkeys.com/qed/hash.html. 
57 | // @Copyright 2004-2007 by Paul Hsieh 58 | inline uint32_t SuperFastHash(const char * data, int len) { 59 | uint32_t hash = len, tmp; 60 | int rem; 61 | 62 | if (len <= 0 || data == NULL) return 0; 63 | 64 | rem = len & 3; 65 | len >>= 2; 66 | 67 | /* Main loop */ 68 | for (; len > 0; len--) { 69 | hash += get16bits(data); 70 | tmp = (get16bits(data+2) << 11) ^ hash; 71 | hash = (hash << 16) ^ tmp; 72 | data += 2 * sizeof(uint16_t); 73 | hash += hash >> 11; 74 | } 75 | 76 | /* Handle end cases */ 77 | switch (rem) { 78 | case 3: hash += get16bits(data); 79 | hash ^= hash << 16; 80 | hash ^= data[sizeof(uint16_t)] << 18; 81 | hash += hash >> 11; 82 | break; 83 | case 2: hash += get16bits(data); 84 | hash ^= hash << 11; 85 | hash += hash >> 17; 86 | break; 87 | case 1: hash += *data; 88 | hash ^= hash << 10; 89 | hash += hash >> 1; 90 | } 91 | 92 | /* Force "avalanching" of final 127 bits */ 93 | hash ^= hash << 3; 94 | hash += hash >> 5; 95 | hash ^= hash << 4; 96 | hash += hash >> 17; 97 | hash ^= hash << 25; 98 | hash += hash >> 6; 99 | 100 | return hash; 101 | } 102 | 103 | inline uint32_t SuperFastHash(uint32_t key1, uint32_t key2) { 104 | uint32_t hash = 0, tmp; 105 | 106 | hash += (key1 >> 16); 107 | tmp = ((key1 & 0xFFFF) << 11) ^ hash; 108 | hash = (hash << 16) ^ tmp; 109 | hash += hash >> 11; 110 | 111 | hash += (key2 >> 16); 112 | tmp = ((key2 & 0xFFFF) << 11) ^ hash; 113 | hash = (hash << 16) ^ tmp; 114 | hash += hash >> 11; 115 | 116 | /* Force "avalanching" of final 127 bits */ 117 | hash ^= hash << 3; 118 | hash += hash >> 5; 119 | hash ^= hash << 4; 120 | hash += hash >> 17; 121 | hash ^= hash << 25; 122 | hash += hash >> 6; 123 | 124 | return hash; 125 | } 126 | 127 | //////////////////////////////////////////////////////////////////////////////// 128 | 129 | inline size_t StringHash(const char *s, size_t len) { 130 | // Approximate SRILM Hash Function 131 | unsigned long i = 0; 132 | const char *end = s + len; 133 | for (; s != end; s++) 134 | 
i += (i << 3) + *s; 135 | return i * 1103515245; 136 | } 137 | 138 | } 139 | 140 | #endif // FASTHASH_H 141 | -------------------------------------------------------------------------------- /src/util/FastIO.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef FASTIO_H 36 | #define FASTIO_H 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include "Logger.h" 46 | 47 | namespace mitlm { 48 | 49 | //////////////////////////////////////////////////////////////////////////////// 50 | 51 | // Use date as version ID. 
52 | #define MITLMv1a 0x20080901 // Bug: Vocab did not store length 53 | #define MITLMv1 0x20081201 54 | 55 | //////////////////////////////////////////////////////////////////////////////// 56 | 57 | template 58 | void Copy(InputIterator input, OutputIterator begin, const OutputIterator end) { 59 | while (begin != end) { 60 | *begin = *input; 61 | ++begin; ++input; 62 | } 63 | } 64 | 65 | //////////////////////////////////////////////////////////////////////////////// 66 | 67 | inline bool getline(FILE *file, char *buf, size_t bufSize) { 68 | if (fgets(buf, bufSize, file)) { 69 | size_t len = strlen(buf) - 1; 70 | if (len >= bufSize) 71 | Logger::Error(1, "The following exceeded max length.\n%s\n", buf); 72 | else if (buf[len] == '\n') 73 | buf[len] = '\0'; 74 | return true; 75 | } else 76 | return false; 77 | } 78 | 79 | inline bool getline(FILE *file, char *buf, size_t bufSize, size_t *outLen) { 80 | if (fgets(buf, bufSize, file)) { 81 | *outLen = strlen(buf) - 1; 82 | if (*outLen >= bufSize) 83 | Logger::Error(1, "The following exceeded max length.\n%s\n", buf); 84 | else if (buf[*outLen] == '\n') 85 | buf[*outLen] = '\0'; 86 | return true; 87 | } else 88 | return false; 89 | } 90 | 91 | //////////////////////////////////////////////////////////////////////////////// 92 | 93 | inline void WriteAlignPad(FILE *outFile, size_t len) { 94 | uint64_t zero = 0; 95 | if ((len % 8) != 0 && fwrite(&zero, (8 - len) % 8, 1, outFile) != 1) 96 | throw std::runtime_error("Write failed."); 97 | } 98 | 99 | inline void ReadAlignPad(FILE *inFile, size_t len) { 100 | uint64_t zero = 0; 101 | if ((len % 8) != 0 && 102 | (fread(&zero, (8 - len) % 8, 1, inFile) != 1 || zero != 0)) 103 | throw std::runtime_error("Read failed."); 104 | } 105 | 106 | inline void WriteInt32(FILE *outFile, int x) { 107 | if (fwrite(&x, sizeof(int), 1, outFile) != 1) 108 | throw std::runtime_error("Write failed."); 109 | } 110 | 111 | inline void WriteUInt32(FILE *outFile, unsigned int x) { 112 | if 
(fwrite(&x, sizeof(unsigned int), 1, outFile) != 1) 113 | throw std::runtime_error("Write failed."); 114 | } 115 | 116 | inline void WriteUInt64(FILE *outFile, uint64_t x) { 117 | if (fwrite(&x, sizeof(uint64_t), 1, outFile) != 1) 118 | throw std::runtime_error("Write failed."); 119 | } 120 | 121 | inline void WriteDouble(FILE *outFile, double x) { 122 | if (fwrite(&x, sizeof(double), 1, outFile) != 1) 123 | throw std::runtime_error("Write failed."); 124 | } 125 | 126 | inline void WriteString(FILE *outFile, const std::string &str) { 127 | WriteUInt64(outFile, (uint64_t)str.length()); 128 | if (fwrite(str.c_str(), str.length(), 1, outFile) != 1) 129 | throw std::runtime_error("Write failed."); 130 | WriteAlignPad(outFile, str.length()); 131 | } 132 | 133 | template 134 | inline void WriteVector(FILE *out, const std::vector &x) { 135 | WriteUInt64(out, (uint64_t)x.size()); 136 | if (fwrite(x.data(), sizeof(T), x.size(), out) != x.size()) 137 | throw std::runtime_error("Write failed."); 138 | WriteAlignPad(out, x.size() * sizeof(T)); 139 | } 140 | 141 | inline void WriteHeader(FILE *outFile, const char *header) { 142 | size_t len = strlen(header); 143 | if (fwrite(header, len, 1, outFile) != 1) 144 | throw std::runtime_error("Write failed."); 145 | WriteAlignPad(outFile, len); 146 | } 147 | 148 | //////////////////////////////////////////////////////////////////////////////// 149 | 150 | inline int ReadInt32(FILE *inFile) { 151 | int v; 152 | if (fread(&v, sizeof(int), 1, inFile) != 1) 153 | throw std::runtime_error("Read failed."); 154 | return v; 155 | } 156 | 157 | inline unsigned int ReadUInt32(FILE *inFile) { 158 | unsigned int v; 159 | if (fread(&v, sizeof(unsigned int), 1, inFile) != 1) 160 | throw std::runtime_error("Read failed."); 161 | return v; 162 | } 163 | 164 | inline uint64_t ReadUInt64(FILE *inFile) { 165 | uint64_t v; 166 | if (fread(&v, sizeof(uint64_t), 1, inFile) != 1) 167 | throw std::runtime_error("Read failed."); 168 | return v; 169 | } 170 | 
171 | inline double ReadDouble(FILE *inFile) { 172 | double v; 173 | if (fread(&v, sizeof(double), 1, inFile) != 1) 174 | throw std::runtime_error("Read failed."); 175 | return v; 176 | } 177 | 178 | inline void ReadString(FILE *inFile, std::string &str) { 179 | str.resize(ReadUInt64(inFile)); 180 | if (fread(&str[0], str.length(), 1, inFile) != 1) 181 | throw std::runtime_error("Read failed."); 182 | ReadAlignPad(inFile, str.length()); 183 | } 184 | 185 | template 186 | inline void ReadVector(FILE *in, std::vector &x) { 187 | x.resize(ReadUInt64(in)); 188 | if (fread(x.data(), sizeof(T), x.size(), in) != x.size()) 189 | throw std::runtime_error("Read failed."); 190 | ReadAlignPad(in, x.size() * sizeof(T)); 191 | } 192 | 193 | inline void VerifyHeader(FILE *inFile, const char *header) { 194 | char buf[256]; 195 | size_t len = strlen(header); 196 | assert(len < 255); 197 | if (fread(buf, len, 1, inFile) != 1 || strncmp(buf, header, len) != 0) 198 | throw std::runtime_error("Invalid file format."); 199 | ReadAlignPad(inFile, len); 200 | } 201 | 202 | } 203 | 204 | #endif // FASTIO_H 205 | -------------------------------------------------------------------------------- /src/util/Logger.cpp: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. 
// 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #include 36 | #include 37 | #include "Logger.h" 38 | 39 | namespace mitlm { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | 43 | #ifdef NDEBUG 44 | int Logger::_verbosity = 0; 45 | bool Logger::_timestamp = false; 46 | #else 47 | int Logger::_verbosity = 1; 48 | bool Logger::_timestamp = true; 49 | #endif 50 | clock_t Logger::_startTime = clock(); 51 | FILE* Logger::_err_file = stderr; 52 | FILE* Logger::_out_file = stdout; 53 | 54 | void Logger::Log(int level, const char *fmt, ...) { 55 | if (_verbosity >= level) { 56 | if ( _out_file != NULL ) { 57 | va_list args; 58 | va_start(args, fmt); 59 | if (_timestamp) 60 | fprintf(_out_file, "%.3f\t", (double)(clock() - _startTime) / CLOCKS_PER_SEC); 61 | vfprintf(_out_file, fmt, args); 62 | va_end(args); 63 | } 64 | } 65 | } 66 | 67 | void Logger::Warn(int level, const char *fmt, ...) { 68 | if (_verbosity >= level) { 69 | va_list args; 70 | if ( _err_file != NULL ) { 71 | va_start(args, fmt); 72 | fprintf(_err_file, "\033[0;33m"); 73 | } 74 | if ( _out_file != NULL ) { 75 | if (_timestamp) 76 | fprintf(_out_file, "%.3f\t", (double)(clock() - _startTime) / CLOCKS_PER_SEC); 77 | } 78 | if ( _err_file != NULL ) { 79 | vfprintf(_err_file, fmt, args); 80 | fprintf(_err_file, "\033[m"); 81 | va_end(args); 82 | } 83 | } 84 | } 85 | 86 | void Logger::Error(int level, const char *fmt, ...) 
{ 87 | if (_verbosity >= level) { 88 | va_list args; 89 | if ( _err_file != NULL ) { 90 | va_start(args, fmt); 91 | fprintf(_err_file, "\033[1;31m"); 92 | } 93 | if ( _out_file != NULL ) { 94 | if (_timestamp) 95 | fprintf(_out_file, "%.3f\t", (double)(clock() - _startTime) / CLOCKS_PER_SEC); 96 | } 97 | if ( _err_file != NULL ) { 98 | vfprintf(_err_file, fmt, args); 99 | fprintf(_err_file, "\033[m"); 100 | va_end(args); 101 | } 102 | } 103 | } 104 | 105 | } 106 | -------------------------------------------------------------------------------- /src/util/Logger.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef LOGGER_H 36 | #define LOGGER_H 37 | 38 | #include 39 | #include 40 | 41 | namespace mitlm { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | 45 | class Logger { 46 | static int _verbosity; 47 | static bool _timestamp; 48 | static clock_t _startTime; 49 | static FILE* _err_file; 50 | static FILE* _out_file; 51 | 52 | public: 53 | static inline void SetErrorFile(FILE* file=stderr) { _err_file = file; } 54 | static inline void SetOutputFile(FILE* file=stdout) { _out_file = file; } 55 | static inline void SetVerbosity(int verbosity=1) { _verbosity = verbosity; } 56 | static inline void ShowTimestamp(bool timestamp) { _timestamp = timestamp; } 57 | static inline int GetVerbosity() { return _verbosity; } 58 | static void Log(int level, const char *fmt, ...); 59 | static void Warn(int level, const char *fmt, ...); 60 | static void Error(int level, const char *fmt, ...); 61 | static void PrintStackTrace(FILE *out, const char *file, int line); 62 | }; 63 | 64 | } 65 | 66 | #endif // LOGGER_H 67 | -------------------------------------------------------------------------------- /src/util/RefCounter.cpp: -------------------------------------------------------------------------------- 1 | 
//////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2013, Giulio Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. // 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #include "Logger.h" 37 | #include "RefCounter.h" 38 | 39 | namespace mitlm { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | 43 | _RefCounter RefCounter; 44 | 45 | _RefCounter::~_RefCounter() { 46 | if (!_map.empty()) { 47 | Logger::Error(1, "-- RefCounter----------\n"); 48 | std::tr1::unordered_map::const_iterator it; 49 | for (it = _map.begin(); it != _map.end(); ++it) 50 | Logger::Error(1, "map[%p] = %i\n", (void *)it->first, it->second); 51 | Logger::Error(1, "-----------------------\n\n"); 52 | } 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/util/RefCounter.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2013, Giulio Paci // 4 | // All rights reserved. // 5 | // // 6 | // Redistribution and use in source and binary forms, with or without // 7 | // modification, are permitted provided that the following conditions are // 8 | // met: // 9 | // // 10 | // * Redistributions of source code must retain the above copyright // 11 | // notice, this list of conditions and the following disclaimer. // 12 | // // 13 | // * Redistributions in binary form must reproduce the above // 14 | // copyright notice, this list of conditions and the following // 15 | // disclaimer in the documentation and/or other materials provided // 16 | // with the distribution. // 17 | // // 18 | // * Neither the name of the Massachusetts Institute of Technology // 19 | // nor the names of its contributors may be used to endorse or // 20 | // promote products derived from this software without specific // 21 | // prior written permission. 
// 22 | // // 23 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 24 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 25 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 26 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 27 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 28 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 29 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 30 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 31 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 32 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 33 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 34 | //////////////////////////////////////////////////////////////////////////// 35 | 36 | #ifndef REFCOUNTER_H 37 | #define REFCOUNTER_H 38 | 39 | #include 40 | 41 | namespace mitlm { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | 45 | class _RefCounter 46 | { 47 | public: 48 | ~_RefCounter(); 49 | 50 | template void attach(const T *referenced) 51 | { ++_map[reinterpret_cast(referenced)]; } 52 | 53 | template bool detach(const T *referenced) { 54 | unsigned long addr = reinterpret_cast(referenced); 55 | if (_map.find(addr)==_map.end()) 56 | return true; 57 | if (--_map[addr] == -1) { 58 | _map.erase(addr); 59 | return true; 60 | } 61 | return false; 62 | } 63 | 64 | private: 65 | std::tr1::unordered_map _map; 66 | }; 67 | 68 | extern _RefCounter RefCounter; 69 | 70 | } 71 | 72 | #endif // REFCOUNTER_H 73 | -------------------------------------------------------------------------------- /src/util/SharedPtr.h: -------------------------------------------------------------------------------- 1 | 
//////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef SHAREDPTR_H 36 | #define SHAREDPTR_H 37 | 38 | #include "RefCounter.h" 39 | 40 | namespace mitlm { 41 | 42 | template 43 | class SharedPtr { 44 | protected: 45 | T *_p; 46 | 47 | public: 48 | explicit SharedPtr(T *p = NULL) : _p(p) { } 49 | SharedPtr(const SharedPtr &p) : _p(p._p) 50 | { if (_p != NULL) RefCounter.attach(_p); } 51 | ~SharedPtr() { if (_p != NULL && RefCounter.detach(_p)) delete _p; } 52 | 53 | SharedPtr &operator=(T *p) { 54 | if (_p != NULL && RefCounter.detach(_p)) delete _p; 55 | _p = p; 56 | return *this; 57 | } 58 | SharedPtr &operator=(const SharedPtr &p) { 59 | if (_p != NULL && RefCounter.detach(_p)) delete _p; 60 | if ((_p = p._p) != NULL) RefCounter.attach(_p); 61 | return *this; 62 | } 63 | 64 | T *get() { return _p; } 65 | T &operator*() { return *_p; } 66 | T *operator->() { return _p; } 67 | operator T *() { return _p; } 68 | const T *get() const { return _p; } 69 | const T &operator*() const { return *_p; } 70 | const T *operator->() const { return _p; } 71 | operator const T *() const { return _p; } 72 | operator bool() const { return _p == NULL; } 73 | }; 74 | 75 | } 76 | 77 | #endif // SHAREDPTR_H 78 | -------------------------------------------------------------------------------- /src/util/ZFile.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // Copyright (c) 2010-2013, Giulio Paci // 4 | // Copyright (c) 2013, Jakub Wilk // 5 | // All rights reserved. 
// 6 | // // 7 | // Redistribution and use in source and binary forms, with or without // 8 | // modification, are permitted provided that the following conditions are // 9 | // met: // 10 | // // 11 | // * Redistributions of source code must retain the above copyright // 12 | // notice, this list of conditions and the following disclaimer. // 13 | // // 14 | // * Redistributions in binary form must reproduce the above // 15 | // copyright notice, this list of conditions and the following // 16 | // disclaimer in the documentation and/or other materials provided // 17 | // with the distribution. // 18 | // // 19 | // * Neither the name of the Massachusetts Institute of Technology // 20 | // nor the names of its contributors may be used to endorse or // 21 | // promote products derived from this software without specific // 22 | // prior written permission. // 23 | // // 24 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 25 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 26 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 27 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 28 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 29 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 30 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 31 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 32 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 33 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 34 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 35 | //////////////////////////////////////////////////////////////////////////// 36 | 37 | #ifndef ZFILE_H 38 | #define ZFILE_H 39 | 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | 47 | #ifndef O_BINARY 48 | #define O_BINARY 0 49 | #endif 50 | 51 | namespace mitlm { 52 | 53 | #if ( defined(WIN32) || defined(_WIN32) ) && !defined(__CYGWIN__) 54 |
// Wraps s in double quotes for use as a single Windows command-line argument.
// A run of backslashes is doubled when it precedes a '"' or the closing
// quote; a '"' additionally receives one escaping backslash. Backslash runs
// before any other character are copied unchanged.
static std::string win_argv_escape ( const std::string& s )
{
    const std::string::size_type len = s.size();
    std::string quoted(1, '"');

    std::string::size_type pos = 0;
    while (pos < len)
    {
        // count backslashes
        std::string::size_type run = 0;
        while (pos < len && s[pos] == '\\')
        {
            ++pos;
            ++run;
        }
        if (pos == len)
        {
            // at the end of the string we must escape all backslashes,
            // because we are going to append a '"'
            quoted.append(2 * run, '\\');
            break;
        }
        if (s[pos] == '"')
        {
            // with '\'* + '"' we should escape all '\' and the '"' itself
            run = 2 * run + 1;
        }
        quoted.append(run, '\\');
        quoted += s[pos];
        ++pos;
    }

    quoted += '"';
    return quoted;
}

// Prefixes every character in a fixed set with '^': the double quote,
// space, tab, newline, vertical tab, and ( ) % ! ^ < > & |. All other
// characters are copied unchanged.
static std::string cmd_exe_escape ( const std::string& s )
{
    std::string escaped;
    for (std::string::size_type pos = 0; pos < s.size(); ++pos)
    {
        const char c = s[pos];
        switch (c)
        {
            case '"':  case ' ':  case '\t': case '\n': case '\v':
            case '(':  case ')':  case '%':  case '!':  case '^':
            case '<':  case '>':  case '&':  case '|':
                escaped += '^';
                break;
            default:
                break;
        }
        escaped += c;
    }
    return escaped;
}

# define popen_escape(s) cmd_exe_escape(s)
# define popen_escape2(s) cmd_exe_escape(win_argv_escape(s))
# define EXEC_TOKEN 127 | #else 128 | 129 | static std::string shell_escape(const std::string &s) 130 | { 131 | std::ostringstream buffer; 132 | buffer << "'"; 133 | for (std::string::const_iterator it = s.begin(); it != s.end(); it++) 134 | { 135 | if (*it == '\'') 136 | buffer << "'\\''"; 137 | else 138 | buffer << *it; 139 | } 140 | buffer << "'"; 141 | return buffer.str(); 142 | } 143 | 144 | # define popen_escape(s) shell_escape(s) 145 | # define popen_escape2(s) shell_escape(s) 146 | # define EXEC_TOKEN "exec " 147 | #endif 148 | 149 | //////////////////////////////////////////////////////////////////////////////// 150 | 151 | class ZFile { 152 | protected: 153 | FILE * _file; 154 | std::string _filename; 155 | std::string _mode; 156 | 157 | bool endsWith(const char *str, const char *suffix) { 158 | size_t strLen = strlen(str); 159 | size_t suffixLen = strlen(suffix); 160 | return (suffixLen <= strLen) && 161 | (strncmp(&str[strLen - suffixLen], suffix, suffixLen) == 0); 162 | } 163 | 164 | FILE *processOpen(const std::string &command, const char *mode) 165 | { return popen(command.c_str(), mode); } 166 | 167 | public: 168 | ZFile(const char *filename, const char *mode="r") { 169 | if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w')) 170 | throw std::runtime_error("Invalid mode"); 171 | 172 | _filename = filename; 173 | if(mode[0] == 'r') 174 | { 175 | _mode = O_BINARY ? "rb" : "r"; 176 | } 177 | else if(mode[0] == 'w') 178 | { 179 | _mode = O_BINARY ? "wb" : "w"; 180 | } 181 | ReOpen(); 182 | } 183 | ~ZFile() { if (_file) fclose(_file); } 184 | 185 | void ReOpen() { 186 | const char *mode = _mode.c_str(); 187 | if (endsWith(_filename.c_str(), ".gz")) { 188 | _file = (_mode[0] == 'r') ? 189 | processOpen(std::string(EXEC_TOKEN "gzip -dc ") + popen_escape2(_filename), mode) : 190 | processOpen(std::string(EXEC_TOKEN "gzip -c > ") + popen_escape(_filename), mode); 191 | } else if (endsWith(_filename.c_str(), ".bz2")) { 192 | _file = (_mode[0] == 'r') ? 
193 | processOpen(std::string(EXEC_TOKEN "bzip2 -dc ") + popen_escape2(_filename), mode) : 194 | processOpen(std::string(EXEC_TOKEN "bzip2 -c > ") + popen_escape(_filename), mode); 195 | } else if (endsWith(_filename.c_str(), ".zip")) { 196 | _file = (_mode[0] == 'r') ? 197 | processOpen(std::string(EXEC_TOKEN "unzip -c ") + popen_escape2(_filename), mode) : 198 | processOpen(std::string(EXEC_TOKEN "zip -q > ") + popen_escape(_filename), mode); 199 | } else { // Assume uncompressed 200 | _file = fopen(_filename.c_str(), mode); 201 | } 202 | if (_file == NULL) 203 | throw std::runtime_error("Cannot open file"); 204 | } 205 | 206 | operator FILE *() const { return _file; } 207 | }; 208 | 209 | } 210 | 211 | #endif // ZFILE_H 212 | -------------------------------------------------------------------------------- /src/vector/DenseVector.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. 
// 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef DENSEVECTOR_H 36 | #define DENSEVECTOR_H 37 | 38 | #include 39 | #include 40 | #include "Vector.h" 41 | #include "VectorClosures.h" 42 | #include "Range.h" 43 | #include "Traits.h" 44 | 45 | namespace mitlm { 46 | 47 | //////////////////////////////////////////////////////////////////////////////// 48 | // Dense vector of T with raw-pointer iterators: begin()/data() return _data and end() returns _data + _length. Only the inline accessors are defined in this header; every other member is declared here and defined elsewhere. 49 | template 50 | class DenseVector: public Vector > { 51 | public: 52 | typedef T ElementType; 53 | typedef const T * ConstIterator; 54 | typedef T * Iterator; 55 | 56 | DenseVector(size_t length = 0); 57 | DenseVector(size_t length, T value); 58 | DenseVector(const Range &r); 59 | DenseVector(const DenseVector &rhs); 60 | DenseVector(const DenseVector &rhs, bool clone); 61 | template DenseVector(const Vector &rhs); 62 | ~DenseVector(); 63 | 64 | DenseVector & operator=(T value); 65 | DenseVector & operator=(const Range &r); 66 | DenseVector & operator=(const DenseVector &rhs); 67 | DenseVector & operator=(const std::vector &rhs); 68 | template DenseVector &operator=(const Vector &rhs); 69 |
template DenseVector &operator+=(const Vector &rhs); 70 | template DenseVector &operator-=(const Vector &rhs); 71 | template DenseVector &operator*=(const Vector &rhs); 72 | template DenseVector &operator/=(const Vector &rhs); 73 | DenseVector & operator+=(T alpha); 74 | DenseVector & operator-=(T alpha); 75 | DenseVector & operator*=(T alpha); 76 | DenseVector & operator/=(T alpha); 77 | const T & operator[](size_t index) const; 78 | T & operator[](size_t index); 79 | const DenseVector operator[](const Range &r) const; 80 | DenseVector operator[](const Range &r); 81 | 82 | template 83 | const IndirectVectorClosure, typename RHS::Impl> 84 | operator[](const Vector &x) const; 85 | 86 | template 87 | IndirectVectorClosure, typename RHS::Impl> 88 | operator[](const Vector &x); 89 | 90 | template 91 | MaskedVectorClosure, typename M::Impl> 92 | masked(const Vector &mask); 93 | 94 | void reset(size_t length); 95 | void reset(size_t length, T value); 96 | void resize(size_t length); 97 | void resize(size_t length, T value); 98 | void swap(DenseVector &v); 99 | void set(T value); 100 | void attach(const DenseVector &v); 101 | template bool sort(Compare compare); 102 | 103 | size_t length() const { return _length; } 104 | ConstIterator begin() const { return _data; } 105 | ConstIterator end() const { return _data + _length; } 106 | const T * data() const { return _data; } 107 | Iterator begin() { return _data; } 108 | Iterator end() { return _data + _length; } 109 | T * data() { return _data; } 110 | 111 | private: 112 | DenseVector(size_t length, T *data, void *storage); 113 | void _allocate(); 114 | void _release(); 115 | 116 | size_t _length; 117 | T * _data; 118 | void * _storage; 119 | }; 120 | 121 | //////////////////////////////////////////////////////////////////////////////// 122 | // TypeInfo specialization for DenseVector: Impl is the DenseVector type itself and ElementType is T. 123 | template 124 | struct TypeInfo > { 125 | typedef DenseVector Impl; 126 | typedef T ElementType; 127 | }; 128 | 129 |
//////////////////////////////////////////////////////////////////////////////// 130 | // Stream output and FILE-based read/write helpers for DenseVector; only declarations appear in this header. 131 | template 132 | std::ostream &operator<<(std::ostream &o, const DenseVector &x); 133 | 134 | template <> 135 | std::ostream &operator<<(std::ostream &o, const DenseVector &x); 136 | 137 | template 138 | void WriteVector(FILE *out, const DenseVector &x); 139 | 140 | template 141 | void ReadVector(FILE *in, DenseVector &x); 142 | 143 | //////////////////////////////////////////////////////////////////////////////// 144 | 145 | } 146 | // NOTE(review): the out-of-line template definitions presumably live in DenseVector.tcc, included below - confirm. 147 | #include "DenseVector.tcc" 148 | 149 | #endif // DENSEVECTOR_H 150 | -------------------------------------------------------------------------------- /src/vector/Operations.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission.
// 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef OPERATIONS_H 36 | #define OPERATIONS_H 37 | 38 | #include 39 | 40 | namespace mitlm { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Boolean Operations 44 | // Each operation is a struct with a static template Eval(); the functors in this section all return bool. 45 | struct OpNot { 46 | template 47 | static bool Eval(T v) { return !v; } 48 | }; 49 | 50 | struct OpEqual { 51 | template 52 | static bool Eval(L a, R b) { return a == b; } 53 | }; 54 | 55 | struct OpNotEqual { 56 | template 57 | static bool Eval(L a, R b) { return a != b; } 58 | }; 59 | 60 | struct OpLessThan { 61 | template 62 | static bool Eval(L a, R b) { return a < b; } 63 | }; 64 | 65 | struct OpLessEqual { 66 | template 67 | static bool Eval(L a, R b) { return a <= b; } 68 | }; 69 | 70 | //////////////////////////////////////////////////////////////////////////////// 71 | // Arithmetic Operations 72 | // OpNeg returns its operand type; the binary functors take the result type from the Promotion trait. 73 | struct OpNeg { 74 | template 75 | static T Eval(T v) { return -v; } 76 | }; 77 | 78 | struct OpAdd { 79 | template 80 | static typename Promotion::Type Eval(L a, R b) { return
a+b; } 81 | }; 82 | 83 | struct OpSub { 84 | template 85 | static typename Promotion::Type Eval(L a, R b) { return a-b; } 86 | }; 87 | 88 | struct OpMult { 89 | template 90 | static typename Promotion::Type Eval(L a, R b) { return a*b; } 91 | }; 92 | 93 | struct OpDiv { 94 | template 95 | static typename Promotion::Type Eval(L a, R b) { return a/b; } 96 | }; 97 | 98 | //////////////////////////////////////////////////////////////////////////////// 99 | // Advanced Operations 100 | // NOTE(review): OpIsNan/OpIsInf/OpIsFinite are declared to return T, so the predicate result is converted to the element type rather than returned as bool; OpAbs calls fabs() for every T. Confirm both are intended. 101 | struct OpMin { 102 | template static T Eval(T a, T b) { return std::min(a, b); } 103 | }; 104 | 105 | struct OpMax { 106 | template static T Eval(T a, T b) { return std::max(a, b); } 107 | }; 108 | 109 | struct OpAbs { 110 | template static T Eval(T v) { return fabs(v); } 111 | }; 112 | 113 | struct OpLog { 114 | template static T Eval(T v) { return log(v); } 115 | }; 116 | 117 | struct OpLog10 { 118 | template static T Eval(T v) { return log10(v); } 119 | }; 120 | 121 | struct OpExp { 122 | template static T Eval(T v) { return exp(v); } 123 | }; 124 | 125 | struct OpPow { 126 | template 127 | static typename Promotion::Type 128 | Eval(L a, R b) { 129 | typedef typename Promotion::Type ResultType; 130 | return pow((ResultType)a, (ResultType)b); 131 | } 132 | }; 133 | 134 | struct OpIsNan { 135 | template static T Eval(T v) { return std::isnan(v); } 136 | }; 137 | 138 | struct OpIsInf { 139 | template static T Eval(T v) { return std::isinf(v); } 140 | }; 141 | 142 | struct OpIsFinite { 143 | template static T Eval(T v) { return std::isfinite(v); } 144 | }; 145 | 146 | //////////////////////////////////////////////////////////////////////////////// 147 | // Type Casting Operations 148 | // Explicit conversions of the operand to double, float and int respectively. 149 | struct OpDouble { 150 | template static double Eval(T v) { return (double)v; } 151 | }; 152 | 153 | struct OpFloat { 154 | template static float Eval(T v) { return (float)v; } 155 | }; 156 | 157 | struct OpInt { 158 | template static int Eval(T v) { return static_cast(v); } 159 | }; 160 | 161 | }
162 | 163 | #endif // OPERATIONS_H 164 | -------------------------------------------------------------------------------- /src/vector/Range.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef RANGE_H 36 | #define RANGE_H 37 | 38 | #include 39 | 40 | namespace mitlm { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Pair of indices (beginIndex, endIndex) with length() == endIndex - beginIndex. The one-argument constructor starts at 0; both constructors assert beginIndex <= endIndex. 44 | class Range { 45 | public: 46 | explicit Range(size_t endIndex) 47 | : _beginIndex(0), _endIndex(endIndex) 48 | { assert(_beginIndex <= _endIndex); } 49 | 50 | explicit Range(size_t beginIndex, size_t endIndex) 51 | : _beginIndex(beginIndex), _endIndex(endIndex) 52 | { assert(_beginIndex <= _endIndex); } 53 | 54 | size_t beginIndex() const { return _beginIndex; } 55 | size_t endIndex() const { return _endIndex; } 56 | size_t length() const { return _endIndex - _beginIndex; } 57 | 58 | private: 59 | size_t _beginIndex, _endIndex; 60 | }; 61 | 62 | } 63 | 64 | #endif // RANGE_H 65 | -------------------------------------------------------------------------------- /src/vector/Scalar.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved.
// 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef SCALAR_H 36 | #define SCALAR_H 37 | 38 | #include "Traits.h" 39 | 40 | namespace mitlm { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Wraps one value behind a vector-like interface: length() is 1, operator[] returns the value for any index, and the iterator always dereferences to that value. 44 | template 45 | class Scalar { 46 | public: 47 | class ConstIterator { 48 | public: 49 | ConstIterator(T value) : _value(value) { } 50 | ConstIterator & operator++() { return *this; } 51 | T operator*() const { return _value; } 52 | bool operator==(const ConstIterator &iter) const { return true; } 53 | bool operator!=(const ConstIterator &iter) const { return false; } 54 | // NOTE(review): ++ is a no-op and any two iterators compare equal, so a loop written as "while (it != end())" over a Scalar executes zero times - confirm callers rely on length()/operator[] or an external bound instead. 55 | private: 56 | T _value; 57 | }; 58 | 59 | typedef T ElementType; 60 | typedef Scalar Impl; 61 | 62 | Scalar(T value) : _value(value) { } 63 | const Impl & impl() const { return *this; } 64 | operator T() const { return _value; } 65 | size_t length() const { return 1; } 66 | ConstIterator begin() const { return ConstIterator(_value); } 67 | ConstIterator end() const { return ConstIterator(_value); } 68 | ElementType operator[](size_t index) const { return _value; } 69 | 70 | private: 71 | T _value; 72 | }; 73 | 74 | //////////////////////////////////////////////////////////////////////////////// 75 | // Ref specialization for Scalar: Type is the Scalar type itself. 76 | template 77 | struct Ref > { 78 | typedef Scalar Type; 79 | }; 80 | 81 | } 82 | 83 | #endif // SCALAR_H 84 | -------------------------------------------------------------------------------- /src/vector/Traits.h: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved.
// 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef TRAITS_H 36 | #define TRAITS_H 37 | // Ref: Type defaults to "const A &"; the explicit specializations below name the plain type instead (void, short, int, long, float, double). 38 | namespace mitlm { 39 | template struct Ref { typedef const A &Type; }; 40 | template <> struct Ref { typedef void Type; }; 41 | template <> struct Ref { typedef short Type; }; 42 | template <> struct Ref { typedef int Type; }; 43 | template <> struct Ref { typedef long Type; }; 44 | template <> struct Ref { typedef float Type; }; 45 | template <> struct Ref { typedef double Type; }; 46 | // Promotion: the primary template defines no Type; the partial specialization yields A, and the explicit specializations yield int, long, float or double. 47 | template struct Promotion { }; 48 | template struct Promotion { typedef A Type; }; 49 | template <> struct Promotion { typedef int Type; }; 50 | template <> struct Promotion { typedef int Type; }; 51 | template <> struct Promotion { typedef long Type; }; 52 | template <> struct Promotion { typedef long Type; }; 53 | template <> struct Promotion { typedef long Type; }; 54 | template <> struct Promotion { typedef long Type; }; 55 | template <> struct Promotion { typedef float Type; }; 56 | template <> struct Promotion { typedef float Type; }; 57 | template <> struct Promotion { typedef float Type; }; 58 | template <> struct Promotion { typedef float Type; }; 59 | template <> struct Promotion { typedef double Type; }; 60 | template <> struct Promotion { typedef double Type; }; 61 | template <> struct Promotion { typedef double Type; }; 62 | template <> struct Promotion { typedef double Type; }; 63 | template <> struct Promotion { typedef double Type; }; 64 | template <> struct Promotion { typedef double Type; }; 65 | template <> struct Promotion { typedef double Type; }; 66 | template <> struct Promotion { typedef double Type; }; 67 | template <> struct Promotion { typedef double Type; }; 68 | template <> struct Promotion { typedef double Type; }; 69 | 70 | } 71 | 72 | #endif // TRAITS_H 73 | -------------------------------------------------------------------------------- /src/vector/Vector.h:
-------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef VECTOR_H 36 | #define VECTOR_H 37 | 38 | #include "Traits.h" 39 | #include "Scalar.h" 40 | 41 | namespace mitlm { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // Default TypeInfo: a type is its own Impl and supplies its own ElementType. 45 | template 46 | struct TypeInfo { 47 | typedef A Impl; 48 | typedef typename Impl::ElementType ElementType; 49 | }; 50 | 51 | //////////////////////////////////////////////////////////////////////////////// 52 | // Common base class of the vector types. It stores no data; impl() static_casts *this to the Impl type obtained from TypeInfo. 53 | template 54 | class Vector { 55 | public: 56 | typedef typename TypeInfo::ElementType ElementType; 57 | typedef Scalar ScalarType; 58 | typedef typename TypeInfo::Impl Impl; 59 | 60 | virtual ~Vector() { } 61 | const Impl & impl() const { return static_cast(*this); } 62 | Impl & impl() { return static_cast(*this); } 63 | }; 64 | 65 | //////////////////////////////////////////////////////////////////////////////// 66 | // TypeInfo for a Vector wrapper: Impl and ElementType are forwarded from a nested TypeInfo. 67 | template 68 | struct TypeInfo > { 69 | typedef typename TypeInfo::Impl Impl; 70 | typedef typename TypeInfo::ElementType ElementType; 71 | }; 72 | 73 | } 74 | 75 | #endif // VECTOR_H 76 | -------------------------------------------------------------------------------- /src/vector/VectorBuilder.h: -------------------------------------------------------------------------------- 1 |
//////////////////////////////////////////////////////////////////////////// 2 | // Copyright (c) 2008, Massachusetts Institute of Technology // 3 | // All rights reserved. // 4 | // // 5 | // Redistribution and use in source and binary forms, with or without // 6 | // modification, are permitted provided that the following conditions are // 7 | // met: // 8 | // // 9 | // * Redistributions of source code must retain the above copyright // 10 | // notice, this list of conditions and the following disclaimer. // 11 | // // 12 | // * Redistributions in binary form must reproduce the above // 13 | // copyright notice, this list of conditions and the following // 14 | // disclaimer in the documentation and/or other materials provided // 15 | // with the distribution. // 16 | // // 17 | // * Neither the name of the Massachusetts Institute of Technology // 18 | // nor the names of its contributors may be used to endorse or // 19 | // promote products derived from this software without specific // 20 | // prior written permission. // 21 | // // 22 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // 23 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // 24 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // 25 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // 26 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // 27 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // 28 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // 29 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // 30 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // 31 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // 32 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 33 | //////////////////////////////////////////////////////////////////////////// 34 | 35 | #ifndef VECTORBUILDER_H 36 | #define VECTORBUILDER_H 37 | 38 | #include 39 | #include "util/BitOps.h" 40 | #include "Vector.h" 41 | #include "DenseVector.h" 42 | 43 | namespace mitlm { 44 | 45 | //////////////////////////////////////////////////////////////////////////////// 46 | // Append-only vector. Elements are written into a DenseVector buffer that starts with at least 16 slots and is resized through nextPowerOf2() whenever an append would exceed it; _length counts the elements actually appended. 47 | template 48 | class VectorBuilder : public Vector > { 49 | public: 50 | typedef T ElementType; 51 | typedef const T * ConstIterator; 52 | 53 | VectorBuilder(size_t capacity=16) : _length(0) { 54 | _vector.resize(std::max(capacity, (size_t)16)); 55 | } 56 | // Appends count copies of value, growing the buffer first if length() + count exceeds its current size. 57 | void append(typename Ref::Type value, size_t count=1) { 58 | if (length() + count > _vector.length()) 59 | _vector.resize(nextPowerOf2(length() + count - 1)); 60 | std::fill_n(_vector.begin() + _length, count, value); 61 | _length += count; 62 | } 63 | // Appends every element of rhs (copied through its begin()/end() iterators), growing the buffer first if needed. 64 | template 65 | void append(const Vector &rhs) { 66 | size_t newLength = _length + rhs.impl().length(); 67 | if (newLength > _vector.length()) 68 | _vector.resize(nextPowerOf2(newLength - 1)); 69 | 70 | typename Vector::Impl::ConstIterator rBegin = rhs.impl().begin(); 71 | typename Vector::Impl::ConstIterator rEnd = rhs.impl().end(); 72 | T *lBegin = _vector.begin() + _length; 73 | while (rBegin != rEnd) 74 | *lBegin++ = *rBegin++; 75 | _length = newLength; 76 | } 77 | // Accessors describe only the _length appended elements: end() is begin() + _length, not the end of the larger backing buffer, so [begin(), end()) agrees with length(). 78 | size_t length() const { return _length; } 79 | ConstIterator begin() const { return _vector.begin(); } 80 | ConstIterator end() const { return _vector.begin() + _length; } 81 | 82 | protected: 83 | size_t _length; 84 | DenseVector _vector; 85 | }; 86 | 87 | //////////////////////////////////////////////////////////////////////////////// 88 | // TypeInfo specialization for VectorBuilder: Impl is the builder type itself and ElementType is T. 89 | template 90 | struct TypeInfo > { 91 | typedef VectorBuilder Impl; 92 | typedef T ElementType; 93 | }; 94 | 95 | } 96 | 97 | #endif // VECTORBUILDER_H 98 | -------------------------------------------------------------------------------- /tests/data/small.txt:
-------------------------------------------------------------------------------- 1 | a b c d 2 | c d e 3 | d e 4 | -------------------------------------------------------------------------------- /tests/data/small.vocab: -------------------------------------------------------------------------------- 1 | a 2 | b 3 | c 4 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wc.a.hyp: -------------------------------------------------------------------------------- 1 | 3 2 | a 1 3 | b 1 4 | c 2 5 | d 3 6 | e 2 7 | a 1 8 | c 1 9 | d 1 10 | a b 1 11 | b c 1 12 | c d 2 13 | d 1 14 | d e 2 15 | e 2 16 | a b 1 17 | c d 1 18 | d e 1 19 | a b c 1 20 | b c d 1 21 | c d 1 22 | c d e 1 23 | d e 2 24 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wc.b.hyp: -------------------------------------------------------------------------------- 1 | 3 2 | a 1 3 | b 1 4 | c 2 5 | a 1 6 | c 1 7 | a b 1 8 | b c 1 9 | a b 1 10 | a b c 1 11 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wec.a.hyp: -------------------------------------------------------------------------------- 1 | 2 2 | a 1 3 | b 1 4 | c 2 5 | d 2 6 | e 1 7 | a 1 8 | c 1 9 | d 1 10 | a b 1 11 | b c 1 12 | c d 2 13 | d 1 14 | d e 2 15 | e 1 16 | a b 1 17 | c d 1 18 | d e 1 19 | a b c 1 20 | b c d 1 21 | c d 1 22 | c d e 1 23 | d e 2 24 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wec.b.hyp: -------------------------------------------------------------------------------- 1 | 3 2 | a 1 3 | b 1 4 | c 2 5 | a 1 6 | c 1 7 | a b 1 8 | b c 1 9 | a b 1 10 | a b c 1 11 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wl.a.hyp: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=7 4 | ngram 2=9 5 | ngram 
3=8 6 | 7 | \1-grams: 8 | -0.887296 9 | -99 -0.196295 10 | -0.691001 a -0.196295 11 | -0.691001 b -0.196295 12 | -0.887296 c 13 | -0.887296 d -0.056116 14 | -0.691001 e -0.196295 15 | 16 | \2-grams: 17 | -0.600600 a -0.109144 18 | -0.691001 c -0.109144 19 | -0.691001 d -0.109144 20 | -0.306919 a b -0.109144 21 | -0.350541 b c -0.109144 22 | -0.887296 c d -0.109144 23 | -0.628694 d 24 | -0.747117 d e 25 | -0.350541 e 26 | 27 | \3-grams: 28 | -0.217618 a b 29 | -0.490737 c d 30 | -0.441947 d e 31 | -0.244727 a b c 32 | -0.490737 b c d 33 | -0.531669 c d 34 | -0.601465 c d e 35 | -0.350541 d e 36 | 37 | \end\ 38 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wl.b.hyp: -------------------------------------------------------------------------------- 1 | 2 | \data\ 3 | ngram 1=5 4 | ngram 2=4 5 | ngram 3=2 6 | 7 | \1-grams: 8 | -0.793946 9 | -99 0.000000 10 | -0.634245 a 11 | -0.634245 b 12 | -0.425969 c 13 | 14 | \2-grams: 15 | -0.634245 a 16 | -0.425969 c 17 | -0.634245 a b 18 | -0.425969 b c 19 | 20 | \3-grams: 21 | -0.634245 a b 22 | -0.425969 a b c 23 | 24 | \end\ 25 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wlc.a.hyp: -------------------------------------------------------------------------------- 1 | 6 2 | 2 3 | a 1 4 | b 1 5 | c 2 6 | d 2 7 | e 1 8 | a 0 9 | c 0 10 | d 0 11 | a b 1 12 | b c 1 13 | c d 2 14 | d 1 15 | d e 2 16 | e 1 17 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wlc.b.hyp: -------------------------------------------------------------------------------- 1 | 4 2 | 0 3 | a 1 4 | b 1 5 | c 2 6 | a 0 7 | c 0 8 | a b 1 9 | b c 1 10 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wrc.a.hyp: -------------------------------------------------------------------------------- 1 | 6 2 | 3 3 | a 1 4 | b 1 5 | c 1 6 | d 2 7 | e 
1 8 | a 1 9 | c 1 10 | d 1 11 | a b 1 12 | b c 1 13 | c d 2 14 | d 0 15 | d e 1 16 | e 0 17 | -------------------------------------------------------------------------------- /tests/data/test1_ref/wrc.b.hyp: -------------------------------------------------------------------------------- 1 | 4 2 | 2 3 | a 1 4 | b 1 5 | c 0 6 | a 1 7 | c 0 8 | a b 1 9 | b c 0 10 | -------------------------------------------------------------------------------- /tests/data/very_small.txt: -------------------------------------------------------------------------------- 1 | a 2 | -------------------------------------------------------------------------------- /tests/test1.test.in: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Regression test: runs estimate-ngram on tests/data/small.txt twice and diffs every generated file against the references in tests/data/test1_ref/. 3 | set -e; 4 | 5 | PATH=.:"$PATH" 6 | 7 | INPUT_DIR="@abs_top_srcdir@/"tests/data/ 8 | REFERENCE_DIR="$INPUT_DIR"test1_ref/ 9 | OUTPUT_DIR=tests/test1_output/ 10 | 11 | rm -fr "$OUTPUT_DIR" 12 | mkdir -p "$OUTPUT_DIR" 13 | # Run 1: no vocabulary file; writes the *.a.hyp outputs. 14 | estimate-ngram -t "$INPUT_DIR"small.txt \ 15 | -wc "$OUTPUT_DIR"wc.a.hyp -wec "$OUTPUT_DIR"wec.a.hyp -wlc "$OUTPUT_DIR"wlc.a.hyp -wrc "$OUTPUT_DIR"wrc.a.hyp -wl "$OUTPUT_DIR"wl.a.hyp \ 16 | > /dev/null 17 | # Run 2: same input plus -v small.vocab; writes the *.b.hyp outputs. 18 | estimate-ngram -t "$INPUT_DIR"small.txt -v "$INPUT_DIR"small.vocab \ 19 | -wc "$OUTPUT_DIR"wc.b.hyp -wec "$OUTPUT_DIR"wec.b.hyp -wlc "$OUTPUT_DIR"wlc.b.hyp -wrc "$OUTPUT_DIR"wrc.b.hyp -wl "$OUTPUT_DIR"wl.b.hyp \ 20 | > /dev/null 21 | # Compare each reference file with the output of the same name; because of set -e the first failing diff aborts the script before the cleanup below runs. 22 | for i in `ls "$REFERENCE_DIR"` 23 | do 24 | LC_ALL=C diff "$OUTPUT_DIR""$i" "$REFERENCE_DIR""$i" 25 | done 26 | 27 | rm -fr "$OUTPUT_DIR" 28 | 29 | exit 0; --------------------------------------------------------------------------------