├── .autoconf ├── ar-lib ├── compile ├── config.guess ├── config.sub ├── depcomp ├── install-sh ├── ltmain.sh ├── m4 │ ├── libtool.m4 │ ├── ltoptions.m4 │ ├── ltsugar.m4 │ ├── ltversion.m4 │ └── lt~obsolete.m4 └── missing ├── .dockerignore ├── .gitattributes ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── Makefile.am ├── Makefile.in ├── README.md ├── aclocal.m4 ├── configure ├── configure.ac ├── docs ├── doxygen.cfg └── mainpage.dox ├── python ├── phonetisaurus-module.py ├── phonetisaurus │ └── __init__.py ├── script │ ├── demo.html │ ├── g2pserver.py │ ├── phoneticize.py │ └── words.list └── setup.py ├── src ├── 3rdparty │ ├── rnnlm │ │ ├── COPYRIGHT.txt │ │ ├── rnnlmlib.cpp │ │ └── rnnlmlib.h │ └── utfcpp │ │ ├── utf8.h │ │ └── utf8 │ │ ├── checked.h │ │ ├── core.h │ │ └── unchecked.h ├── bin │ ├── phonetisaurus-align.cc │ ├── phonetisaurus-arpa2wfst.cc │ ├── phonetisaurus-g2pfst.cc │ ├── phonetisaurus-g2prnn.cc │ └── rnnlm.cc ├── include │ ├── ARPA2WFST.h │ ├── LatticePruner.h │ ├── LegacyRnnLMDecodable.h │ ├── LegacyRnnLMHash.h │ ├── LegacyRnnLMReader.h │ ├── M2MFstAligner.h │ ├── PhonetisaurusRex.h │ ├── PhonetisaurusScript.h │ ├── RnnLMDecoder.h │ ├── RnnLMPy.h │ └── util.h ├── lib │ ├── LatticePruner.cc │ ├── M2MFstAligner.cc │ ├── feature-reader.cc │ └── util.cc └── scripts │ ├── phonetisaurus-apply │ └── phonetisaurus-train └── test ├── check-nbest-wer.py └── g014b2b ├── g014b2b.ref ├── g014b2b.train └── g014b2b.words /.autoconf/ar-lib: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Wrapper for Microsoft lib.exe 3 | 4 | me=ar-lib 5 | scriptversion=2012-03-01.08; # UTC 6 | 7 | # Copyright (C) 2010-2017 Free Software Foundation, Inc. 8 | # Written by Peter Rosin . 
9 | # 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 22 | 23 | # As a special exception to the GNU General Public License, if you 24 | # distribute this file as part of a program that contains a 25 | # configuration script generated by Autoconf, you may include it under 26 | # the same distribution terms that you use for the rest of that program. 27 | 28 | # This file is maintained in Automake, please report 29 | # bugs to <bug-automake@gnu.org> or send patches to 30 | # <automake-patches@gnu.org>. 31 | 32 | 33 | # func_error message 34 | func_error () 35 | { 36 | echo "$me: $1" 1>&2 37 | exit 1 38 | } 39 | 40 | file_conv= 41 | 42 | # func_file_conv build_file 43 | # Convert a $build file to $host form and store it in $file 44 | # Currently only supports Windows hosts. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv in 65 | mingw) 66 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 67 | ;; 68 | cygwin) 69 | file=`cygpath -m "$file" || echo "$file"` 70 | ;; 71 | wine) 72 | file=`winepath -w "$file" || echo "$file"` 73 | ;; 74 | esac 75 | ;; 76 | esac 77 | } 78 | 79 | # func_at_file at_file operation archive 80 | # Iterate over all members in AT_FILE performing OPERATION on ARCHIVE 81 | # for each of them. 82 | # When interpreting the content of the @FILE, do NOT use func_file_conv, 83 | # since the user would need to supply preconverted file names to 84 | # binutils ar, at least for MinGW. 85 | func_at_file () 86 | { 87 | operation=$2 88 | archive=$3 89 | at_file_contents=`cat "$1"` 90 | eval set x "$at_file_contents" 91 | shift 92 | 93 | for member 94 | do 95 | $AR -NOLOGO $operation:"$member" "$archive" || exit $? 96 | done 97 | } 98 | 99 | case $1 in 100 | '') 101 | func_error "no command. Try '$0 --help' for more information." 102 | ;; 103 | -h | --h*) 104 | cat <. 8 | # 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 
18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | # This file is maintained in Automake, please report 28 | # bugs to or send patches to 29 | # . 30 | 31 | nl=' 32 | ' 33 | 34 | # We need space, tab and new line, in precisely that order. Quoting is 35 | # there to prevent tools from complaining about whitespace usage. 36 | IFS=" "" $nl" 37 | 38 | file_conv= 39 | 40 | # func_file_conv build_file lazy 41 | # Convert a $build file to $host form and store it in $file 42 | # Currently only supports Windows hosts. If the determined conversion 43 | # type is listed in (the comma separated) LAZY, no conversion will 44 | # take place. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv/,$2, in 65 | *,$file_conv,*) 66 | ;; 67 | mingw/*) 68 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 69 | ;; 70 | cygwin/*) 71 | file=`cygpath -m "$file" || echo "$file"` 72 | ;; 73 | wine/*) 74 | file=`winepath -w "$file" || echo "$file"` 75 | ;; 76 | esac 77 | ;; 78 | esac 79 | } 80 | 81 | # func_cl_dashL linkdir 82 | # Make cl look for libraries in LINKDIR 83 | func_cl_dashL () 84 | { 85 | func_file_conv "$1" 86 | if test -z "$lib_path"; then 87 | lib_path=$file 88 | else 89 | lib_path="$lib_path;$file" 90 | fi 91 | linker_opts="$linker_opts -LIBPATH:$file" 92 | } 93 | 94 | # func_cl_dashl library 95 | # Do a library search-path lookup for cl 96 | func_cl_dashl () 97 | { 98 | lib=$1 99 | found=no 100 | save_IFS=$IFS 101 | IFS=';' 102 | for dir in $lib_path $LIB 103 | do 104 | IFS=$save_IFS 105 | if $shared && test -f "$dir/$lib.dll.lib"; then 106 | found=yes 107 | lib=$dir/$lib.dll.lib 108 | break 109 | fi 110 | if test -f "$dir/$lib.lib"; then 111 | found=yes 112 | lib=$dir/$lib.lib 113 | break 114 | fi 115 | if test -f "$dir/lib$lib.a"; then 116 | found=yes 117 | lib=$dir/lib$lib.a 118 | break 119 | fi 120 | done 121 | IFS=$save_IFS 122 | 123 | if test "$found" != yes; then 124 | lib=$lib.lib 125 | fi 126 | } 127 | 128 | # func_cl_wrapper cl arg... 
129 | # Adjust compile command to suit cl 130 | func_cl_wrapper () 131 | { 132 | # Assume a capable shell 133 | lib_path= 134 | shared=: 135 | linker_opts= 136 | for arg 137 | do 138 | if test -n "$eat"; then 139 | eat= 140 | else 141 | case $1 in 142 | -o) 143 | # configure might choose to run compile as 'compile cc -o foo foo.c'. 144 | eat=1 145 | case $2 in 146 | *.o | *.[oO][bB][jJ]) 147 | func_file_conv "$2" 148 | set x "$@" -Fo"$file" 149 | shift 150 | ;; 151 | *) 152 | func_file_conv "$2" 153 | set x "$@" -Fe"$file" 154 | shift 155 | ;; 156 | esac 157 | ;; 158 | -I) 159 | eat=1 160 | func_file_conv "$2" mingw 161 | set x "$@" -I"$file" 162 | shift 163 | ;; 164 | -I*) 165 | func_file_conv "${1#-I}" mingw 166 | set x "$@" -I"$file" 167 | shift 168 | ;; 169 | -l) 170 | eat=1 171 | func_cl_dashl "$2" 172 | set x "$@" "$lib" 173 | shift 174 | ;; 175 | -l*) 176 | func_cl_dashl "${1#-l}" 177 | set x "$@" "$lib" 178 | shift 179 | ;; 180 | -L) 181 | eat=1 182 | func_cl_dashL "$2" 183 | ;; 184 | -L*) 185 | func_cl_dashL "${1#-L}" 186 | ;; 187 | -static) 188 | shared=false 189 | ;; 190 | -Wl,*) 191 | arg=${1#-Wl,} 192 | save_ifs="$IFS"; IFS=',' 193 | for flag in $arg; do 194 | IFS="$save_ifs" 195 | linker_opts="$linker_opts $flag" 196 | done 197 | IFS="$save_ifs" 198 | ;; 199 | -Xlinker) 200 | eat=1 201 | linker_opts="$linker_opts $2" 202 | ;; 203 | -*) 204 | set x "$@" "$1" 205 | shift 206 | ;; 207 | *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) 208 | func_file_conv "$1" 209 | set x "$@" -Tp"$file" 210 | shift 211 | ;; 212 | *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) 213 | func_file_conv "$1" mingw 214 | set x "$@" "$file" 215 | shift 216 | ;; 217 | *) 218 | set x "$@" "$1" 219 | shift 220 | ;; 221 | esac 222 | fi 223 | shift 224 | done 225 | if test -n "$linker_opts"; then 226 | linker_opts="-link$linker_opts" 227 | fi 228 | exec "$@" $linker_opts 229 | exit 1 230 | } 231 | 232 | eat= 233 | 234 | case $1 in 235 | '') 236 | echo "$0: No command. 
Try '$0 --help' for more information." 1>&2 237 | exit 1; 238 | ;; 239 | -h | --h*) 240 | cat <<\EOF 241 | Usage: compile [--help] [--version] PROGRAM [ARGS] 242 | 243 | Wrapper for compilers which do not understand '-c -o'. 244 | Remove '-o dest.o' from ARGS, run PROGRAM with the remaining 245 | arguments, and rename the output as expected. 246 | 247 | If you are trying to build a whole package this is not the 248 | right script to run: please start by reading the file 'INSTALL'. 249 | 250 | Report bugs to <bug-automake@gnu.org>. 251 | EOF 252 | exit $? 253 | ;; 254 | -v | --v*) 255 | echo "compile $scriptversion" 256 | exit $? 257 | ;; 258 | cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) 259 | func_cl_wrapper "$@" # Doesn't return... 260 | ;; 261 | esac 262 | 263 | ofile= 264 | cfile= 265 | 266 | for arg 267 | do 268 | if test -n "$eat"; then 269 | eat= 270 | else 271 | case $1 in 272 | -o) 273 | # configure might choose to run compile as 'compile cc -o foo foo.c'. 274 | # So we strip '-o arg' only if arg is an object. 275 | eat=1 276 | case $2 in 277 | *.o | *.obj) 278 | ofile=$2 279 | ;; 280 | *) 281 | set x "$@" -o "$2" 282 | shift 283 | ;; 284 | esac 285 | ;; 286 | *.c) 287 | cfile=$1 288 | set x "$@" "$1" 289 | shift 290 | ;; 291 | *) 292 | set x "$@" "$1" 293 | shift 294 | ;; 295 | esac 296 | fi 297 | shift 298 | done 299 | 300 | if test -z "$ofile" || test -z "$cfile"; then 301 | # If no '-o' option was seen then we might have been invoked from a 302 | # pattern rule where we don't need one. That is ok -- this is a 303 | # normal compilation that the losing compiler can handle. If no 304 | # '.c' file was seen then we are probably linking. That is also 305 | # ok. 306 | exec "$@" 307 | fi 308 | 309 | # Name of file we expect compiler to create. 310 | cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` 311 | 312 | # Create the lock directory. 313 | # Note: use '[/\\:.-]' here to ensure that we don't use the same name 314 | # that we are using for the .o file. 
Also, base the name on the expected 315 | # object file name, since that is what matters with a parallel build. 316 | lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d 317 | while true; do 318 | if mkdir "$lockdir" >/dev/null 2>&1; then 319 | break 320 | fi 321 | sleep 1 322 | done 323 | # FIXME: race condition here if user kills between mkdir and trap. 324 | trap "rmdir '$lockdir'; exit 1" 1 2 15 325 | 326 | # Run the compile. 327 | "$@" 328 | ret=$? 329 | 330 | if test -f "$cofile"; then 331 | test "$cofile" = "$ofile" || mv "$cofile" "$ofile" 332 | elif test -f "${cofile}bj"; then 333 | test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" 334 | fi 335 | 336 | rmdir "$lockdir" 337 | exit $ret 338 | 339 | # Local Variables: 340 | # mode: shell-script 341 | # sh-indentation: 2 342 | # eval: (add-hook 'write-file-hooks 'time-stamp) 343 | # time-stamp-start: "scriptversion=" 344 | # time-stamp-format: "%:y-%02m-%02d.%02H" 345 | # time-stamp-time-zone: "UTC" 346 | # time-stamp-end: "; # UTC" 347 | # End: 348 | -------------------------------------------------------------------------------- /.autoconf/m4/ltsugar.m4: -------------------------------------------------------------------------------- 1 | # ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software 4 | # Foundation, Inc. 5 | # Written by Gary V. Vaughan, 2004 6 | # 7 | # This file is free software; the Free Software Foundation gives 8 | # unlimited permission to copy and/or distribute it, with or without 9 | # modifications, as long as this notice is preserved. 10 | 11 | # serial 6 ltsugar.m4 12 | 13 | # This is to help aclocal find these macros, as it can't see m4_define. 14 | AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])]) 15 | 16 | 17 | # lt_join(SEP, ARG1, [ARG2...]) 18 | # ----------------------------- 19 | # Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their 20 | # associated separator. 
21 | # Needed until we can rely on m4_join from Autoconf 2.62, since all earlier 22 | # versions in m4sugar had bugs. 23 | m4_define([lt_join], 24 | [m4_if([$#], [1], [], 25 | [$#], [2], [[$2]], 26 | [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])]) 27 | m4_define([_lt_join], 28 | [m4_if([$#$2], [2], [], 29 | [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])]) 30 | 31 | 32 | # lt_car(LIST) 33 | # lt_cdr(LIST) 34 | # ------------ 35 | # Manipulate m4 lists. 36 | # These macros are necessary as long as we still need to support 37 | # Autoconf-2.59, which quotes differently. 38 | m4_define([lt_car], [[$1]]) 39 | m4_define([lt_cdr], 40 | [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], 41 | [$#], 1, [], 42 | [m4_dquote(m4_shift($@))])]) 43 | m4_define([lt_unquote], $1) 44 | 45 | 46 | # lt_append(MACRO-NAME, STRING, [SEPARATOR]) 47 | # ------------------------------------------ 48 | # Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'. 49 | # Note that neither SEPARATOR nor STRING are expanded; they are appended 50 | # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). 51 | # No SEPARATOR is output if MACRO-NAME was previously undefined (different 52 | # than defined and empty). 53 | # 54 | # This macro is needed until we can rely on Autoconf 2.62, since earlier 55 | # versions of m4sugar mistakenly expanded SEPARATOR but not STRING. 56 | m4_define([lt_append], 57 | [m4_define([$1], 58 | m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])]) 59 | 60 | 61 | 62 | # lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...]) 63 | # ---------------------------------------------------------- 64 | # Produce a SEP delimited list of all paired combinations of elements of 65 | # PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list 66 | # has the form PREFIXmINFIXSUFFIXn. 67 | # Needed until we can rely on m4_combine added in Autoconf 2.62. 
68 | m4_define([lt_combine], 69 | [m4_if(m4_eval([$# > 3]), [1], 70 | [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl 71 | [[m4_foreach([_Lt_prefix], [$2], 72 | [m4_foreach([_Lt_suffix], 73 | ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[, 74 | [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])]) 75 | 76 | 77 | # lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ]) 78 | # ----------------------------------------------------------------------- 79 | # Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited 80 | # by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ. 81 | m4_define([lt_if_append_uniq], 82 | [m4_ifdef([$1], 83 | [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1], 84 | [lt_append([$1], [$2], [$3])$4], 85 | [$5])], 86 | [lt_append([$1], [$2], [$3])$4])]) 87 | 88 | 89 | # lt_dict_add(DICT, KEY, VALUE) 90 | # ----------------------------- 91 | m4_define([lt_dict_add], 92 | [m4_define([$1($2)], [$3])]) 93 | 94 | 95 | # lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE) 96 | # -------------------------------------------- 97 | m4_define([lt_dict_add_subkey], 98 | [m4_define([$1($2:$3)], [$4])]) 99 | 100 | 101 | # lt_dict_fetch(DICT, KEY, [SUBKEY]) 102 | # ---------------------------------- 103 | m4_define([lt_dict_fetch], 104 | [m4_ifval([$3], 105 | m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]), 106 | m4_ifdef([$1($2)], [m4_defn([$1($2)])]))]) 107 | 108 | 109 | # lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE]) 110 | # ----------------------------------------------------------------- 111 | m4_define([lt_if_dict_fetch], 112 | [m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4], 113 | [$5], 114 | [$6])]) 115 | 116 | 117 | # lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...]) 118 | # -------------------------------------------------------------- 119 | m4_define([lt_dict_filter], 120 | [m4_if([$5], [], [], 121 | [lt_join(m4_quote(m4_default([$4], [[, ]])), 
122 | lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]), 123 | [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl 124 | ]) 125 | -------------------------------------------------------------------------------- /.autoconf/m4/ltversion.m4: -------------------------------------------------------------------------------- 1 | # ltversion.m4 -- version numbers -*- Autoconf -*- 2 | # 3 | # Copyright (C) 2004, 2011-2015 Free Software Foundation, Inc. 4 | # Written by Scott James Remnant, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # @configure_input@ 11 | 12 | # serial 4179 ltversion.m4 13 | # This file is part of GNU Libtool 14 | 15 | m4_define([LT_PACKAGE_VERSION], [2.4.6]) 16 | m4_define([LT_PACKAGE_REVISION], [2.4.6]) 17 | 18 | AC_DEFUN([LTVERSION_VERSION], 19 | [macro_version='2.4.6' 20 | macro_revision='2.4.6' 21 | _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?]) 22 | _LT_DECL(, macro_revision, 0) 23 | ]) 24 | -------------------------------------------------------------------------------- /.autoconf/m4/lt~obsolete.m4: -------------------------------------------------------------------------------- 1 | # lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software 4 | # Foundation, Inc. 5 | # Written by Scott James Remnant, 2004. 6 | # 7 | # This file is free software; the Free Software Foundation gives 8 | # unlimited permission to copy and/or distribute it, with or without 9 | # modifications, as long as this notice is preserved. 10 | 11 | # serial 5 lt~obsolete.m4 12 | 13 | # These exist entirely to fool aclocal when bootstrapping libtool. 
14 | # 15 | # In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN), 16 | # which have later been changed to m4_define as they aren't part of the 17 | # exported API, or moved to Autoconf or Automake where they belong. 18 | # 19 | # The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN 20 | # in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us 21 | # using a macro with the same name in our local m4/libtool.m4 it'll 22 | # pull the old libtool.m4 in (it doesn't see our shiny new m4_define 23 | # and doesn't know about Autoconf macros at all.) 24 | # 25 | # So we provide this file, which has a silly filename so it's always 26 | # included after everything else. This provides aclocal with the 27 | # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything 28 | # because those macros already exist, or will be overwritten later. 29 | # We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 30 | # 31 | # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. 32 | # Yes, that means every name once taken will need to remain here until 33 | # we give up compatibility with versions before 1.7, at which point 34 | # we need to keep only those names which we still refer to. 35 | 36 | # This is to help aclocal find these macros, as it can't see m4_define. 
37 | AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])]) 38 | 39 | m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])]) 40 | m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])]) 41 | m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])]) 42 | m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])]) 43 | m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])]) 44 | m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])]) 45 | m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])]) 46 | m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])]) 47 | m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])]) 48 | m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])]) 49 | m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])]) 50 | m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])]) 51 | m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])]) 52 | m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])]) 53 | m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])]) 54 | m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])]) 55 | m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])]) 56 | m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])]) 57 | m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])]) 58 | m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])]) 59 | m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])]) 60 | m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])]) 61 | m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])]) 62 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])]) 63 | m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])]) 64 | 
m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])]) 65 | m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], [AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])]) 66 | m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])]) 67 | m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])]) 68 | m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])]) 69 | m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])]) 70 | m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])]) 71 | m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])]) 72 | m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])]) 73 | m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])]) 74 | m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])]) 75 | m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])]) 76 | m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])]) 77 | m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])]) 78 | m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])]) 79 | m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])]) 80 | m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])]) 81 | m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])]) 82 | m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])]) 83 | m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])]) 84 | m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])]) 85 | m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])]) 86 | m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])]) 87 | m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])]) 88 | m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])]) 89 | m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])]) 90 | 
m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])]) 91 | m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])]) 92 | m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])]) 93 | m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], [AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])]) 94 | m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])]) 95 | m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])]) 96 | m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])]) 97 | m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])]) 98 | m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])]) 99 | m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])]) 100 | -------------------------------------------------------------------------------- /.autoconf/missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common wrapper for a few potentially missing GNU programs. 3 | 4 | scriptversion=2013-10-28.13; # UTC 5 | 6 | # Copyright (C) 1996-2014 Free Software Foundation, Inc. 7 | # Originally written by Fran,cois Pinard , 1996. 8 | 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 
21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | if test $# -eq 0; then 28 | echo 1>&2 "Try '$0 --help' for more information" 29 | exit 1 30 | fi 31 | 32 | case $1 in 33 | 34 | --is-lightweight) 35 | # Used by our autoconf macros to check whether the available missing 36 | # script is modern enough. 37 | exit 0 38 | ;; 39 | 40 | --run) 41 | # Back-compat with the calling convention used by older automake. 42 | shift 43 | ;; 44 | 45 | -h|--h|--he|--hel|--help) 46 | echo "\ 47 | $0 [OPTION]... PROGRAM [ARGUMENT]... 48 | 49 | Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due 50 | to PROGRAM being missing or too old. 51 | 52 | Options: 53 | -h, --help display this help and exit 54 | -v, --version output version information and exit 55 | 56 | Supported PROGRAM values: 57 | aclocal autoconf autoheader autom4te automake makeinfo 58 | bison yacc flex lex help2man 59 | 60 | Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and 61 | 'g' are ignored when checking the name. 62 | 63 | Send bug reports to <bug-automake@gnu.org>." 64 | exit $? 65 | ;; 66 | 67 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 68 | echo "missing $scriptversion (GNU Automake)" 69 | exit $? 70 | ;; 71 | 72 | -*) 73 | echo 1>&2 "$0: unknown '$1' option" 74 | echo 1>&2 "Try '$0 --help' for more information" 75 | exit 1 76 | ;; 77 | 78 | esac 79 | 80 | # Run the given program, remember its exit status. 81 | "$@"; st=$? 82 | 83 | # If it succeeded, we are done. 84 | test $st -eq 0 && exit 0 85 | 86 | # Also exit now if it failed (or wasn't found), and '--version' was 87 | # passed; such an option is passed most likely to detect whether the 88 | # program is present and works. 
89 | case $2 in --version|--help) exit $st;; esac 90 | 91 | # Exit code 63 means version mismatch. This often happens when the user 92 | # tries to use an ancient version of a tool on a file that requires a 93 | # minimum version. 94 | if test $st -eq 63; then 95 | msg="probably too old" 96 | elif test $st -eq 127; then 97 | # Program was missing. 98 | msg="missing on your system" 99 | else 100 | # Program was found and executed, but failed. Give up. 101 | exit $st 102 | fi 103 | 104 | perl_URL=http://www.perl.org/ 105 | flex_URL=http://flex.sourceforge.net/ 106 | gnu_software_URL=http://www.gnu.org/software 107 | 108 | program_details () 109 | { 110 | case $1 in 111 | aclocal|automake) 112 | echo "The '$1' program is part of the GNU Automake package:" 113 | echo "<$gnu_software_URL/automake>" 114 | echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" 115 | echo "<$gnu_software_URL/autoconf>" 116 | echo "<$gnu_software_URL/m4/>" 117 | echo "<$perl_URL>" 118 | ;; 119 | autoconf|autom4te|autoheader) 120 | echo "The '$1' program is part of the GNU Autoconf package:" 121 | echo "<$gnu_software_URL/autoconf/>" 122 | echo "It also requires GNU m4 and Perl in order to run:" 123 | echo "<$gnu_software_URL/m4/>" 124 | echo "<$perl_URL>" 125 | ;; 126 | esac 127 | } 128 | 129 | give_advice () 130 | { 131 | # Normalize program name to check for. 132 | normalized_program=`echo "$1" | sed ' 133 | s/^gnu-//; t 134 | s/^gnu//; t 135 | s/^g//; t'` 136 | 137 | printf '%s\n' "'$1' is $msg." 138 | 139 | configure_deps="'configure.ac' or m4 files included by 'configure.ac'" 140 | case $normalized_program in 141 | autoconf*) 142 | echo "You should only need it if you modified 'configure.ac'," 143 | echo "or m4 files included by it." 144 | program_details 'autoconf' 145 | ;; 146 | autoheader*) 147 | echo "You should only need it if you modified 'acconfig.h' or" 148 | echo "$configure_deps." 
149 | program_details 'autoheader' 150 | ;; 151 | automake*) 152 | echo "You should only need it if you modified 'Makefile.am' or" 153 | echo "$configure_deps." 154 | program_details 'automake' 155 | ;; 156 | aclocal*) 157 | echo "You should only need it if you modified 'acinclude.m4' or" 158 | echo "$configure_deps." 159 | program_details 'aclocal' 160 | ;; 161 | autom4te*) 162 | echo "You might have modified some maintainer files that require" 163 | echo "the 'autom4te' program to be rebuilt." 164 | program_details 'autom4te' 165 | ;; 166 | bison*|yacc*) 167 | echo "You should only need it if you modified a '.y' file." 168 | echo "You may want to install the GNU Bison package:" 169 | echo "<$gnu_software_URL/bison/>" 170 | ;; 171 | lex*|flex*) 172 | echo "You should only need it if you modified a '.l' file." 173 | echo "You may want to install the Fast Lexical Analyzer package:" 174 | echo "<$flex_URL>" 175 | ;; 176 | help2man*) 177 | echo "You should only need it if you modified a dependency" \ 178 | "of a man page." 179 | echo "You may want to install the GNU Help2man package:" 180 | echo "<$gnu_software_URL/help2man/>" 181 | ;; 182 | makeinfo*) 183 | echo "You should only need it if you modified a '.texi' file, or" 184 | echo "any other file indirectly affecting the aspect of the manual." 185 | echo "You might want to install the Texinfo package:" 186 | echo "<$gnu_software_URL/texinfo/>" 187 | echo "The spurious makeinfo call might also be the consequence of" 188 | echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" 189 | echo "want to install GNU make:" 190 | echo "<$gnu_software_URL/make/>" 191 | ;; 192 | *) 193 | echo "You might have modified some files without having the proper" 194 | echo "tools for further handling them. Check the 'README' file, it" 195 | echo "often tells you about the needed prerequisites for installing" 196 | echo "this package. 
You may also peek at any GNU archive site, in" 197 | echo "case some other package contains this missing '$1' program." 198 | ;; 199 | esac 200 | } 201 | 202 | give_advice "$1" | sed -e '1s/^/WARNING: /' \ 203 | -e '2,$s/^/ /' >&2 204 | 205 | # Propagate the correct exit status (expected to be 127 for a program 206 | # not found, 63 for a program that failed due to version mismatch). 207 | exit $st 208 | 209 | # Local variables: 210 | # eval: (add-hook 'write-file-hooks 'time-stamp) 211 | # time-stamp-start: "scriptversion=" 212 | # time-stamp-format: "%:y-%02m-%02d.%02H" 213 | # time-stamp-time-zone: "UTC" 214 | # time-stamp-end: "; # UTC" 215 | # End: 216 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdolfVonKleist/Phonetisaurus/f08d3dfb10b8d619e665a9581d2a327bcc2504f7/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .*\.o$ 3 | .*\.arpa$ 4 | .*\.fst$ 5 | .*\.fst.txt$ 6 | .*\.dylib$ 7 | .*\.so 8 | .*-binding.cc 9 | src/bin/phonetisaurus-align 10 | src/bin/phonetisaurus-arpa2wfst 11 | src/bin/phonetisaurus-g2pfst 12 | src/bin/phonetisaurus-g2prnn 13 | src/bin/rnnlm 14 | .*\.pyc$ 15 | models/ 16 | .*\.egg-info/ 17 | build/ 18 | dist/ 19 | src/data 20 | src/rnnlm.direct$ 21 | *.pyc 22 | phonetisaurus.egg-info/ 23 | rnnlm.egg-info/ 24 | *.fst 25 | *.o 26 | *.so 27 | *-binding.cc 28 | nohup.out 29 | src/.autoconf/autom4te.cache 30 | src/.autoconf/config.log 31 | src/.autoconf/config.status 32 | src/.autoconf/configure 33 | src/Makefile 34 | html/ 35 | exp/ 36 | 
phonetisaurus.egg-info/ 37 | config.log 38 | config.status 39 | autom4te.cache 40 | .libs/ 41 | Makefile 42 | Phonetisaurus.la 43 | libtool 44 | phonetisaurus-align 45 | phonetisaurus-arpa2wfst 46 | phonetisaurus-g2pfst 47 | phonetisaurus-g2prnn 48 | python/.deps/ 49 | python/.dirstamp 50 | python/Phonetisaurus_la-Phonetisaurus-binding.lo 51 | rnnlm 52 | src/3rdparty/rnnlm/.deps/ 53 | src/3rdparty/rnnlm/.dirstamp 54 | src/bin/.deps/ 55 | src/bin/.dirstamp 56 | src/lib/.deps/ 57 | src/lib/.dirstamp 58 | src/lib/Phonetisaurus_la-util.lo 59 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | branches: 4 | only: 5 | - master 6 | 7 | sudo: required 8 | dist: trusty 9 | group: edge 10 | 11 | os: 12 | - linux 13 | 14 | compiler: 15 | - gcc 16 | 17 | env: 18 | - CROSSCOMPILE=native 19 | 20 | addons: 21 | apt: 22 | packages: 23 | - zlib1g-dev 24 | - wget 25 | - python-dev 26 | 27 | matrix: 28 | include: 29 | - os: linux 30 | compiler: gcc 31 | addons: 32 | apt: 33 | sources: ['ubuntu-toolchain-r-test'] 34 | packages: ['g++-4.9','zlib1g-dev','wget','python-dev', 'python-pip'] 35 | env: 36 | - CXXCOMPILER=g++-4.9 37 | 38 | before_install: 39 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update ; fi 40 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openfst ; fi 41 | - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then wget https://github.com/AdolfVonKleist/packages/raw/master/Ubuntu-14.04/debs/openfst_1.6.2_amd64-trusty.deb ; fi 42 | - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then sudo dpkg -i openfst_1.6.2_amd64-trusty.deb ; fi 43 | - if [ ! 
-z "$CXXCOMPILER" ]; then export CXX="$CXXCOMPILER"; fi 44 | 45 | install: 46 | - pip install --user pybindgen 47 | 48 | script: 49 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./configure --enable-python && make -j2 && ( ./phonetisaurus-g2pfst --help || echo ) && cd python && cp ../.libs/Phonetisaurus.so . && python setup.py build ; fi 50 | - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then ./configure --enable-python && make -j2 && ./phonetisaurus-g2pfst --help && cd python && cp ../.libs/Phonetisaurus.so . && python setup.py build ; fi 51 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 as build 2 | 3 | WORKDIR /build 4 | 5 | RUN apt-get -y update && apt-get -y install git g++ autoconf-archive make libtool gfortran tar gawk 6 | 7 | RUN wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.2.tar.gz && \ 8 | tar -xvzf openfst-1.6.2.tar.gz && \ 9 | cd openfst-1.6.2 && \ 10 | ./configure --enable-static --enable-shared --enable-far --enable-ngram-fsts && \ 11 | make -j $(nproc) && \ 12 | make install && \ 13 | ldconfig 14 | 15 | RUN git clone https://github.com/mitlm/mitlm && \ 16 | cd mitlm && \ 17 | autoreconf -i && \ 18 | ./configure && \ 19 | make -j $(nproc) && \ 20 | make install 21 | 22 | WORKDIR /build/phonetisaurus 23 | 24 | COPY . 
./ 25 | 26 | RUN pip3 install pybindgen 27 | 28 | RUN ./configure --enable-python && \ 29 | make -j $(nproc) && \ 30 | make install 31 | 32 | FROM python:3-slim 33 | 34 | RUN apt-get -y update && apt-get -y install gfortran && apt-get -y clean && apt-get -y autoclean 35 | 36 | WORKDIR /setup 37 | 38 | COPY --from=build /build/phonetisaurus/python ./ 39 | COPY --from=build /build/phonetisaurus/.libs/Phonetisaurus.so ./ 40 | 41 | RUN python setup.py install 42 | 43 | COPY --from=build /usr/local/lib/fst /usr/local/lib/fst 44 | COPY --from=build /usr/local/lib/libfst*so*0 /usr/local/lib/ 45 | COPY --from=build /usr/local/bin/phonetisaurus* /usr/local/bin/ 46 | COPY --from=build /build/phonetisaurus/src/scripts/* /usr/local/bin/ 47 | COPY --from=build /usr/local/bin/rnnlm /usr/local/bin/ 48 | COPY --from=build /usr/local/bin/estimate-ngram /usr/local/bin/ 49 | COPY --from=build /usr/local/lib/libmitlm.so.1.0.0 /usr/local/lib 50 | 51 | RUN ldconfig 52 | 53 | WORKDIR /work 54 | 55 | ENTRYPOINT [ "/bin/bash" , "-c" ] 56 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Josef Novak 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 
15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | AUTOMAKE_OPTIONS = subdir-objects 2 | ACLOCAL_AMFLAGS = -I .autoconf/m4 3 | 4 | EXTRA_DIST= \ 5 | $(srcdir)/LICENSE \ 6 | $(srcdir)/README.md \ 7 | $(srcdir)/docs/doxygen.cfg \ 8 | $(srcdir)/docs/mainpage.dox \ 9 | $(srcdir)/src/3rdparty/rnnlm/COPYRIGHT.txt \ 10 | $(srcdir)/src/3rdparty/utfcpp/utf8/checked.h \ 11 | $(srcdir)/src/3rdparty/utfcpp/utf8/core.h \ 12 | $(srcdir)/src/3rdparty/utfcpp/utf8/unchecked.h \ 13 | $(srcdir)/src/3rdparty/utfcpp/utf8.h \ 14 | $(srcdir)/src/configure \ 15 | $(srcdir)/src/lib/feature-reader.cc \ 16 | $(srcdir)/src/include/RnnLMPy.h \ 17 | $(srcdir)/python/phonetisaurus-module.py \ 18 | $(srcdir)/python/script/phoneticize.py \ 19 | $(srcdir)/python/script/demo.html \ 20 | $(srcdir)/python/script/words.list \ 21 | $(srcdir)/python/script/g2pserver.py \ 22 | $(srcdir)/python/phonetisaurus/__init__.py \ 23 | $(srcdir)/python/pybindgen/__init__.py \ 24 | $(srcdir)/python/pybindgen/cppclass_typehandlers.py \ 25 | $(srcdir)/python/pybindgen/cppexception.py \ 26 | $(srcdir)/python/pybindgen/cppmethod.py \ 27 | $(srcdir)/python/pybindgen/enum.py \ 28 | $(srcdir)/python/pybindgen/wscript \ 29 | $(srcdir)/python/pybindgen/cppclass.py \ 30 | $(srcdir)/python/pybindgen/cppclass_container.py \ 31 | $(srcdir)/python/pybindgen/settings.py \ 32 | $(srcdir)/python/pybindgen/function.py \ 33 | $(srcdir)/python/pybindgen/utils.py \ 34 | $(srcdir)/python/pybindgen/module.py \ 35 | $(srcdir)/python/pybindgen/typehandlers/__init__.py \ 36 | $(srcdir)/python/pybindgen/typehandlers/codesink.py \ 37 | $(srcdir)/python/pybindgen/typehandlers/ctypeparser/__init__.py \ 38 | $(srcdir)/python/pybindgen/typehandlers/ctypeparser/tokenizer.py \ 39 | $(srcdir)/python/pybindgen/typehandlers/stringtype.py \ 40 | $(srcdir)/python/pybindgen/typehandlers/pyobjecttype.py \ 41 | 
$(srcdir)/python/pybindgen/typehandlers/inttype.py \ 42 | $(srcdir)/python/pybindgen/typehandlers/doubletype.py \ 43 | $(srcdir)/python/pybindgen/typehandlers/voidtype.py \ 44 | $(srcdir)/python/pybindgen/typehandlers/floattype.py \ 45 | $(srcdir)/python/pybindgen/typehandlers/base.py \ 46 | $(srcdir)/python/pybindgen/typehandlers/booltype.py \ 47 | $(srcdir)/python/pybindgen/cppattribute.py \ 48 | $(srcdir)/python/pybindgen/gccxmlparser.py \ 49 | $(srcdir)/python/pybindgen/overloading.py \ 50 | $(srcdir)/python/pybindgen/pytypeobject.py \ 51 | $(srcdir)/python/pybindgen/container.py \ 52 | $(srcdir)/python/pybindgen/converter_functions.py \ 53 | $(srcdir)/python/pybindgen/version.py \ 54 | $(srcdir)/python/pybindgen/wrapper_registry.py \ 55 | $(srcdir)/python/setup.py \ 56 | $(srcdir)/test/g014b2b/g014b2b.ref \ 57 | $(srcdir)/test/g014b2b/g014b2b.train \ 58 | $(srcdir)/test/g014b2b/g014b2b.words \ 59 | $(srcdir)/test/check-nbest-wer.py 60 | 61 | 62 | 63 | dist_bin_SCRIPTS = \ 64 | src/scripts/phonetisaurus-apply \ 65 | src/scripts/phonetisaurus-train 66 | 67 | bin_PROGRAMS = \ 68 | phonetisaurus-align \ 69 | phonetisaurus-arpa2wfst \ 70 | phonetisaurus-g2pfst \ 71 | phonetisaurus-g2prnn \ 72 | rnnlm 73 | 74 | AM_CPPFLAGS = -I$(top_srcdir)/src -DGIT_REVISION=\"$(GIT_REVISION)\" -std=c++0x -Wall -Wno-sign-compare -Wno-unused-local-typedefs 75 | 76 | phonetisaurus_align_SOURCES = src/bin/phonetisaurus-align.cc src/include/PhonetisaurusRex.h src/lib/util.cc src/include/util.h src/lib/LatticePruner.cc src/include/LatticePruner.h src/lib/M2MFstAligner.cc src/include/M2MFstAligner.h 77 | phonetisaurus_align_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS) 78 | phonetisaurus_align_LDADD = $(OPENFST_LDFLAGS) 79 | 80 | phonetisaurus_arpa2wfst_SOURCES = src/bin/phonetisaurus-arpa2wfst.cc src/include/ARPA2WFST.h src/lib/util.cc src/include/util.h 81 | phonetisaurus_arpa2wfst_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS) 82 | phonetisaurus_arpa2wfst_LDADD = 
$(OPENFST_LDFLAGS) 83 | 84 | phonetisaurus_g2pfst_SOURCES = src/bin/phonetisaurus-g2pfst.cc src/include/PhonetisaurusScript.h src/include/PhonetisaurusRex.h src/lib/util.cc src/include/util.h 85 | phonetisaurus_g2pfst_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS) -funroll-loops -ffast-math 86 | phonetisaurus_g2pfst_LDADD = $(OPENFST_LDFLAGS) 87 | 88 | phonetisaurus_g2prnn_SOURCES = src/bin/phonetisaurus-g2prnn.cc src/include/LegacyRnnLMHash.h src/include/LegacyRnnLMDecodable.h src/include/LegacyRnnLMReader.h src/include/RnnLMDecoder.h src/lib/util.cc src/include/util.h src/3rdparty/rnnlm/rnnlmlib.cpp src/3rdparty/rnnlm/rnnlmlib.h 89 | phonetisaurus_g2prnn_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS) -I$(top_srcdir)/src/3rdparty/rnnlm -funroll-loops -ffast-math 90 | phonetisaurus_g2prnn_LDADD = $(OPENFST_LDFLAGS) 91 | if WANT_OPENMP 92 | if OPENMP 93 | phonetisaurus_g2prnn_CXXFLAGS += $(OPENMP_CXXFLAGS) 94 | phonetisaurus_g2prnn_LDADD += $(OPENMP_LDFLAGS) 95 | endif 96 | endif 97 | 98 | rnnlm_SOURCES = src/bin/rnnlm.cc src/3rdparty/rnnlm/rnnlmlib.cpp src/3rdparty/rnnlm/rnnlmlib.h 99 | rnnlm_CXXFLAGS = $(OPENFST_CXXFLAGS) -I$(top_srcdir)/src/3rdparty/rnnlm -funroll-loops -ffast-math 100 | rnnlm_LDADD = $(OPENFST_LDFLAGS) 101 | 102 | if WANT_PYTHON 103 | if HAVE_PYTHON 104 | 105 | python/Phonetisaurus-binding.cc: $(top_srcdir)/python/phonetisaurus-module.py 106 | mkdir -p python 107 | $(PYTHON) -B $(top_srcdir)/python/phonetisaurus-module.py > python/Phonetisaurus-binding.cc 108 | 109 | CLEANFILES = python/Phonetisaurus-binding.cc 110 | 111 | if HAVE_PYTHON_DEV 112 | pyexec_LTLIBRARIES = Phonetisaurus.la 113 | nodist_Phonetisaurus_la_SOURCES = python/Phonetisaurus-binding.cc src/lib/util.cc src/include/util.h 114 | Phonetisaurus_la_CXXFLAGS = $(OPENFST_CXXFLAGS) $(PYTHON_CPPFLAGS) $(UTFCPP_CXXFLAGS) -funroll-loops -ffast-math 115 | Phonetisaurus_la_LIBADD = $(OPENFST_LDFLAGS) $(PYTHON_LIBS) 116 | Phonetisaurus_la_LDFLAGS = -avoid-version -module 117 | endif 
118 | endif 119 | endif 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Phonetisaurus G2P ## 2 | [![Build Status](https://travis-ci.org/AdolfVonKleist/Phonetisaurus.svg?branch=master)](https://travis-ci.org/AdolfVonKleist/Phonetisaurus) 3 | 4 | This repository contains scripts suitable for training, evaluating and using grapheme-to-phoneme 5 | models for speech recognition using the OpenFst framework. The current build requires OpenFst 6 | version 1.6.0 or later, and the examples below use version 1.7.2. 7 | 8 | The repository includes C++ binaries suitable for training, compiling, and evaluating G2P models. 9 | It also includes some simple python bindings which may be used to extract individual 10 | multigram scores, alignments, and to dump the raw lattices in .fst format for each word. 11 | 12 | The python scripts and bindings were tested most recently with python v3.8.5. 13 | 14 | Standalone distributions related to previous INTERSPEECH papers, as well as the complete, exported 15 | final version of the old google-code repository are available via ```git-lfs``` in a separate 16 | repository: 17 | * https://github.com/AdolfVonKleist/phonetisaurus-downloads 18 | 19 | #### Contact: #### 20 | * phonetisaurus@gmail.com 21 | 22 | #### Scratch Build for OpenFst v1.7.2 and Ubuntu 20.04 #### 23 | This build was tested via AWS EC2 with a fresh Ubuntu 20.04 base, and an m4.large instance. 
24 | 25 | ``` 26 | $ sudo apt-get update 27 | # Basics 28 | $ sudo apt-get install git g++ autoconf-archive make libtool 29 | # Python bindings 30 | $ sudo apt-get install python-setuptools python-dev 31 | # mitlm (to build a quick play model) 32 | $ sudo apt-get install gfortran 33 | ``` 34 | 35 | Create a work directory of your choice: 36 | ``` 37 | $ mkdir g2p 38 | $ cd g2p/ 39 | ``` 40 | 41 | Next, grab and install OpenFst-1.7.2: 42 | ``` 43 | $ wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.7.2.tar.gz 44 | $ tar -xvzf openfst-1.7.2.tar.gz 45 | $ cd openfst-1.7.2 46 | # Minimal configure, compatible with current defaults for Kaldi 47 | $ ./configure --enable-static --enable-shared --enable-far --enable-ngram-fsts 48 | $ make -j 49 | # Now wait a while... 50 | $ sudo make install 51 | # Extend LD_LIBRARY_PATH in your .bashrc (assumes OpenFst installed to default location): 52 | $ echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib:/usr/local/lib/fst' \ 53 | >> ~/.bashrc 54 | $ source ~/.bashrc 55 | $ cd .. 56 | ``` 57 | 58 | Check out the latest Phonetisaurus from master and compile without bindings: 59 | ``` 60 | $ git clone https://github.com/AdolfVonKleist/Phonetisaurus.git 61 | $ cd Phonetisaurus 62 | # if OpenFst is installed in the default location: 63 | $ ./configure 64 | # if OpenFst is installed in a special location: 65 | $ ./configure \ 66 | --with-openfst-includes=${OFST_PATH}/openfst-1.7.2/include \ 67 | --with-openfst-libs=${OFST_PATH}/openfst-1.7.2/lib 68 | $ make 69 | $ sudo make install 70 | $ cd .. 
71 | ``` 72 | 73 | Checkout the latest Phonetisaurus from master and compile with python3 bindings: 74 | ``` 75 | $ git clone https://github.com/AdolfVonKleist/Phonetisaurus.git 76 | $ cd Phonetisaurus 77 | $ sudo pip3 install pybindgen 78 | # if OpenFst is installed in the default location: 79 | $ PYTHON=python3 ./configure --enable-python 80 | # if OpenFst is installed in a special location: 81 | $ PYTHON=python3 ./configure \ 82 | --with-openfst-includes=${OFST_PATH}/openfst-1.7.2/include \ 83 | --with-openfst-libs=${OFST_PATH}/openfst-1.7.2/lib \ 84 | --enable-python 85 | $ make 86 | $ sudo make install 87 | $ cd python 88 | $ cp ../.libs/Phonetisaurus.so . 89 | $ sudo python3 setup.py install 90 | $ cd ../.. 91 | ``` 92 | 93 | Grab and install mitlm to build a quick test model with the cmudict (5m): 94 | ``` 95 | $ git clone https://github.com/mitlm/mitlm.git 96 | $ cd mitlm/ 97 | $ ./autogen.sh 98 | $ make 99 | $ sudo make install 100 | $ cd .. 101 | ``` 102 | 103 | Grab a copy of the latest version of CMUdict and clean it up a bit: 104 | ``` 105 | $ mkdir example 106 | $ cd example 107 | $ wget https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict 108 | # Clean it up a bit and reformat: 109 | $ cat cmudict.dict \ 110 | | perl -pe 's/\([0-9]+\)//; 111 | s/\s+/ /g; s/^\s+//; 112 | s/\s+$//; @_ = split (/\s+/); 113 | $w = shift (@_); 114 | $_ = $w."\t".join (" ", @_)."\n";' \ 115 | > cmudict.formatted.dict 116 | ``` 117 | 118 | Train a complete model with default parameters using the wrapper script. 119 | NOTE: this assumes the tool was compiled with the python3 bindings: 120 | ``` 121 | $ phonetisaurus-train --lexicon cmudict.formatted.dict --seq2_del 122 | INFO:phonetisaurus-train:2017-07-09 16:35:31: Checking command configuration... 123 | INFO:phonetisaurus-train:2017-07-09 16:35:31: Checking lexicon for reserved characters: '}', '|', '_'... 124 | INFO:phonetisaurus-train:2017-07-09 16:35:31: Aligning lexicon... 
125 | INFO:phonetisaurus-train:2017-07-09 16:37:44: Training joint ngram model... 126 | INFO:phonetisaurus-train:2017-07-09 16:37:46: Converting ARPA format joint n-gram model to WFST format... 127 | INFO:phonetisaurus-train:2017-07-09 16:37:59: G2P training succeeded: train/model.fst 128 | ``` 129 | 130 | Generate pronunciations for a word list using the wrapper script: 131 | ``` 132 | $ phonetisaurus-apply --model train/model.fst --word_list test.wlist 133 | test T EH1 S T 134 | jumbotron JH AH1 M B OW0 T R AA0 N 135 | excellent EH1 K S AH0 L AH0 N T 136 | eggselent EH1 G S L AH0 N T 137 | ``` 138 | 139 | Generate pronunciations for a word list using the wrapper script. 140 | Filter against a reference lexicon, add n-best, and run in verbose mode, 141 | and generate : 142 | ``` 143 | $ phonetisaurus-apply --model train/model.fst --word_list test.wlist -n 2 -g -v -l cmudict.formatted.dict 144 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: Checking command configuration... 145 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: beam: 10000 146 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: greedy: True 147 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: lexicon_file: cmudict.formatted.dict 148 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: model: train/model.fst 149 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: nbest: 2 150 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: thresh: 99.0 151 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: verbose: True 152 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: Loading lexicon from file... 153 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22: Applying G2P model... 
154 | GitRevision: kaldi-1-g5028ba-dirty 155 | eggselent 26.85 EH1 G S L AH0 N T 156 | eggselent 28.12 EH1 G Z L AH0 N T 157 | excellent 0.00 EH1 K S AH0 L AH0 N T 158 | excellent 19.28 EH1 K S L EH1 N T 159 | jumbotron 0.00 JH AH1 M B OW0 T R AA0 N 160 | jumbotron 17.30 JH AH1 M B OW0 T R AA2 N 161 | test 0.00 T EH1 S T 162 | test 11.56 T EH2 S T 163 | ``` 164 | 165 | Generate pronunciations using the alternative % of total probability mass constraint, 166 | and print the resulting scores as human readable, normalized probabilities rather than 167 | raw negative log scores: 168 | ``` 169 | phonetisaurus-apply --model train/model.fst --word_list Phonetisaurus/script/words.list -v -a -p 0.85 -pr 170 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: Checking command configuration... 171 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: accumulate: True 172 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: beam: 10000 173 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: greedy: False 174 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: lexicon_file: None 175 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: logger: 176 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: model: train/model.fst 177 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: nbest: 100 178 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: pmass: 0.85 179 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: probs: True 180 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: thresh: 99.0 181 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: verbose: True 182 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: phonetisaurus-g2pfst --model=train/model.fst --nbest=100 --beam=10000 --thresh=99.0 --accumulate=true --pmass=0.85 --nlog_probs=false --wordlist=Phonetisaurus/script/words.list 183 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58: Applying G2P model... 
184 | GitRevision: kaldi-2-g6e7c04-dirty 185 | test 0.68 T EH1 S T 186 | test 0.21 T EH2 S T 187 | right 0.81 R AY1 T 188 | right 0.13 R AY0 T 189 | junkify 0.64 JH AH1 NG K AH0 F AY2 190 | junkify 0.23 JH AH1 NG K IH0 F AY2 191 | ``` 192 | 193 | Align, estimate, and convert a joint n-gram model step-by-step: 194 | ``` 195 | # Align the dictionary (5m-10m) 196 | $ phonetisaurus-align --input=cmudict.formatted.dict \ 197 | --ofile=cmudict.formatted.corpus --seq1_del=false 198 | # Train an n-gram model (5s-10s): 199 | $ estimate-ngram -o 8 -t cmudict.formatted.corpus \ 200 | -wl cmudict.o8.arpa 201 | # Convert to OpenFst format (10s-20s): 202 | $ phonetisaurus-arpa2wfst --lm=cmudict.o8.arpa --ofile=cmudict.o8.fst 203 | $ cd 204 | ``` 205 | 206 | Test the manual model with the wrapper script: 207 | ``` 208 | $ cd Phonetisaurus/script 209 | $ ./phoneticize.py -m ~/example/cmudict.o8.fst -w testing 210 | 11.24 T EH1 S T IH0 NG 211 | ------- 212 | t:T:3.31 213 | e:EH1:2.26 214 | s:S:2.61 215 | t:T:0.21 216 | i:IH0:2.66 217 | n|g:NG:0.16 218 | ::0.01 219 | ``` 220 | 221 | Test the G2P servlet [requires compilation of bindings and module install]: 222 | ``` 223 | $ nohup script/g2pserver.py -m ~/train/model.fst -l ~/cmudict.formatted.dict & 224 | $ curl -s -F "wordlist=@words.list" http://localhost:8080/phoneticize/list 225 | test T EH1 S T 226 | right R AY1 T 227 | junkify JH AH1 NG K AH0 F AY2 228 | junkify JH AH1 NG K IH0 F AY2 229 | ``` 230 | 231 | Use a special location for OpenFst, parallel build with 2 cores 232 | ``` 233 | $ ./configure --with-openfst-libs=/home/ubuntu/openfst-1.6.2/lib \ 234 | --with-openfst-includes=/home/ubuntu/openfst-1.6.2/include 235 | $ make -j 2 all 236 | ``` 237 | 238 | Use custom g++ under OSX (Note: OpenFst must also be compiled with this 239 | custom g++ alternative [untested with v1.6.2]) 240 | ``` 241 | $ ./configure --with-openfst-libs=/home/osx/openfst-1.6.2gcc/lib \ 242 | --with-openfst-includes=/home/osx/openfst-1.6.2gcc/include \ 
243 | CXX=g++-4.9 244 | $ make -j 2 all 245 | ``` 246 | 247 | #### Rebuild configure #### 248 | If you need to rebuild the configure script, you can do so with: 249 | ``` 250 | $ autoreconf -i 251 | ``` 252 | 253 | ### Install [Linux]: ### 254 | ``` 255 | $ sudo make install 256 | ``` 257 | 258 | ### Uninstall [Linux]: ### 259 | ``` 260 | $ sudo make uninstall 261 | ``` 262 | 263 | ### Usage: ### 264 | #### phonetisaurus-align #### 265 | ``` 266 | $ bin/phonetisaurus-align --help 267 | ``` 268 | #### phonetisaurus-arpa2wfst #### 269 | ``` 270 | $ bin/phonetisaurus-arpa2wfst --help 271 | ``` 272 | #### phonetisaurus-g2prnn #### 273 | ``` 274 | $ bin/phonetisaurus-g2prnn --help 275 | ``` 276 | #### phonetisaurus-g2pfst #### 277 | ``` 278 | $ bin/phonetisaurus-g2pfst --help 279 | ``` 280 | 281 | ### Docker: ### 282 | 283 | Docker images are hosted on: https://hub.docker.com/r/phonetisaurus/phonetisaurus 284 | 285 | The images can be used in one of 3 ways: 286 | 287 | * directly, to process files on your computer without needing to install/compile anything (apart from Docker) 288 | * as a base image for another project (using the `FROM` statement) 289 | * to copy portions of the binaries or libraries into a new image (using the `COPY --from=` statement) - most of the files are in `/usr/local/bin` and `/usr/local/lib` 290 | 291 | To use the program directly, you need to mount the local folder with the required files (e.g. models, word lists, etc.) into the Docker container under the `/work` path, as this is the default workdir in the image. Then you can call the programs directly after the name of the image, for example: 292 | ``` 293 | docker run --rm -it -v $PWD:/work phonetisaurus/phonetisaurus "phonetisaurus-apply -m model.fst -wl test.wlist" 294 | ``` 295 | 296 | You can also use the `bash` program to simply enter the interactive shell and run everything from there. 
297 | 298 | ### Misc: ### 299 | cpplint command: 300 | ``` 301 | $ ./cpplint.py --filter=-whitespace/parens,-whitespace/braces,\ 302 | -legal/copyright,-build/namespaces,-runtime/references\ 303 | src/include/util.h 304 | ``` 305 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | # -*- Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | AC_PREREQ([2.69]) 4 | AC_INIT([phonetisaurus], [0.8.1], [phonetisaurus@gmail.com]) 5 | AC_CONFIG_MACRO_DIR([.autoconf/m4]) 6 | AC_CONFIG_AUX_DIR([.autoconf]) 7 | AC_CONFIG_SRCDIR(configure.ac) 8 | 9 | GIT_REVISION=`git describe --abbrev=6 --dirty --always --tags 2>/dev/null || echo package` 10 | AC_SUBST([GIT_REVISION]) 11 | 12 | AM_INIT_AUTOMAKE([foreign -Wall]) 13 | AM_MAINTAINER_MODE 14 | 15 | AM_PROG_AR 16 | LT_INIT 17 | 18 | AC_ARG_ENABLE(python, 19 | AS_HELP_STRING([--enable-python], [Compile with Python support]), 20 | [case "${enableval}" in 21 | yes) enable_python=true ;; 22 | no) enable_python=false ;; 23 | *) AC_MSG_ERROR(bad value ${enableval} for --enable-python) ;; 24 | esac], [enable_python=false]) 25 | AM_CONDITIONAL(WANT_PYTHON, test x"$enable_python" = xtrue) 26 | 27 | AS_IF([test x"$enable_python" = xtrue], [ 28 | AM_PATH_PYTHON([2.7],, [:]) 29 | ]) 30 | AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :]) 31 | AC_SUBST([PYTHON]) 32 | if test x"$PYTHON" != x && test "$PYTHON" != ":"; then 33 | PYTHON_CPPFLAGS= 34 | PYTHON_LIBS= 35 | AX_PYTHON_DEVEL([>= '$PYTHON_VERSION']) 36 | fi 37 | AM_CONDITIONAL([HAVE_PYTHON_DEV], [test x"$PYTHON" != x && test "$PYTHON" != :]) 38 | AC_SUBST([PYTHON_CPPFLAGS]) 39 | AC_SUBST([PYTHON_LIBS]) 40 | 41 | AC_LANG(C++) 42 | # Checks for programs. 43 | AC_PROG_CXX 44 | AX_CXX_COMPILE_STDCXX_11([], [mandatory]) 45 | #Python stuff not currently needed. Will return. 
46 | #AX_PYTHON_DEVEL([>= '2.7']) 47 | AC_PROG_CC 48 | AC_PROG_CPP 49 | #AC_PROG_INSTALL 50 | AC_PROG_LN_S 51 | 52 | # Checks for libraries. 53 | AC_CHECK_LIB([c], [getgid],[AC_DEFINE([HAVE_GETGID],[1],[libc includes getgid])]) 54 | AC_CHECK_LIB([dl], [dlopen], [], [echo "dl library not found. Weird."; exit -1]) 55 | AC_CHECK_LIB([m], [cos], [], [echo "m library not found. Please install m library before proceeding"; exit -1]) 56 | AC_CHECK_LIB([pthread], [pthread_mutex_init], [], [echo "pthreads not found. Please install pthread library before proceeding"; exit -1]) 57 | 58 | # Checks for header files. 59 | AC_CHECK_HEADERS([stddef.h stdlib.h string.h]) 60 | 61 | # Checks for typedefs, structures, and compiler characteristics. 62 | AC_CHECK_HEADER_STDBOOL 63 | AC_C_INLINE 64 | AC_TYPE_SIZE_T 65 | AC_TYPE_SSIZE_T 66 | AC_CHECK_TYPES([ptrdiff_t]) 67 | 68 | # Checks for library functions. 69 | AC_FUNC_STRTOD 70 | AC_CHECK_FUNCS([memmove strchr strrchr strspn]) 71 | AC_CHECK_MATH_FUNC(exp10) 72 | 73 | AC_ARG_WITH([openfst-includes], 74 | [AS_HELP_STRING([--with-openfst-includes], 75 | [Location of the OpenFst headers.])], 76 | [user_openfst_headers_path="$withval"], 77 | []) 78 | AC_ARG_WITH([openfst-libs], 79 | [AS_HELP_STRING([--with-openfst-libs], 80 | [Location of the OpenFst shared libraries.])], 81 | [user_openfst_libs_path="$withval"], 82 | []) 83 | 84 | saved_cppflags="${CPPFLAGS}" 85 | if [[ "$user_openfst_headers_path" != "" ]]; then 86 | CPPFLAGS="-I$user_openfst_headers_path" 87 | AC_CHECK_HEADERS([fst/fst.h], [openfst_headers_found=1], [openfst_headers_found=0], []) 88 | if [[ $openfst_headers_found -eq 1 ]]; then 89 | OPENFST_CXXFLAGS="$CPPFLAGS" 90 | fi 91 | else 92 | CPPFLAGS="" 93 | AC_CHECK_HEADERS([fst/fst.h], [openfst_headers_found=1], [openfst_headers_found=0], []) 94 | if [[ $openfst_headers_found -eq 1 ]]; then 95 | OPENFST_CXXFLAGS="$CPPFLAGS" 96 | else 97 | CPPFLAGS="-I/usr/local/include" 98 | AC_CHECK_HEADERS([fst/fst.h], 
[openfst_headers_found=1], [openfst_headers_found=0], []) 99 | if [[ $openfst_headers_found -eq 1 ]]; then 100 | OPENFST_CXXFLAGS="$CPPFLAGS" 101 | fi 102 | fi 103 | fi 104 | CPPFLAGS="${saved_cppflags}" 105 | 106 | AC_MSG_CHECKING([for openfst libraries]) 107 | CHECK_LIBRARIES="-lfst -lfstfar -lfstngram" 108 | saved_ldflags="${LDFLAGS}" 109 | saved_cppflags="${CPPFLAGS}" 110 | CPPFLAGS="${OPENFST_CXXFLAGS}" 111 | if test x"$user_openfst_libs_path" != x; then 112 | LDFLAGS="-L$user_openfst_libs_path ${CHECK_LIBRARIES}" 113 | AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [std::unique_ptr ifst(fst::FstHeader());])], 114 | [have_openfst=true], 115 | [have_openfst=false]) 116 | if test x"$have_openfst" = xtrue; then 117 | OPENFST_LDFLAGS="$LDFLAGS" 118 | AC_MSG_RESULT([$OPENFST_LDFLAGS]) 119 | fi 120 | else 121 | LDFLAGS="${CHECK_LIBRARIES}" 122 | AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [std::unique_ptr ifst(fst::FstHeader());])], 123 | [have_openfst=true], 124 | [have_openfst=false]) 125 | if test x"$have_openfst" = xtrue; then 126 | OPENFST_LDFLAGS="$LDFLAGS" 127 | AC_MSG_RESULT([$OPENFST_LDFLAGS]) 128 | else 129 | LDFLAGS="-L/usr/local/lib ${CHECK_LIBRARIES}" 130 | AC_LINK_IFELSE([AC_LANG_PROGRAM([#include ], [std::unique_ptr ifst(fst::FstHeader());])], 131 | [have_openfst=1], 132 | [have_openfst=0]) 133 | if test x"$have_openfst" = xtrue; then 134 | OPENFST_LDFLAGS="$LDFLAGS" 135 | AC_MSG_RESULT([$OPENFST_LDFLAGS]) 136 | else 137 | AC_MSG_RESULT([no]) 138 | fi 139 | fi 140 | fi 141 | LDFLAGS="${saved_ldflags}" 142 | CPPFLAGS="${saved_cppflags}" 143 | 144 | if test x"$have_openfst" != xtrue; then 145 | AC_MSG_ERROR([Can't find OpenFST or one or more of its extensions. Use --with-openfst-includes and --with-openfst-libs to specify where you have installed OpenFst. 
OpenFst should have been configured with the following flags: --enable-static --enable-shared --enable-far --enable-ngram-fsts]) 146 | fi 147 | AC_SUBST([OPENFST_CXXFLAGS]) 148 | AC_SUBST([OPENFST_LDFLAGS]) 149 | AM_CONDITIONAL(OPENFST, [test x"$have_openfst" = xtrue]) 150 | 151 | AX_OPENMP([AC_DEFINE(HAVE_OPENMP,1,[Define if OpenMP is enabled])] [have_openmp=true]) 152 | AM_CONDITIONAL(OPENMP, [test x"$have_openmp" = xtrue]) 153 | AC_SUBST([OPENMP_CXXFLAGS]) 154 | AC_SUBST([OPENMP_LDFLAGS]) 155 | AC_ARG_ENABLE(openmp, 156 | AS_HELP_STRING([--enable-openmp], [Compile with OpenMP support]), 157 | [case "${enableval}" in 158 | yes) enable_openmp=true ;; 159 | no) enable_openmp=false ;; 160 | *) AC_MSG_ERROR(bad value ${enableval} for --enable-openmp) ;; 161 | esac], [enable_openmp=false]) 162 | AM_CONDITIONAL(WANT_OPENMP, test x"$enable_openmp" = xtrue) 163 | 164 | saved_cppflags="${CPPFLAGS}" 165 | CPPFLAGS= 166 | AC_CHECK_HEADERS([utf8.h], [utfcpp_headers_found=1], [utfcpp_headers_found=0], []) 167 | if [[ $utfcpp_headers_found -eq 1 ]]; then 168 | UTFCPP_CXXFLAGS="$CPPFLAGS" 169 | else 170 | UTFCPP_CXXFLAGS="-I \${top_srcdir}/src/3rdparty/utfcpp" 171 | fi 172 | CPPFLAGS="${saved_cppflags}" 173 | AC_SUBST([UTFCPP_CXXFLAGS]) 174 | AM_CONDITIONAL(UTFCPP, [test x"$utfcpp_headers_found" = x1]) 175 | 176 | 177 | AC_CONFIG_FILES([Makefile]) 178 | AC_OUTPUT 179 | -------------------------------------------------------------------------------- /docs/mainpage.dox: -------------------------------------------------------------------------------- 1 | /** 2 | @brief Documentation file for Phonetisaurus G2P project 3 | @author Josef R. Novak 4 | @file 5 | */ 6 | /** @defgroup Phonetisaurus Sources */ 7 | /** 8 | @mainpage Phonetisaurus G2P - WFST-based Grapheme-to-Phoneme conversion. 
9 | 10 | GitHub project page: Phonetisaurus 11 | */ 12 | -------------------------------------------------------------------------------- /python/phonetisaurus-module.py: -------------------------------------------------------------------------------- 1 | import pybindgen 2 | from pybindgen import param, retval 3 | import sys 4 | 5 | mod = pybindgen.Module ('Phonetisaurus') 6 | ################################################ 7 | #PhonetisaurusOmega decoder wrapper 8 | mod.add_include ('"include/PhonetisaurusScript.h"') 9 | 10 | #Build up the basic bits for the PathData return object 11 | mod.add_container ('std::vector', 'int', 'vector') 12 | mod.add_container ('std::vector', 'float', 'vector') 13 | 14 | #Register the PathDataPy struct 15 | struct = mod.add_struct('PathData') 16 | struct.add_constructor([]) 17 | struct.add_instance_attribute ('PathWeight', 'float') 18 | struct.add_instance_attribute ('PathWeights', 'std::vector') 19 | struct.add_instance_attribute ('ILabels', 'std::vector') 20 | struct.add_instance_attribute ('OLabels', 'std::vector') 21 | struct.add_instance_attribute ('Uniques', 'std::vector') 22 | 23 | #Register the vector container 24 | mod.add_container ('std::vector', 'PathData', 'vector' ) 25 | 26 | g2pklass = mod.add_class ('PhonetisaurusScript') 27 | std_exception = mod.add_exception ('exception', 28 | foreign_cpp_namespace='std', 29 | message_rvalue='%(EXC)s.what()') 30 | 31 | g2pklass.add_constructor ([param ('std::string', 'model')], 32 | throw=[std_exception]) 33 | 34 | g2pklass.add_method ('Phoneticize', retval ('std::vector'), 35 | [ param ('std::string', 'word'), 36 | param ('int', 'nbest'), 37 | param ('int', 'beam'), 38 | param ('float', 'threshold'), 39 | param ('bool', 'write_fsts'), 40 | param ('bool', 'accumulate'), 41 | param ('float', 'pmass') 42 | ] 43 | ) 44 | 45 | # Helper methods for the symbol lookup 46 | g2pklass.add_method ('FindIsym', retval ('std::string'), 47 | [param ('int', 'symbol_id')]) 48 | 
g2pklass.add_method ('FindIsym', retval('int'), 49 | [param ('std::string', 'symbol')]) 50 | g2pklass.add_method ('FindOsym', retval('std::string'), 51 | [param ('int', 'symbol_id')]) 52 | g2pklass.add_method ('FindOsym', retval('int'), 53 | [param ('std::string', 'symbol')]) 54 | 55 | 56 | 57 | mod.generate (sys.stdout) 58 | -------------------------------------------------------------------------------- /python/phonetisaurus/__init__.py: -------------------------------------------------------------------------------- 1 | from Phonetisaurus import PhonetisaurusScript as Phonetisaurus 2 | -------------------------------------------------------------------------------- /python/script/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | G2P 6 | 7 | 8 | 10 | 12 | 15 | 16 | 26 | 27 | 28 |
29 |
30 |

31 | Grapheme-to-phoneme conversion 32 |

33 |

Using the Phonetisaurus toolkit

34 |
35 |
36 | 37 |
38 |

Enter a list of words (one per line) below and click "Convert":

39 | 40 | 41 | 42 |
43 | 44 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /python/script/g2pserver.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os, re, phonetisaurus, json 3 | from bottle import route, run, template, request, response 4 | from itertools import izip 5 | from collections import namedtuple, defaultdict 6 | 7 | #Globals, oh no! 8 | _g2pmodel = None 9 | _lexicon = defaultdict (list) 10 | 11 | 12 | ############################### 13 | # Utilities 14 | def _phoneticize (model, args) : 15 | """ 16 | Python wrapper function for g2p. 17 | """ 18 | 19 | results = model.Phoneticize ( 20 | args.token.encode ("utf8"), 21 | args.nbest, 22 | args.beam, 23 | args.thresh, 24 | args.write_fsts, 25 | args.accumulate, 26 | args.pmass 27 | ) 28 | 29 | pronunciations = [] 30 | for result in results : 31 | pronunciation = [model.FindOsym (u) for u in result.Uniques] 32 | yield u"{0}".format (u" ".join (pronunciation)) 33 | 34 | def _loadLexicon (lexiconfile) : 35 | with open (lexiconfile, "r") as ifp : 36 | for entry in ifp : 37 | word, pron = re.split (ur"\t", entry.decode ("utf8").strip ()) 38 | _lexicon [word].append (pron) 39 | return 40 | 41 | def _defaultArgs (userargs) : 42 | args = namedtuple ('args', [ 43 | 'token', 'nbest', 'beam', 'thresh', 'write_fsts', 44 | 'accumulate', 'pmass' 45 | ]) 46 | 47 | args.token = "" 48 | args.nbest = int (userargs.get ("nbest", 2)) 49 | args.beam = int (userargs.get ("beam", 500)) 50 | args.thresh = float (userargs.get ("thresh", 10.)) 51 | args.pmass = float (userargs.get ("pmass", 0.0)) 52 | args.write_fsts = False 53 | args.accumulate = userargs.get ( 54 | "accumulate", 55 | False 56 | ) 57 | return args 58 | ############################### 59 | 60 | 61 | 62 | @route ('/phoneticize/list', method="POST") 63 | def PhoneticizeList () : 64 | """Phoneticize a list of words. 65 | 66 | Phoneticize a list of words. 
This will do a simple lookup for 67 | the word in the reference lexicon, and backoff to the G2P server 68 | in the event that it finds no entry. 69 | """ 70 | default_args = _defaultArgs (request.forms) 71 | 72 | wlist = request.files.get ("wordlist") 73 | 74 | words = re.split (ur"\n", wlist.file.read ().decode ("utf8")) 75 | 76 | lexicon = [] 77 | for word in words : 78 | if re.match (ur"^\s*$", word) or u"<" in word or u"[" in word : 79 | continue 80 | 81 | default_args.token = word.lower () 82 | if default_args.token in _lexicon : 83 | for pronunciation in _lexicon [default_args.token] : 84 | lexicon.append (u"{0}\t{1}".format (word, pronunciation)) 85 | else : 86 | for pronunciation in _phoneticize (_g2pmodel, default_args) : 87 | lexicon.append (u"{0}\t{1}".format (word, pronunciation)) 88 | 89 | response.set_header('Access-Control-Allow-Origin', '*') 90 | 91 | return u"\n".join (lexicon).encode ("utf8") 92 | 93 | 94 | 95 | if __name__ == '__main__': 96 | import sys, argparse 97 | 98 | example = "{0} --host localhost --port 8080"\ 99 | "--model g2p.fst --lexicon ref.lexicon" 100 | example = example.format (sys.argv [0]) 101 | parser = argparse.ArgumentParser (description=example) 102 | parser.add_argument ("--host", "-hs", help="IP to host the service on.", 103 | default="localhost") 104 | parser.add_argument ("--port", "-p", help="Port to use for hosting.", 105 | default=8080, type=int) 106 | parser.add_argument ("--model", "-m", help="Phonetisaurus G2P model.", 107 | required=True) 108 | parser.add_argument ("--lexicon", "-l", help="Reference lexicon.", 109 | required=True) 110 | parser.add_argument ("--verbose", "-v", help="Verbose mode.", 111 | default=False, action="store_true") 112 | args = parser.parse_args () 113 | 114 | if args.verbose : 115 | for key,val in args.__dict__.iteritems () : 116 | print >> sys.stderr, "{0}:\t{1}".format (key, val) 117 | 118 | _g2pmodel = phonetisaurus.Phonetisaurus (args.model) 119 | _loadLexicon (args.lexicon) 120 | 121 
| run (host=args.host, port=args.port, debug=False) 122 | -------------------------------------------------------------------------------- /python/script/phoneticize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import phonetisaurus 3 | from itertools import izip 4 | 5 | def Phoneticize (model, args) : 6 | """Python wrapper function for g2p bindings. 7 | 8 | Python wrapper function for g2p bindings. Most basic possible example. 9 | Intended as a template for doing something more useful. 10 | 11 | Args: 12 | model (str): The g2p fst model to load. 13 | args (obj): The argparse object with user specified options. 14 | """ 15 | 16 | results = model.Phoneticize ( 17 | args.token, 18 | args.nbest, 19 | args.beam, 20 | args.thresh, 21 | args.write_fsts, 22 | args.accumulate, 23 | args.pmass 24 | ) 25 | 26 | for result in results : 27 | uniques = [model.FindOsym (u) for u in result.Uniques] 28 | print ("{0:0.2f}\t{1}".format (result.PathWeight, " ".join (uniques))) 29 | print ("-------") 30 | 31 | #Should always be equal length 32 | for ilab, olab, weight in izip (result.ILabels, 33 | result.OLabels, 34 | result.PathWeights) : 35 | print ("{0}:{1}:{2:0.2f}".format ( 36 | model.FindIsym (ilab), 37 | model.FindOsym (olab), 38 | weight 39 | )) 40 | 41 | return 42 | 43 | 44 | if __name__ == "__main__" : 45 | import argparse, sys 46 | 47 | example = "{0} --model model.fst --word \"test\"".format (sys.argv [0]) 48 | parser = argparse.ArgumentParser (description=example) 49 | parser.add_argument ("--model", "-m", help="Phonetisaurus G2P model.", 50 | required=True) 51 | group = parser.add_mutually_exclusive_group (required=True) 52 | group.add_argument ("--word", "-w", help="Input word in lower case.") 53 | group.add_argument ("--wlist", "-wl", help="Provide a wordlist.") 54 | parser.add_argument ("--nbest", "-n", help="NBest", 55 | default=1, type=int) 56 | parser.add_argument ("--beam", "-b", help="Search 
beam", 57 | default=500, type=int) 58 | parser.add_argument ("--thresh", "-t", help="NBest threshold.", 59 | default=10., type=float) 60 | parser.add_argument ("--write_fsts", "-wf", help="Write decoded fsts " 61 | "to disk", default=False, action="store_true") 62 | parser.add_argument ("--accumulate", "-a", help="Accumulate probs across " 63 | "unique pronunciations.", default=False, 64 | action="store_true") 65 | parser.add_argument ("--pmass", "-p", help="Target probability mass.", 66 | default=0.0, type=float) 67 | parser.add_argument ("--verbose", "-v", help="Verbose mode.", 68 | default=False, action="store_true") 69 | args = parser.parse_args () 70 | 71 | if args.verbose : 72 | for key,val in args.__dict__.iteritems () : 73 | print ("{0}: {1}".format (key, val)) 74 | 75 | model = phonetisaurus.Phonetisaurus (args.model) 76 | 77 | if args.word : 78 | args.token = args.word 79 | Phoneticize (model, args) 80 | 81 | else : 82 | with open (args.wlist, "r") as ifp : 83 | for word in ifp : 84 | word = word.decode ("utf8").strip () 85 | args.token = word 86 | Phoneticize (model, args) 87 | print "-----------------------" 88 | print "" 89 | -------------------------------------------------------------------------------- /python/script/words.list: -------------------------------------------------------------------------------- 1 | test 2 | right 3 | junkify -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from setuptools import setup, find_packages 3 | import glob 4 | #Install phonetisaurus 5 | setup ( 6 | name = 'phonetisaurus', 7 | version = '0.3', 8 | description = 'Phonetisaurus G2P python package (OpenFst-1.6.x)', 9 | url = 'http://code.google.com/p/phonetisaurus', 10 | author = 'Josef Novak', 11 | author_email = 'josef.robert.novak@gmail.com', 12 | license = 'BSD', 13 | packages=find_packages(), 14 | 
data_files = [ 15 | ('.', ['Phonetisaurus.so']) 16 | ], 17 | include_package_data = True, 18 | install_requires = ["argparse", "bottle"], 19 | zip_safe = False 20 | ) 21 | -------------------------------------------------------------------------------- /src/3rdparty/rnnlm/COPYRIGHT.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2012 Tomas Mikolov 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | 3. Neither name of copyright holders nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR 24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 25 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 26 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /src/3rdparty/rnnlm/rnnlmlib.h: -------------------------------------------------------------------------------- 1 | /////////////////////////////////////////////////////////////////////// 2 | // 3 | // Recurrent neural network based statistical language modeling toolkit 4 | // Version 0.3e 5 | // (c) 2010-2012 Tomas Mikolov (tmikolov@gmail.com) 6 | // 7 | // 2014-04-13 - Josef Robert Novak 8 | // Removed some protections to give bindings access! 9 | /////////////////////////////////////////////////////////////////////// 10 | 11 | #ifndef _RNNLMLIB_H_ 12 | #define _RNNLMLIB_H_ 13 | 14 | #define MAX_STRING 100 15 | #ifndef HAVE_EXP10 16 | #define exp10(n) pow((double)10,(4-n)) 17 | #endif 18 | 19 | //#include 20 | //#include 21 | #include 22 | #include 23 | //#include "util.h" 24 | //using namespace fst; 25 | 26 | const int MAX_NGRAM_ORDER=20; 27 | typedef double real; // doubles for NN weights 28 | typedef double direct_t; // doubles for ME weights; TODO: check why floats are not enough for RNNME (convergence problems) 29 | //typedef std::tr1::unordered_map > JointMap; 30 | //typedef std::tr1::unordered_map TokenMap; 31 | 32 | struct neuron { 33 | real ac; //actual value stored in neuron 34 | real er; //error value in neuron, used by learning algorithm 35 | }; 36 | 37 | struct synapse { 38 | real weight; //weight of synapse 39 | }; 40 | 41 | struct vocab_word { 42 | int cn; 43 | char word[MAX_STRING]; 44 | 45 | real prob; 46 | int class_index; 47 | }; 48 | 49 | /* 50 | struct RNNToken { 51 | RNNToken* parent; 52 | struct neuron* neu; 53 | int history[MAX_NGRAM_ORDER]; 54 | vector bptt_history; 55 | }; 56 | */ 57 | //typedef std::tr1::unordered_map NeuTokenMap; 58 | 59 | const unsigned int PRIMES [] = { 60 | 108641969, 116049371, 125925907, 133333309, 61 | 145678979, 175308587, 197530793, 234567803, 62 | 251851741, 264197411, 330864029, 399999781, 63 | 407407183, 459258997, 479012069, 
545678687, 64 | 560493491, 607407037, 629629243, 656789717, 65 | 716048933, 718518067, 725925469, 733332871, 66 | 753085943, 755555077, 782715551, 790122953, 67 | 812345159, 814814293, 893826581, 923456189, 68 | 940740127, 953085797, 985184539, 990122807 69 | }; 70 | 71 | const unsigned int PRIMES_SIZE=sizeof(PRIMES)/sizeof(PRIMES[0]); 72 | 73 | enum FileTypeEnum {TEXT, BINARY, COMPRESSED}; //COMPRESSED not yet implemented 74 | 75 | class CRnnLM { 76 | public: 77 | char train_file[MAX_STRING]; 78 | char valid_file[MAX_STRING]; 79 | char test_file[MAX_STRING]; 80 | char rnnlm_file[MAX_STRING]; 81 | char lmprob_file[MAX_STRING]; 82 | bool joint; 83 | //JointMap joint_map; 84 | //TokenMap token_map; 85 | //NeuTokenMap NeuMap; 86 | 87 | int rand_seed; 88 | 89 | int debug_mode; 90 | 91 | int version; 92 | int filetype; 93 | 94 | int use_lmprob; 95 | real lambda; 96 | real gradient_cutoff; 97 | 98 | real dynamic; 99 | 100 | real alpha; 101 | real starting_alpha; 102 | int alpha_divide; 103 | double logp, llogp; 104 | float min_improvement; 105 | int iter; 106 | int vocab_max_size; 107 | int vocab_size; 108 | int train_words; 109 | int train_cur_pos; 110 | int counter; 111 | 112 | int one_iter; 113 | int anti_k; 114 | 115 | real beta; 116 | 117 | int class_size; 118 | int **class_words; 119 | int *class_cn; 120 | int *class_max_cn; 121 | int old_classes; 122 | 123 | struct vocab_word *vocab; 124 | void sortVocab(); 125 | int *vocab_hash; 126 | int vocab_hash_size; 127 | 128 | int layer0_size; 129 | int layer1_size; 130 | int layerc_size; 131 | int layer2_size; 132 | 133 | long long direct_size; 134 | int direct_order; 135 | int history[MAX_NGRAM_ORDER]; 136 | 137 | int bptt; 138 | int bptt_block; 139 | int *bptt_history; 140 | neuron *bptt_hidden; 141 | struct synapse *bptt_syn0; 142 | 143 | int gen; 144 | 145 | int independent; 146 | 147 | struct neuron *neu0; //neurons in input layer 148 | struct neuron *neu1; //neurons in hidden layer 149 | struct neuron *neuc; 
//neurons in hidden layer 150 | struct neuron *neu2; //neurons in output layer 151 | 152 | struct synapse *syn0; //weights between input and hidden layer 153 | struct synapse *syn1; //weights between hidden and output layer (or hidden and compression if compression>0) 154 | struct synapse *sync; //weights between hidden and compression layer 155 | direct_t *syn_d; //direct parameters between input and output layer (similar to Maximum Entropy model parameters) 156 | 157 | //backup used in training: 158 | struct neuron *neu0b; 159 | struct neuron *neu1b; 160 | struct neuron *neucb; 161 | struct neuron *neu2b; 162 | 163 | struct synapse *syn0b; 164 | struct synapse *syn1b; 165 | struct synapse *syncb; 166 | direct_t *syn_db; 167 | 168 | //backup used in n-bset rescoring: 169 | struct neuron *neu1b2; 170 | 171 | 172 | //public: 173 | 174 | int alpha_set, train_file_set; 175 | 176 | CRnnLM() //constructor initializes variables 177 | { 178 | version=10; 179 | joint=true; 180 | filetype=TEXT; 181 | 182 | use_lmprob=0; 183 | lambda=0.75; 184 | gradient_cutoff=15; 185 | dynamic=0; 186 | 187 | train_file[0]=0; 188 | valid_file[0]=0; 189 | test_file[0]=0; 190 | rnnlm_file[0]=0; 191 | 192 | alpha_set=0; 193 | train_file_set=0; 194 | 195 | alpha=0.1; 196 | beta=0.0000001; 197 | //beta=0.00000; 198 | alpha_divide=0; 199 | logp=0; 200 | llogp=-100000000; 201 | iter=0; 202 | 203 | min_improvement=1.003; 204 | 205 | train_words=0; 206 | train_cur_pos=0; 207 | vocab_max_size=100; 208 | vocab_size=0; 209 | vocab=(struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word)); 210 | 211 | layer1_size=30; 212 | 213 | direct_size=0; 214 | direct_order=0; 215 | 216 | bptt=0; 217 | bptt_block=10; 218 | bptt_history=NULL; 219 | bptt_hidden=NULL; 220 | bptt_syn0=NULL; 221 | 222 | gen=0; 223 | 224 | independent=0; 225 | 226 | neu0=NULL; 227 | neu1=NULL; 228 | neuc=NULL; 229 | neu2=NULL; 230 | 231 | syn0=NULL; 232 | syn1=NULL; 233 | sync=NULL; 234 | syn_d=NULL; 235 | syn_db=NULL; 236 | 
//backup 237 | neu0b=NULL; 238 | neu1b=NULL; 239 | neucb=NULL; 240 | neu2b=NULL; 241 | 242 | neu1b2=NULL; 243 | 244 | syn0b=NULL; 245 | syn1b=NULL; 246 | syncb=NULL; 247 | // 248 | 249 | rand_seed=1; 250 | 251 | class_size=100; 252 | old_classes=0; 253 | 254 | one_iter=0; 255 | 256 | debug_mode=1; 257 | srand(rand_seed); 258 | 259 | vocab_hash_size=100000000; 260 | vocab_hash=(int *)calloc(vocab_hash_size, sizeof(int)); 261 | } 262 | 263 | ~CRnnLM() //destructor, deallocates memory 264 | { 265 | int i; 266 | 267 | if (neu0!=NULL) { 268 | free(neu0); 269 | free(neu1); 270 | if (neuc!=NULL) free(neuc); 271 | free(neu2); 272 | 273 | free(syn0); 274 | free(syn1); 275 | if (sync!=NULL) free(sync); 276 | 277 | if (syn_d!=NULL) free(syn_d); 278 | 279 | if (syn_db!=NULL) free(syn_db); 280 | 281 | // 282 | free(neu0b); 283 | free(neu1b); 284 | if (neucb!=NULL) free(neucb); 285 | free(neu2b); 286 | 287 | free(neu1b2); 288 | 289 | free(syn0b); 290 | free(syn1b); 291 | if (syncb!=NULL) free(syncb); 292 | // 293 | 294 | for (i=0; i& SearchJointVocab (string& word); 312 | void SaveContext (std::string& id); 313 | void RestoreContext (std::string& id); 314 | 315 | real random(real min, real max); 316 | 317 | void setTrainFile(char *str); 318 | void setValidFile(char *str); 319 | void setTestFile(char *str); 320 | void setRnnLMFile(char *str); 321 | void setLMProbFile(char *str) {strcpy(lmprob_file, str);} 322 | 323 | void setFileType(int newt) {filetype=newt;} 324 | 325 | void setClassSize(int newSize) {class_size=newSize;} 326 | void setOldClasses(int newVal) {old_classes=newVal;} 327 | void setLambda(real newLambda) {lambda=newLambda;} 328 | void setGradientCutoff(real newGradient) {gradient_cutoff=newGradient;} 329 | void setDynamic(real newD) {dynamic=newD;} 330 | void setGen(real newGen) {gen=newGen;} 331 | void setIndependent(int newVal) {independent=newVal;} 332 | 333 | void setLearningRate(real newAlpha) {alpha=newAlpha;} 334 | void setRegularization(real newBeta) 
{beta=newBeta;} 335 | void setMinImprovement(real newMinImprovement) {min_improvement=newMinImprovement;} 336 | void setHiddenLayerSize(int newsize) {layer1_size=newsize;} 337 | void setCompressionLayerSize(int newsize) {layerc_size=newsize;} 338 | void setDirectSize(long long newsize) {direct_size=newsize;} 339 | void setDirectOrder(int newsize) {direct_order=newsize;} 340 | void setBPTT(int newval) {bptt=newval;} 341 | void setBPTTBlock(int newval) {bptt_block=newval;} 342 | void setRandSeed(int newSeed) {rand_seed=newSeed; srand(rand_seed);} 343 | void setDebugMode(int newDebug) {debug_mode=newDebug;} 344 | void setAntiKasparek(int newAnti) {anti_k=newAnti;} 345 | void setOneIter(int newOneIter) {one_iter=newOneIter;} 346 | 347 | int getWordHash(char *word); 348 | void readWord(char *word, FILE *fin); 349 | int searchVocab(char *word); 350 | int readWordIndex(FILE *fin); 351 | int addWordToVocab(char *word); 352 | void learnVocabFromTrainFile(); //train_file will be used to construct vocabulary 353 | 354 | void saveWeights(); //saves current weights and unit activations 355 | void restoreWeights(); //restores current weights and unit activations from backup copy 356 | //void saveWeights2(); //allows 2. 
copy to be stored, useful for dynamic rescoring of nbest lists 357 | //void restoreWeights2(); 358 | void saveContext(); 359 | void restoreContext(); 360 | void saveContext2(); 361 | void restoreContext2(); 362 | void initNet(); 363 | void saveNet(); 364 | void goToDelimiter(int delim, FILE *fi); 365 | void restoreNet(); 366 | void netFlush(); 367 | void netReset(); //will erase just hidden layer state + bptt history + maxent history (called at end of sentences in the independent mode) 368 | 369 | void computeNet(int last_word, int word); 370 | void learnNet(int last_word, int word); 371 | void copyHiddenLayerToInput(); 372 | void trainNet(); 373 | void useLMProb(int use) {use_lmprob=use;} 374 | void testNet(); 375 | void testNbest(); 376 | void testGen(); 377 | 378 | void matrixXvector(struct neuron *dest, struct neuron *srcvec, struct synapse *srcmatrix, int matrix_width, int from, int to, int from2, int to2, int type); 379 | }; 380 | 381 | #endif 382 | -------------------------------------------------------------------------------- /src/3rdparty/utfcpp/utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of 
machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "utf8/checked.h" 32 | #include "utf8/unchecked.h" 33 | 34 | #endif // header guard 35 | -------------------------------------------------------------------------------- /src/3rdparty/utfcpp/utf8/checked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | #include 33 | 34 | namespace utf8 35 | { 36 | // Base for the exceptions that may be thrown from the library 37 | class exception : public std::exception { 38 | }; 39 | 40 | // Exceptions that may be thrown from the library functions. 41 | class invalid_code_point : public exception { 42 | uint32_t cp; 43 | public: 44 | invalid_code_point(uint32_t cp) : cp(cp) {} 45 | virtual const char* what() const throw() { return "Invalid code point"; } 46 | uint32_t code_point() const {return cp;} 47 | }; 48 | 49 | class invalid_utf8 : public exception { 50 | uint8_t u8; 51 | public: 52 | invalid_utf8 (uint8_t u) : u8(u) {} 53 | virtual const char* what() const throw() { return "Invalid UTF-8"; } 54 | uint8_t utf8_octet() const {return u8;} 55 | }; 56 | 57 | class invalid_utf16 : public exception { 58 | uint16_t u16; 59 | public: 60 | invalid_utf16 (uint16_t u) : u16(u) {} 61 | virtual const char* what() const throw() { return "Invalid UTF-16"; } 62 | uint16_t utf16_word() const {return u16;} 63 | }; 64 | 65 | class not_enough_room : public exception { 66 | public: 67 | virtual const char* what() const throw() { return "Not enough space"; } 68 | }; 69 | 70 | /// The library API - functions intended to be called by the users 71 | 72 | template 73 | output_iterator 
replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 74 | { 75 | while (start != end) { 76 | octet_iterator sequence_start = start; 77 | internal::utf_error err_code = internal::validate_next(start, end); 78 | switch (err_code) { 79 | case internal::UTF8_OK : 80 | for (octet_iterator it = sequence_start; it != start; ++it) 81 | *out++ = *it; 82 | break; 83 | case internal::NOT_ENOUGH_ROOM: 84 | throw not_enough_room(); 85 | case internal::INVALID_LEAD: 86 | out = append (replacement, out); 87 | ++start; 88 | break; 89 | case internal::INCOMPLETE_SEQUENCE: 90 | case internal::OVERLONG_SEQUENCE: 91 | case internal::INVALID_CODE_POINT: 92 | out = append (replacement, out); 93 | ++start; 94 | // just one replacement mark for the sequence; test for end before dereferencing start 95 | while (start != end && internal::is_trail(*start)) 96 | ++start; 97 | break; 98 | } 99 | } 100 | return out; 101 | } 102 | 103 | template 104 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 105 | { 106 | static const uint32_t replacement_marker = internal::mask16(0xfffd); 107 | return replace_invalid(start, end, out, replacement_marker); 108 | } 109 | 110 | template 111 | octet_iterator append(uint32_t cp, octet_iterator result) 112 | { 113 | if (!internal::is_code_point_valid(cp)) 114 | throw invalid_code_point(cp); 115 | 116 | if (cp < 0x80) // one octet 117 | *(result++) = static_cast(cp); 118 | else if (cp < 0x800) { // two octets 119 | *(result++) = static_cast((cp >> 6) | 0xc0); 120 | *(result++) = static_cast((cp & 0x3f) | 0x80); 121 | } 122 | else if (cp < 0x10000) { // three octets 123 | *(result++) = static_cast((cp >> 12) | 0xe0); 124 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 125 | *(result++) = static_cast((cp & 0x3f) | 0x80); 126 | } 127 | else { // four octets 128 | *(result++) = static_cast((cp >> 18) | 0xf0); 129 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 130 | *(result++) = static_cast(((cp >> 6) 
& 0x3f) | 0x80); 131 | *(result++) = static_cast((cp & 0x3f) | 0x80); 132 | } 133 | return result; 134 | } 135 | 136 | template 137 | uint32_t next(octet_iterator& it, octet_iterator end) 138 | { 139 | uint32_t cp = 0; 140 | internal::utf_error err_code = internal::validate_next(it, end, &cp); 141 | switch (err_code) { 142 | case internal::UTF8_OK : 143 | break; 144 | case internal::NOT_ENOUGH_ROOM : 145 | throw not_enough_room(); 146 | case internal::INVALID_LEAD : 147 | case internal::INCOMPLETE_SEQUENCE : 148 | case internal::OVERLONG_SEQUENCE : 149 | throw invalid_utf8(*it); 150 | case internal::INVALID_CODE_POINT : 151 | throw invalid_code_point(cp); 152 | } 153 | return cp; 154 | } 155 | 156 | template 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) 158 | { 159 | return next(it, end); 160 | } 161 | 162 | template 163 | uint32_t prior(octet_iterator& it, octet_iterator start) 164 | { 165 | // can't do much if it == start 166 | if (it == start) 167 | throw not_enough_room(); 168 | 169 | octet_iterator end = it; 170 | // Go back until we hit either a lead octet or start 171 | while (internal::is_trail(*(--it))) 172 | if (it == start) 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence 174 | return peek_next(it, end); 175 | } 176 | 177 | /// Deprecated in versions that include "prior" 178 | template 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) 180 | { 181 | octet_iterator end = it; 182 | while (internal::is_trail(*(--it))) 183 | if (it == pass_start) 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence 185 | octet_iterator temp = it; 186 | return next(temp, end); 187 | } 188 | 189 | template 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 191 | { 192 | for (distance_type i = 0; i < n; ++i) 193 | next(it, end); 194 | } 195 | 196 | template 197 | typename std::iterator_traits::difference_type 198 | distance (octet_iterator first, octet_iterator last) 199 | { 
200 | typename std::iterator_traits::difference_type dist; 201 | for (dist = 0; first < last; ++dist) 202 | next(first, last); 203 | return dist; 204 | } 205 | 206 | template 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 208 | { 209 | while (start != end) { 210 | uint32_t cp = internal::mask16(*start++); 211 | // Take care of surrogate pairs first 212 | if (internal::is_lead_surrogate(cp)) { 213 | if (start != end) { 214 | uint32_t trail_surrogate = internal::mask16(*start++); 215 | if (internal::is_trail_surrogate(trail_surrogate)) 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 217 | else 218 | throw invalid_utf16(static_cast(trail_surrogate)); 219 | } 220 | else 221 | throw invalid_utf16(static_cast(cp)); 222 | 223 | } 224 | // Lone trail surrogate 225 | else if (internal::is_trail_surrogate(cp)) 226 | throw invalid_utf16(static_cast(cp)); 227 | 228 | result = append(cp, result); 229 | } 230 | return result; 231 | } 232 | 233 | template 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 235 | { 236 | while (start != end) { 237 | uint32_t cp = next(start, end); 238 | if (cp > 0xffff) { //make a surrogate pair 239 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 240 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 241 | } 242 | else 243 | *result++ = static_cast(cp); 244 | } 245 | return result; 246 | } 247 | 248 | template 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 250 | { 251 | while (start != end) 252 | result = append(*(start++), result); 253 | 254 | return result; 255 | } 256 | 257 | template 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 259 | { 260 | while (start != end) 261 | (*result++) = next(start, end); 262 | 263 | return result; 264 | } 265 | 266 | // The iterator class 267 | template 268 | 
class iterator : public std::iterator { 269 | octet_iterator it; 270 | octet_iterator range_start; 271 | octet_iterator range_end; 272 | public: 273 | iterator () {}; 274 | explicit iterator (const octet_iterator& octet_it, 275 | const octet_iterator& range_start, 276 | const octet_iterator& range_end) : 277 | it(octet_it), range_start(range_start), range_end(range_end) 278 | { 279 | if (it < range_start || it > range_end) 280 | throw std::out_of_range("Invalid utf-8 iterator position"); 281 | } 282 | // the default "big three" are OK 283 | octet_iterator base () const { return it; } 284 | uint32_t operator * () const 285 | { 286 | octet_iterator temp = it; 287 | return next(temp, range_end); 288 | } 289 | bool operator == (const iterator& rhs) const 290 | { 291 | if (range_start != rhs.range_start || range_end != rhs.range_end) 292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 293 | return (it == rhs.it); 294 | } 295 | bool operator != (const iterator& rhs) const 296 | { 297 | return !(operator == (rhs)); 298 | } 299 | iterator& operator ++ () 300 | { 301 | next(it, range_end); 302 | return *this; 303 | } 304 | iterator operator ++ (int) 305 | { 306 | iterator temp = *this; 307 | next(it, range_end); 308 | return temp; 309 | } 310 | iterator& operator -- () 311 | { 312 | prior(it, range_start); 313 | return *this; 314 | } 315 | iterator operator -- (int) 316 | { 317 | iterator temp = *this; 318 | prior(it, range_start); 319 | return temp; 320 | } 321 | }; // class iterator 322 | 323 | } // namespace utf8 324 | 325 | #endif //header guard 326 | 327 | 328 | -------------------------------------------------------------------------------- /src/3rdparty/utfcpp/utf8/core.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and 
accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include 32 | 33 | namespace utf8 34 | { 35 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers 36 | // You may need to change them to match your system. 37 | // These typedefs have the same names as ones from cstdint, or boost/cstdint 38 | typedef unsigned char uint8_t; 39 | typedef unsigned short uint16_t; 40 | typedef unsigned int uint32_t; 41 | 42 | // Helper code - not intended to be directly called by the library users. 
May be changed at any time 43 | namespace internal 44 | { 45 | // Unicode constants 46 | // Leading (high) surrogates: 0xd800 - 0xdbff 47 | // Trailing (low) surrogates: 0xdc00 - 0xdfff 48 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u; 49 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; 50 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; 51 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; 52 | const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); 53 | const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; 54 | 55 | // Maximum valid value for a Unicode code point 56 | const uint32_t CODE_POINT_MAX = 0x0010ffffu; 57 | 58 | template 59 | inline uint8_t mask8(octet_type oc) 60 | { 61 | return static_cast(0xff & oc); 62 | } 63 | template 64 | inline uint16_t mask16(u16_type oc) 65 | { 66 | return static_cast(0xffff & oc); 67 | } 68 | template 69 | inline bool is_trail(octet_type oc) 70 | { 71 | return ((mask8(oc) >> 6) == 0x2); 72 | } 73 | 74 | template 75 | inline bool is_lead_surrogate(u16 cp) 76 | { 77 | return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); 78 | } 79 | 80 | template 81 | inline bool is_trail_surrogate(u16 cp) 82 | { 83 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 84 | } 85 | 86 | template 87 | inline bool is_surrogate(u16 cp) 88 | { 89 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 90 | } 91 | 92 | template 93 | inline bool is_code_point_valid(u32 cp) 94 | { 95 | return (cp <= CODE_POINT_MAX && !is_surrogate(cp)); 96 | } 97 | 98 | template 99 | inline typename std::iterator_traits::difference_type 100 | sequence_length(octet_iterator lead_it) 101 | { 102 | uint8_t lead = mask8(*lead_it); 103 | if (lead < 0x80) 104 | return 1; 105 | else if ((lead >> 5) == 0x6) 106 | return 2; 107 | else if ((lead >> 4) == 0xe) 108 | return 3; 109 | else if ((lead >> 3) == 0x1e) 110 | return 4; 111 | else 112 | return 0; 113 | } 114 | 115 | template 116 | inline bool 
is_overlong_sequence(uint32_t cp, octet_difference_type length) 117 | { 118 | if (cp < 0x80) { 119 | if (length != 1) 120 | return true; 121 | } 122 | else if (cp < 0x800) { 123 | if (length != 2) 124 | return true; 125 | } 126 | else if (cp < 0x10000) { 127 | if (length != 3) 128 | return true; 129 | } 130 | 131 | return false; 132 | } 133 | 134 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; 135 | 136 | /// get_sequence_x functions decode utf-8 sequences of the length x 137 | 138 | template 139 | utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point) 140 | { 141 | if (it != end) { 142 | if (code_point) 143 | *code_point = mask8(*it); 144 | return UTF8_OK; 145 | } 146 | return NOT_ENOUGH_ROOM; 147 | } 148 | 149 | template 150 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point) 151 | { 152 | utf_error ret_code = NOT_ENOUGH_ROOM; 153 | 154 | if (it != end) { 155 | uint32_t cp = mask8(*it); 156 | if (++it != end) { 157 | if (is_trail(*it)) { 158 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 159 | 160 | if (code_point) 161 | *code_point = cp; 162 | ret_code = UTF8_OK; 163 | } 164 | else 165 | ret_code = INCOMPLETE_SEQUENCE; 166 | } 167 | else 168 | ret_code = NOT_ENOUGH_ROOM; 169 | } 170 | 171 | return ret_code; 172 | } 173 | 174 | template 175 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point) 176 | { 177 | utf_error ret_code = NOT_ENOUGH_ROOM; 178 | 179 | if (it != end) { 180 | uint32_t cp = mask8(*it); 181 | if (++it != end) { 182 | if (is_trail(*it)) { 183 | cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff); 184 | if (++it != end) { 185 | if (is_trail(*it)) { 186 | cp += (*it) & 0x3f; 187 | 188 | if (code_point) 189 | *code_point = cp; 190 | ret_code = UTF8_OK; 191 | } 192 | else 193 | ret_code = INCOMPLETE_SEQUENCE; 194 | } 195 | else 196 | ret_code = NOT_ENOUGH_ROOM; 197 | } 198 | 
else 199 | ret_code = INCOMPLETE_SEQUENCE; 200 | } 201 | else 202 | ret_code = NOT_ENOUGH_ROOM; 203 | } 204 | 205 | return ret_code; 206 | } 207 | 208 | template 209 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point) 210 | { 211 | utf_error ret_code = NOT_ENOUGH_ROOM; 212 | 213 | if (it != end) { 214 | uint32_t cp = mask8(*it); 215 | if (++it != end) { 216 | if (is_trail(*it)) { 217 | cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff); 218 | if (++it != end) { 219 | if (is_trail(*it)) { 220 | cp += (mask8(*it) << 6) & 0xfff; 221 | if (++it != end) { 222 | if (is_trail(*it)) { 223 | cp += (*it) & 0x3f; 224 | 225 | if (code_point) 226 | *code_point = cp; 227 | ret_code = UTF8_OK; 228 | } 229 | else 230 | ret_code = INCOMPLETE_SEQUENCE; 231 | } 232 | else 233 | ret_code = NOT_ENOUGH_ROOM; 234 | } 235 | else 236 | ret_code = INCOMPLETE_SEQUENCE; 237 | } 238 | else 239 | ret_code = NOT_ENOUGH_ROOM; 240 | } 241 | else 242 | ret_code = INCOMPLETE_SEQUENCE; 243 | } 244 | else 245 | ret_code = NOT_ENOUGH_ROOM; 246 | } 247 | 248 | return ret_code; 249 | } 250 | 251 | template 252 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point) 253 | { 254 | // Save the original value of it so we can go back in case of failure 255 | // Of course, it does not make much sense with i.e. 
stream iterators 256 | octet_iterator original_it = it; 257 | 258 | uint32_t cp = 0; 259 | // Determine the sequence length based on the lead octet 260 | typedef typename std::iterator_traits::difference_type octet_difference_type; 261 | octet_difference_type length = sequence_length(it); 262 | if (length == 0) 263 | return INVALID_LEAD; 264 | 265 | // Now that we have a valid sequence length, get trail octets and calculate the code point 266 | utf_error err = UTF8_OK; 267 | switch (length) { 268 | case 1: 269 | err = get_sequence_1(it, end, &cp); 270 | break; 271 | case 2: 272 | err = get_sequence_2(it, end, &cp); 273 | break; 274 | case 3: 275 | err = get_sequence_3(it, end, &cp); 276 | break; 277 | case 4: 278 | err = get_sequence_4(it, end, &cp); 279 | break; 280 | } 281 | 282 | if (err == UTF8_OK) { 283 | // Decoding succeeded. Now, security checks... 284 | if (is_code_point_valid(cp)) { 285 | if (!is_overlong_sequence(cp, length)){ 286 | // Passed! Return here. 287 | if (code_point) 288 | *code_point = cp; 289 | ++it; 290 | return UTF8_OK; 291 | } 292 | else 293 | err = OVERLONG_SEQUENCE; 294 | } 295 | else 296 | err = INVALID_CODE_POINT; 297 | } 298 | 299 | // Failure branch - restore the original value of the iterator 300 | it = original_it; 301 | return err; 302 | } 303 | 304 | template 305 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { 306 | return validate_next(it, end, 0); 307 | } 308 | 309 | } // namespace internal 310 | 311 | /// The library API - functions intended to be called by the users 312 | 313 | // Byte order mark 314 | const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 315 | 316 | template 317 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) 318 | { 319 | octet_iterator result = start; 320 | while (result != end) { 321 | internal::utf_error err_code = internal::validate_next(result, end); 322 | if (err_code != internal::UTF8_OK) 323 | return result; 324 | } 325 | return result; 326 | } 327 | 328 | 
template 329 | inline bool is_valid(octet_iterator start, octet_iterator end) 330 | { 331 | return (find_invalid(start, end) == end); 332 | } 333 | 334 | template 335 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) 336 | { 337 | return ( 338 | ((it != end) && (internal::mask8(*it++)) == bom[0]) && 339 | ((it != end) && (internal::mask8(*it++)) == bom[1]) && 340 | ((it != end) && (internal::mask8(*it)) == bom[2]) 341 | ); 342 | } 343 | 344 | //Deprecated in release 2.3 345 | template 346 | inline bool is_bom (octet_iterator it) 347 | { 348 | return ( 349 | (internal::mask8(*it++)) == bom[0] && 350 | (internal::mask8(*it++)) == bom[1] && 351 | (internal::mask8(*it)) == bom[2] 352 | ); 353 | } 354 | } // namespace utf8 355 | 356 | #endif // header guard 357 | 358 | 359 | -------------------------------------------------------------------------------- /src/3rdparty/utfcpp/utf8/unchecked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | 33 | namespace utf8 34 | { 35 | namespace unchecked 36 | { 37 | template 38 | octet_iterator append(uint32_t cp, octet_iterator result) 39 | { 40 | if (cp < 0x80) // one octet 41 | *(result++) = static_cast(cp); 42 | else if (cp < 0x800) { // two octets 43 | *(result++) = static_cast((cp >> 6) | 0xc0); 44 | *(result++) = static_cast((cp & 0x3f) | 0x80); 45 | } 46 | else if (cp < 0x10000) { // three octets 47 | *(result++) = static_cast((cp >> 12) | 0xe0); 48 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 49 | *(result++) = static_cast((cp & 0x3f) | 0x80); 50 | } 51 | else { // four octets 52 | *(result++) = static_cast((cp >> 18) | 0xf0); 53 | *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); 54 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 55 | *(result++) = static_cast((cp & 0x3f) | 0x80); 56 | } 57 | return result; 58 | } 59 | 60 | template 61 | uint32_t next(octet_iterator& it) 62 | { 63 | uint32_t cp = internal::mask8(*it); 64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); 65 | switch (length) { 66 | case 1: 67 | break; 68 | case 2: 69 | it++; 70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 71 | break; 72 | case 3: 73 | ++it; 74 | cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 
0xfff); 75 | ++it; 76 | cp += (*it) & 0x3f; 77 | break; 78 | case 4: 79 | ++it; 80 | cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff); 81 | ++it; 82 | cp += (internal::mask8(*it) << 6) & 0xfff; 83 | ++it; 84 | cp += (*it) & 0x3f; 85 | break; 86 | } 87 | ++it; 88 | return cp; 89 | } 90 | 91 | template 92 | uint32_t peek_next(octet_iterator it) 93 | { 94 | return next(it); 95 | } 96 | 97 | template 98 | uint32_t prior(octet_iterator& it) 99 | { 100 | while (internal::is_trail(*(--it))) ; 101 | octet_iterator temp = it; 102 | return next(temp); 103 | } 104 | 105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 106 | template 107 | inline uint32_t previous(octet_iterator& it) 108 | { 109 | return prior(it); 110 | } 111 | 112 | template 113 | void advance (octet_iterator& it, distance_type n) 114 | { 115 | for (distance_type i = 0; i < n; ++i) 116 | next(it); 117 | } 118 | 119 | template 120 | typename std::iterator_traits::difference_type 121 | distance (octet_iterator first, octet_iterator last) 122 | { 123 | typename std::iterator_traits::difference_type dist; 124 | for (dist = 0; first < last; ++dist) 125 | next(first); 126 | return dist; 127 | } 128 | 129 | template 130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 131 | { 132 | while (start != end) { 133 | uint32_t cp = internal::mask16(*start++); 134 | // Take care of surrogate pairs first 135 | if (internal::is_lead_surrogate(cp)) { 136 | uint32_t trail_surrogate = internal::mask16(*start++); 137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 138 | } 139 | result = append(cp, result); 140 | } 141 | return result; 142 | } 143 | 144 | template 145 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 146 | { 147 | while (start < end) { 148 | uint32_t cp = next(start); 149 | if (cp > 0xffff) { //make a surrogate pair 150 | 
*result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 152 | } 153 | else 154 | *result++ = static_cast(cp); 155 | } 156 | return result; 157 | } 158 | 159 | template 160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 161 | { 162 | while (start != end) 163 | result = append(*(start++), result); 164 | 165 | return result; 166 | } 167 | 168 | template 169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 170 | { 171 | while (start < end) 172 | (*result++) = next(start); 173 | 174 | return result; 175 | } 176 | 177 | // The iterator class 178 | template 179 | class iterator : public std::iterator { 180 | octet_iterator it; 181 | public: 182 | iterator () {}; 183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 184 | // the default "big three" are OK 185 | octet_iterator base () const { return it; } 186 | uint32_t operator * () const 187 | { 188 | octet_iterator temp = it; 189 | return next(temp); 190 | } 191 | bool operator == (const iterator& rhs) const 192 | { 193 | return (it == rhs.it); 194 | } 195 | bool operator != (const iterator& rhs) const 196 | { 197 | return !(operator == (rhs)); 198 | } 199 | iterator& operator ++ () 200 | { 201 | std::advance(it, internal::sequence_length(it)); 202 | return *this; 203 | } 204 | iterator operator ++ (int) 205 | { 206 | iterator temp = *this; 207 | std::advance(it, internal::sequence_length(it)); 208 | return temp; 209 | } 210 | iterator& operator -- () 211 | { 212 | prior(it); 213 | return *this; 214 | } 215 | iterator operator -- (int) 216 | { 217 | iterator temp = *this; 218 | prior(it); 219 | return temp; 220 | } 221 | }; // class iterator 222 | 223 | } // namespace utf8::unchecked 224 | } // namespace utf8 225 | 226 | 227 | #endif // header guard 228 | 229 | 
-------------------------------------------------------------------------------- /src/bin/phonetisaurus-arpa2wfst.cc: -------------------------------------------------------------------------------- 1 | /* 2 | phonetisaurus-arpa2wfst.cc 3 | 4 | Copyright (c) [2012-], Josef Robert Novak 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted #provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of #conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 27 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 29 | OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | * 31 | */ 32 | using namespace std; 33 | #include 34 | #include 35 | 36 | using namespace fst; 37 | 38 | DEFINE_string (lm, "", "Input ARPA format LM."); 39 | DEFINE_string (eps, "", "Epsilon symbol."); 40 | DEFINE_string (sb, "", "Sentence begin token."); 41 | DEFINE_string (se, "", "Sentence end token."); 42 | DEFINE_string (split, "}", "Character separating grapheme/phoneme info."); 43 | DEFINE_string (skip, "_", "Character indicating insertions/deletions."); 44 | DEFINE_string (tie, "|", "Character separating multi-token subsequences."); 45 | DEFINE_string (ssyms, "", "Output filename for state symbols tables (default: do not print)."); 46 | DEFINE_string (ofile, "", "Output file for writing. (STDOUT)"); 47 | 48 | int main (int argc, char* argv []) { 49 | cerr << "GitRevision: " << GIT_REVISION << endl; 50 | string usage = "arpa2wfsa - Transform an ARPA LM into an " 51 | "equivalent WFSA.\n\n Usage: "; 52 | set_new_handler (FailedNewHandler); 53 | PhonetisaurusSetFlags (usage.c_str(), &argc, &argv, false); 54 | 55 | if (FLAGS_lm.compare ("") == 0) { 56 | cerr << "You must supply an ARPA format lm " 57 | "to --lm for conversion!" << endl; 58 | return 0; 59 | } 60 | 61 | cerr << "Initializing..." << endl; 62 | ARPA2WFST* converter = new ARPA2WFST (FLAGS_lm, FLAGS_eps, FLAGS_sb, 63 | FLAGS_se, FLAGS_split, FLAGS_skip, 64 | FLAGS_tie); 65 | cerr << "Converting..." << endl; 66 | converter->arpa_to_wfst (); 67 | 68 | converter->arpafst.Write (FLAGS_ofile); 69 | 70 | if (FLAGS_ssyms.compare ("") != 0) { 71 | converter->ssyms->WriteText (FLAGS_ssyms); 72 | } 73 | 74 | delete converter; 75 | 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /src/bin/phonetisaurus-g2pfst.cc: -------------------------------------------------------------------------------- 1 | /* 2 | phonetisaurus-g2pfst.cc 3 | 4 | Copyright (c) [2012-], Josef Robert Novak 5 | All rights reserved. 
6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted #provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of #conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 27 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 29 | OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | * 31 | */ 32 | #include 33 | using namespace std; 34 | #include 35 | #include 36 | #include 37 | using namespace fst; 38 | 39 | typedef unordered_map > RMAP; 40 | 41 | void PrintPathData (const vector& results, string FLAGS_word, 42 | const SymbolTable* osyms, bool print_scores = true, 43 | bool nlog_probs = true) { 44 | for (int i = 0; i < results.size (); i++) { 45 | cout << FLAGS_word << "\t"; 46 | if (print_scores == true) { 47 | if (nlog_probs == true) 48 | cout << results [i].PathWeight << "\t"; 49 | else 50 | cout << std::setprecision (3) << exp (-results [i].PathWeight) << "\t"; 51 | } 52 | 53 | for (int j = 0; j < results [i].Uniques.size (); j++) { 54 | cout << osyms->Find (results [i].Uniques [j]); 55 | if (j < results [i].Uniques.size () - 1) 56 | cout << " "; 57 | } 58 | cout << endl; 59 | } 60 | } 61 | 62 | void EvaluateWordlist (PhonetisaurusScript& decoder, vector corpus, 63 | int FLAGS_beam, int FLAGS_nbest, bool FLAGS_reverse, 64 | string FLAGS_skip, double FLAGS_thresh, string FLAGS_gsep, 65 | bool FLAGS_write_fsts, bool FLAGS_print_scores, 66 | bool FLAGS_accumulate, double FLAGS_pmass, 67 | bool FLAGS_nlog_probs) { 68 | for (int i = 0; i < corpus.size (); i++) { 69 | vector results = decoder.Phoneticize (corpus [i], FLAGS_nbest, 70 | FLAGS_beam, FLAGS_thresh, 71 | FLAGS_write_fsts, 72 | FLAGS_accumulate, FLAGS_pmass); 73 | PrintPathData (results, corpus [i], 74 | decoder.osyms_, 75 | FLAGS_print_scores, 76 | FLAGS_nlog_probs); 77 | } 78 | } 79 | 80 | 81 | DEFINE_string (model, "", "Input FST G2P model."); 82 | DEFINE_string (word, "", "Input word to phoneticize."); 83 | DEFINE_string (wordlist, "", "Input wordlist to phoneticize"); 84 | DEFINE_string (gsep, "", "Grapheme separator."); 85 | DEFINE_string (skip, "_", "Phoneme skip marker."); 86 | DEFINE_int32 (nbest, 1, "N-best hypotheses to output."); 87 | DEFINE_int32 (beam, 10000, "Decoder beam."); 88 | DEFINE_double (thresh, 99.0, "N-best comparison threshold."); 89 | DEFINE_double 
(pmass, 0.0, "Percent of probability mass (0.0 < p <= 1.0)."); 90 | DEFINE_bool (write_fsts, false, "Write the output FSTs for debugging."); 91 | DEFINE_bool (reverse, false, "Reverse input word."); 92 | DEFINE_bool (print_scores, true, "Print scores in output."); 93 | DEFINE_bool (accumulate, false, "Accumulate weights for unique output prons."); 94 | DEFINE_bool (nlog_probs, true, "Default scores vals are negative logs. " 95 | "Otherwise exp (-val)."); 96 | int main (int argc, char* argv []) { 97 | cerr << "GitRevision: " << GIT_REVISION << endl; 98 | string usage = "phonetisaurus-g2pfst - joint N-gram decoder.\n\n Usage: "; 99 | set_new_handler (FailedNewHandler); 100 | PhonetisaurusSetFlags (usage.c_str(), &argc, &argv, false); 101 | 102 | if (FLAGS_model.compare ("") == 0) { 103 | cerr << "You must supply an FST model to --model" << endl; 104 | exit (1); 105 | } else { 106 | std::ifstream model_ifp (FLAGS_model); 107 | if (!model_ifp.good ()) { 108 | cout << "Failed to open --model file '" 109 | << FLAGS_model << "'" << endl; 110 | exit (1); 111 | } 112 | } 113 | 114 | if (FLAGS_pmass < 0.0 || FLAGS_pmass > 1) { 115 | cout << "--pmass must be a float value between 0.0 and 1.0." << endl; 116 | exit (1); 117 | } 118 | if (FLAGS_pmass == 0.0) 119 | FLAGS_pmass = 99.0; 120 | else 121 | FLAGS_pmass = -log (FLAGS_pmass); 122 | 123 | bool use_wordlist = false; 124 | if (FLAGS_wordlist.compare ("") != 0) { 125 | std::ifstream wordlist_ifp (FLAGS_wordlist); 126 | if (!wordlist_ifp.good ()) { 127 | cout << "Failed to open --wordlist file '" 128 | << FLAGS_wordlist << "'" << endl; 129 | exit (1); 130 | } else { 131 | use_wordlist = true; 132 | } 133 | } 134 | 135 | if (FLAGS_wordlist.compare ("") == 0 && FLAGS_word.compare ("") == 0) { 136 | cout << "Either --wordlist or --word must be set!" 
<< endl; 137 | exit (1); 138 | } 139 | 140 | if (use_wordlist == true) { 141 | vector corpus; 142 | LoadWordList (FLAGS_wordlist, &corpus); 143 | 144 | PhonetisaurusScript decoder (FLAGS_model, FLAGS_gsep); 145 | EvaluateWordlist ( 146 | decoder, corpus, FLAGS_beam, FLAGS_nbest, FLAGS_reverse, 147 | FLAGS_skip, FLAGS_thresh, FLAGS_gsep, FLAGS_write_fsts, 148 | FLAGS_print_scores, FLAGS_accumulate, FLAGS_pmass, 149 | FLAGS_nlog_probs 150 | ); 151 | } else { 152 | PhonetisaurusScript decoder (FLAGS_model, FLAGS_gsep); 153 | vector results = decoder.Phoneticize ( 154 | FLAGS_word, FLAGS_nbest, FLAGS_beam, FLAGS_thresh, 155 | FLAGS_write_fsts, FLAGS_accumulate, FLAGS_pmass 156 | ); 157 | PrintPathData (results, FLAGS_word, 158 | decoder.osyms_, 159 | FLAGS_print_scores, 160 | FLAGS_nlog_probs); 161 | } 162 | 163 | return 0; 164 | } 165 | -------------------------------------------------------------------------------- /src/bin/phonetisaurus-g2prnn.cc: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace std; 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "utf8.h" 9 | #ifdef _OPENMP 10 | #include 11 | #endif 12 | using namespace fst; 13 | 14 | typedef LegacyRnnLMDecodable Decodable; 15 | typedef unordered_map RMAP; 16 | 17 | /* Decodes corpus in FLAGS_threads contiguous chunks (one chunk per outer-loop iteration; the loop is parallelised when built with OpenMP), stores each result in rmap keyed by corpus index, then prints every result in corpus order. The last chunk absorbs the remainder of the integer division. */ 18 | void ThreadedEvaluateWordlist (vector& corpus, RMAP& rmap, 19 | LegacyRnnLMHash& h, Decodable& s, 20 | int FLAGS_threads, int FLAGS_beam, 21 | int FLAGS_kmax, int FLAGS_nbest, 22 | bool FLAGS_reverse, string FLAGS_gpdelim, 23 | string FLAGS_gdelim, string FLAGS_skip, 24 | double FLAGS_thresh, string FLAGS_gsep) { 25 | int csize = corpus.size (); /* Create every slot up front, single-threaded: inserting into a shared unordered_map from several threads is a data race, so the workers below only assign to existing elements. */ for (int i = 0; i < csize; i++) rmap [i]; 26 | 27 | #ifdef _OPENMP 28 | #pragma omp parallel for 29 | #endif 30 | for (int x = 0; x < FLAGS_threads; x++) { 31 | RnnLMDecoder decoder (s); 32 | 33 | int start = x * (csize / FLAGS_threads); 34 | int end = (x == FLAGS_threads - 1) ? 
csize \ 35 | : start + (csize / FLAGS_threads); 36 | for (int i = start; i < end; i++) { 37 | vector graphemes = tokenize_utf8_string (&corpus [i], 38 | &FLAGS_gsep); 39 | if (FLAGS_reverse == true) 40 | reverse (graphemes.begin (), graphemes.end ()); 41 | 42 | graphemes.push_back (""); 43 | SimpleResult result = \ 44 | decoder.Decode (graphemes, FLAGS_beam, FLAGS_kmax, 45 | FLAGS_nbest, FLAGS_thresh, FLAGS_gpdelim, 46 | FLAGS_gdelim, FLAGS_skip); 47 | /* at () never inserts, so distinct pre-created elements can be written concurrently. */ rmap.at (i) = result; 48 | } 49 | } 50 | 51 | for (int i = 0; i < csize; i++) { 52 | const SimpleResult& result = rmap [i]; 53 | 54 | for (int k = 0; k < result.pronunciations.size (); k++) 55 | cout << result.word << "\t" << result.scores [k] << "\t" 56 | << result.pronunciations [k] << "\n"; 57 | } 58 | } 59 | 60 | void EvaluateWordlist (vector& corpus, 61 | LegacyRnnLMHash& h, Decodable& s, int FLAGS_beam, 62 | int FLAGS_kmax, int FLAGS_nbest, bool FLAGS_reverse, 63 | string FLAGS_gpdelim, string FLAGS_gdelim, 64 | string FLAGS_skip, double FLAGS_thresh, 65 | string FLAGS_gsep) { 66 | 67 | RnnLMDecoder decoder (s); 68 | for (int i = 0; i < corpus.size (); i++) { 69 | vector graphemes = tokenize_utf8_string (&corpus [i], 70 | &FLAGS_gsep); 71 | if (FLAGS_reverse == true) 72 | reverse (graphemes.begin (), graphemes.end ()); 73 | 74 | graphemes.push_back (""); 75 | 76 | SimpleResult result = \ 77 | decoder.Decode (graphemes, FLAGS_beam, FLAGS_kmax, 78 | FLAGS_nbest, FLAGS_thresh, FLAGS_gpdelim, 79 | FLAGS_gdelim, FLAGS_skip); 80 | 81 | for (int k = 0; k < result.pronunciations.size (); k++) 82 | cout << result.word << "\t" << result.scores [k] << "\t" 83 | << result.pronunciations [k] << "\n"; 84 | } 85 | } 86 | 87 | void EvaluateWord (string word, LegacyRnnLMHash& h, Decodable& s, 88 | int FLAGS_beam, int FLAGS_kmax, int FLAGS_nbest, 89 | bool FLAGS_reverse, string FLAGS_gpdelim, 90 | string FLAGS_gdelim, string FLAGS_skip, 91 | double FLAGS_thresh, string FLAGS_gsep) { 92 | 93 | vector graphemes = tokenize_utf8_string 
(&word, 94 | &FLAGS_gsep); 95 | if (FLAGS_reverse == true) 96 | reverse (graphemes.begin (), graphemes.end ()); 97 | graphemes.push_back (""); 98 | 99 | RnnLMDecoder decoder (s); 100 | SimpleResult result = \ 101 | decoder.Decode (graphemes, FLAGS_beam, FLAGS_kmax, 102 | FLAGS_nbest, FLAGS_thresh, FLAGS_gpdelim, 103 | FLAGS_gdelim, FLAGS_skip); 104 | 105 | for (int k = 0; k < result.pronunciations.size (); k++) 106 | cout << result.word << "\t" << result.scores [k] << "\t" 107 | << result.pronunciations [k] << "\n"; 108 | } 109 | 110 | DEFINE_string (rnnlm, "", "The input RnnLM model."); 111 | DEFINE_string (wordlist, "", "Input word list to evaluate."); 112 | DEFINE_string (word, "", "Single input word to evaluate."); 113 | DEFINE_string (gdelim, "|", "The default multigram delimiter."); 114 | DEFINE_string (gpdelim, "}", "The default grapheme / phoneme delimiter."); 115 | DEFINE_string (gsep, "", "The default grapheme delimiter for testing. Typically ''."); 116 | DEFINE_string (skip, "_", "The default null/skip token."); 117 | DEFINE_int32 (nbest, 1, "Maximum number of hypotheses to return."); 118 | DEFINE_int32 (threads, 1, "Number of parallel threads (OpenMP)."); 119 | DEFINE_int32 (kmax, 20, "State-local maximum queue size."); 120 | DEFINE_int32 (beam, 20, "The state-local beam width."); 121 | DEFINE_double (thresh, 0.0, "The n-best pruning threshold. Relative to 1-best."); 122 | DEFINE_bool (reverse, false, "Reverse the input word before decoding."); 123 | 124 | int main (int argc, char* argv []) { 125 | cerr << "GitRevision: " << GIT_REVISION << endl; 126 | string usage = "phonetisaurus-g2prnn --rnnlm=test.rnnlm " \ 127 | "--wordlist=test.words --nbest=5\n\n Usage: "; 128 | set_new_handler (FailedNewHandler); 129 | PhonetisaurusSetFlags (usage.c_str (), &argc, &argv, false); 130 | 131 | if (FLAGS_rnnlm.compare ("") == 0) { 132 | cout << "--rnnlm model is required!" 
<< endl; 133 | exit (1); 134 | } else { 135 | std::ifstream rnnlm_ifp (FLAGS_rnnlm); 136 | if (!rnnlm_ifp.good ()) { 137 | cout << "Faile to open --rnnlm file '" 138 | << FLAGS_rnnlm << "'" << endl; 139 | exit (1); 140 | } 141 | } 142 | 143 | bool use_wordlist = false; 144 | if (FLAGS_wordlist.compare ("") != 0) { 145 | std::ifstream wordlist_ifp (FLAGS_wordlist); 146 | if (!wordlist_ifp.good ()) { 147 | cout << "Failed to open --wordlist file '" 148 | << FLAGS_wordlist << "'" << endl; 149 | exit (1); 150 | } else { 151 | use_wordlist = true; 152 | } 153 | } 154 | 155 | if (FLAGS_wordlist.compare ("") == 0 && FLAGS_word.compare ("") == 0) { 156 | cout << "Either --wordlist or --word must be set!" << endl; 157 | } 158 | 159 | #ifdef _OPENMP 160 | omp_set_num_threads (FLAGS_threads); 161 | #endif 162 | vector corpus; 163 | 164 | LoadWordList (FLAGS_wordlist, &corpus); 165 | 166 | RMAP rmap; 167 | 168 | LegacyRnnLMReader reader (FLAGS_rnnlm); 169 | LegacyRnnLMHash h = reader.CopyVocabHash (FLAGS_gdelim, FLAGS_gpdelim); 170 | Decodable s = reader.CopyLegacyRnnLM (h); 171 | 172 | if (use_wordlist == true) { 173 | if (FLAGS_threads > 1) { 174 | ThreadedEvaluateWordlist (corpus, rmap, h, s, FLAGS_threads, 175 | FLAGS_beam, FLAGS_kmax, FLAGS_nbest, 176 | FLAGS_reverse, FLAGS_gpdelim, 177 | FLAGS_gdelim, FLAGS_skip, 178 | FLAGS_thresh, FLAGS_gsep); 179 | } else { 180 | EvaluateWordlist (corpus, h, s, FLAGS_beam, 181 | FLAGS_kmax, FLAGS_nbest, FLAGS_reverse, 182 | FLAGS_gpdelim, FLAGS_gdelim, FLAGS_skip, 183 | FLAGS_thresh, FLAGS_gsep); 184 | } 185 | } else { 186 | EvaluateWord (FLAGS_word, h, s, FLAGS_beam, FLAGS_kmax, 187 | FLAGS_nbest, FLAGS_reverse, FLAGS_gpdelim, 188 | FLAGS_gdelim, FLAGS_skip, FLAGS_thresh, FLAGS_gsep); 189 | } 190 | 191 | return 0; 192 | } 193 | -------------------------------------------------------------------------------- /src/include/LatticePruner.h: -------------------------------------------------------------------------------- 1 | #ifndef 
SRC_INCLUDE_LATTICEPRUNER_H_ 2 | #define SRC_INCLUDE_LATTICEPRUNER_H_ 3 | /* 4 | LatticePruner.hpp 5 | 6 | Copyright (c) [2012-], Josef Robert Novak 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted #provided that the following conditions 11 | are met: 12 | 13 | * Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | * Redistributions in binary form must reproduce the above 16 | copyright notice, this list of #conditions and the following 17 | disclaimer in the documentation and/or other materials provided 18 | with the distribution. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 29 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 31 | OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | #include 34 | #include 35 | #include "./util.h" 36 | using namespace std; 37 | 38 | namespace fst { 39 | class LatticePruner { 40 | /* 41 | Generic pruning/re-weighting class for simple WFST lattices. 
42 | Implements several simple pruning methods including the following: 43 | - Arc penalization 44 | - N-best extraction via ShortestPath() 45 | - Arc-based beam pruning via Prune() 46 | - Forward-Backward pruning 47 | These may be combined into a cascade as well. 48 | */ 49 | public: 50 | // Basics declarations 51 | vector alpha, beta; 52 | LabelData penalties; 53 | bool penalize; 54 | int nbest; 55 | bool fb; 56 | TropicalWeight beam; 57 | 58 | // Constructors 59 | LatticePruner (); 60 | // Used with M2MFstAligner we should have a symbol-based penalty model to use 61 | LatticePruner (LabelData _penalties, TropicalWeight _beam, int _nbest, 62 | bool _fb, bool _penalize); 63 | // Otherwise just use an arbitrary lattice/WFST so no penalizing 64 | LatticePruner (TropicalWeight _beam, int _nbest, bool _fb); 65 | 66 | void prune_fst (VectorFst* fst); 67 | 68 | private: 69 | VectorFst _nbest_prune (VectorFst* fst); 70 | void _penalize_arcs (VectorFst* fst); 71 | void _forward_backward (VectorFst* fst); 72 | }; 73 | } // namespace fst 74 | #endif // SRC_INCLUDE_LATTICEPRUNER_H_ 75 | -------------------------------------------------------------------------------- /src/include/LegacyRnnLMDecodable.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_INCLUDE_LEGACYRNNLMDECODABLE_H_ 2 | #define SRC_INCLUDE_LEGACYRNNLMDECODABLE_H_ 3 | #include 4 | using std::vector; 5 | 6 | // Fast exponent implementation from RnnLM 7 | /* 8 | static union { 9 | double d; 10 | struct{ 11 | int j,i; 12 | } n; 13 | } d2i; 14 | #define EXP_A (1048576/M_LN2) 15 | #define EXP_C 60801 16 | #define FAST_EXP(y)(d2i.n.i=EXP_A*(y)+(1072693248-EXP_C),d2i.d) 17 | */ 18 | 19 | #ifdef __cplusplus 20 | #define cast_uint32_t static_cast 21 | #else 22 | #define cast_uint32_t (uint32_t) 23 | #endif 24 | static inline float fastpow2 (float p) { 25 | float offset = (p < 0) ? 1.0f : 0.0f; 26 | float clipp = (p < -126) ? 
-126.0f : p; 27 | int w = clipp; 28 | float z = clipp - w + offset; 29 | union {uint32_t i; float f;} v = { 30 | cast_uint32_t ( 31 | (1 << 23) * (clipp + 121.2740575f + 27.7280233f / 32 | (4.84252568f - z) - 1.49012907f * z) 33 | ) 34 | }; 35 | 36 | return v.f; 37 | } 38 | 39 | static inline float FAST_EXP (float p) { 40 | return fastpow2 (1.442695040f * p); 41 | } 42 | 43 | 44 | template 45 | class LegacyRnnLMDecodable { 46 | public: 47 | LegacyRnnLMDecodable (H& hash, int i, int h, int o, int d, int m) 48 | : h (hash), isize (i), hsize (h), osize (o), order (d), max_order (m) { } 49 | 50 | double ComputeNet (const T& p, T* t) { 51 | vector olayer; 52 | olayer.resize (osize, 0.0); 53 | 54 | for (int j = 0; j < hsize; j++) 55 | for (int i = 0; i < hsize; i++) 56 | t->hlayer [j] += p.hlayer [i] * syn0 [i + h.vocab_.size () + j * isize]; 57 | 58 | for (int i = 0; i < hsize; i++) 59 | if (p.word != -1) 60 | t->hlayer [i] += syn0 [p.word + i * (hsize + h.vocab_.size ())]; 61 | 62 | for (int i = 0; i < hsize; i++) { 63 | if (t->hlayer [i] > 50) 64 | t->hlayer [i] = 50; 65 | if (t->hlayer [i] < -50) 66 | t->hlayer [i] = -50; 67 | t->hlayer [i] = 1 / (1 + FAST_EXP (-t->hlayer [i])); 68 | } 69 | 70 | for (int j = h.vocab_.size (); j < osize; j++) 71 | for (int i = 0; i < hsize; i++) 72 | olayer [j] += t->hlayer [i] * syn1 [i + j * hsize]; 73 | 74 | // Begin class direct connection activations 75 | if (synd.size () > 0) { 76 | // Feature hash begin 77 | vector hash; 78 | hash.resize (max_order, 0); 79 | 80 | for (int i = 0; i < order; i++) { 81 | if (i > 0) 82 | if (t->history [i - 1] == -1) 83 | break; 84 | hash [i] = h.primes_[0] * h.primes_[1]; 85 | for (int j = 1; j <= i; j++) 86 | hash [i] += 87 | h.primes_[(i * h.primes_[j] + j) % h.primes_.size ()] 88 | * static_cast(t->history [j - 1] + 1); 89 | 90 | hash [i] = hash [i] % (synd.size () / 2); 91 | } 92 | // Feature hash end 93 | for (int i = h.vocab_.size (); i < osize; i++) { 94 | for (int j = 0; j < order; j++) { 
95 | if (hash [j]) { 96 | olayer [i] += synd [hash [j]]; 97 | hash [j]++; 98 | } else { 99 | break; 100 | } 101 | } 102 | } 103 | } 104 | // End class direct connection activations 105 | 106 | double sum = 0; 107 | // Softmax on classes 108 | for (int i = h.vocab_.size (); i < osize; i++) { 109 | if (olayer [i] > 50) 110 | olayer [i] = 50; 111 | if (olayer [i] < -50) 112 | olayer [i] = -50; 113 | double val = FAST_EXP (olayer [i]); 114 | sum += val; 115 | olayer [i] = val; 116 | } 117 | for (int i = h.vocab_.size (); i < osize; i++) 118 | olayer [i] /= sum; 119 | 120 | // 1->2 word activations 121 | if (t->word != -1) { 122 | int begin = h.class_sizes_[h.vocab_[t->word].class_index].begin; 123 | int end = h.class_sizes_[h.vocab_[t->word].class_index].end; 124 | for (int j = begin; j <= end; j++) 125 | for (int i = 0; i < hsize; i++) 126 | olayer [j] += t->hlayer [i] * syn1 [i + j * hsize]; 127 | 128 | // Begin word direct connection activations 129 | if (synd.size () > 0) { 130 | // Begin feature hashing 131 | uint64 hash [max_order]; 132 | for (int i = 0; i < order; i++) 133 | hash [i] = 0; 134 | 135 | for (int i = 0; i < order; i++) { 136 | if (i > 0) 137 | if (t->history [i - 1] == -1) 138 | break; 139 | 140 | hash [i] = h.primes_[0] * h.primes_[1] 141 | * static_cast (h.vocab_[t->word].class_index + 1); 142 | 143 | for (int j = 1; j <= i; j++) 144 | hash [i] += h.primes_[(i * h.primes_[j] + j) % h.primes_.size ()] 145 | * static_cast (t->history [j - 1] + 1); 146 | 147 | hash [i] = (hash [i] % (synd.size () / 2)) + (synd.size () / 2); 148 | } 149 | // End feature hashing 150 | 151 | for (int i = begin; i <= end; i++) { 152 | for (int j = 0; j < order; j++) { 153 | if (hash [j]) { 154 | olayer [i] += synd [hash [j]]; 155 | hash [j]++; 156 | hash [j] = hash [j] % synd.size (); 157 | } else { 158 | break; 159 | } 160 | } 161 | } 162 | } 163 | // End word direct connection activations 164 | 165 | sum = 0.0; 166 | for (int i = begin; i <= end; i++) { 167 | if 
(olayer [i] > 50) 168 | olayer [i] = 50; 169 | if (olayer [i] < -50) 170 | olayer [i] = -50; 171 | olayer [i] = FAST_EXP (olayer [i]); 172 | sum += olayer [i]; 173 | } 174 | for (int i = begin; i <= end; i++) 175 | olayer [i] /= sum; 176 | } 177 | 178 | return olayer [t->word] 179 | * olayer [h.vocab_.size () + h.vocab_[t->word].class_index]; 180 | } 181 | 182 | // We need the synapses and the vocabulary hash 183 | H& h; 184 | int isize; 185 | int hsize; 186 | int osize; 187 | int order; 188 | int max_order; 189 | vector syn0; 190 | vector syn1; 191 | vector synd; 192 | }; 193 | #endif // SRC_INCLUDE_LEGACYRNNLMDECODABLE_H_ 194 | -------------------------------------------------------------------------------- /src/include/LegacyRnnLMHash.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_INCLUDE_LEGACYRNNLMHASH_H_ 2 | #define SRC_INCLUDE_LEGACYRNNLMHASH_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | typedef double real; 13 | 14 | struct VocabWord { 15 | public: 16 | VocabWord () {} 17 | explicit VocabWord (std::string word_) : cn (1), word (word_) {} 18 | VocabWord (std::string word_, int cn_) : cn (cn_), word (word_) {} 19 | int cn; // Unigram count 20 | std::string word; 21 | real prob; 22 | int class_index; 23 | }; 24 | 25 | struct ClassIndex { 26 | public: 27 | ClassIndex () : begin(0), end(0) {} 28 | int begin; 29 | int end; 30 | }; 31 | 32 | class LegacyRnnLMHash { 33 | public: 34 | explicit LegacyRnnLMHash (int class_size) 35 | : class_size_ (class_size), g_delim_("|"), gp_delim_("}") { 36 | vocab_hash_.resize (100000000, -1); // -1 marks an empty slot, as tested by FindWord/GetWordId 37 | } 38 | 39 | LegacyRnnLMHash (int class_size, const string g_delim, const string gp_delim) 40 | : class_size_ (class_size), g_delim_(g_delim.c_str ()), 41 | gp_delim_(gp_delim.c_str ()) { 42 | vocab_hash_.resize (100000000, -1); // -1 marks an empty slot, as tested by FindWord/GetWordId 43 | } 44 | 45 | static const std::vector primes_; 46 | 47 | void Split (const std::string& s, char delim, 48 | 
std::vector& elems) { 49 | std::stringstream ss (s); 50 | std::string item; 51 | while (getline (ss, item, delim)) 52 | elems.push_back (item); 53 | } 54 | 55 | template 56 | int HashInput (I start, I end) { 57 | size_t hash = 0; 58 | for (I it = start; it != end; ++it) 59 | hash = hash * 237 + isyms.Find (*it); 60 | 61 | return hash; 62 | } 63 | 64 | void MapToken (string& token) { 65 | std::vector gp; 66 | std::vector graphs; 67 | // std::vector phones; 68 | 69 | Split (token, *gp_delim_, gp); 70 | Split (gp [0], *g_delim_, graphs); 71 | // Split (gp [1], *g_delim, phones); 72 | 73 | size_t hash = 0; 74 | for (int i = 0; i < graphs.size (); i++) 75 | hash = hash * 237 + isyms.AddSymbol (graphs [i]); 76 | 77 | if (imap.find (hash) == imap.end ()) 78 | imap [hash] = std::vector {FindWord (token)}; 79 | else 80 | imap [hash].push_back (FindWord (token)); 81 | 82 | /* 83 | if (omap.find (FindWord (token)) == omap.end ()) 84 | omap [FindWord (token)] = phones; 85 | */ 86 | } 87 | 88 | int HashWord (std::string& word) const { 89 | size_t hash = 0; 90 | for (size_t i = 0; i < word.size (); i++) 91 | hash = hash * 237 + word[i]; 92 | hash = hash % vocab_hash_.size (); 93 | return hash; 94 | } 95 | 96 | int FindWord (std::string& word) { 97 | size_t hash = HashWord (word); 98 | 99 | if (vocab_hash_[hash] == -1) 100 | return -1; 101 | 102 | if (word.compare (vocab_[vocab_hash_[hash]].word) == 0) 103 | return vocab_hash_[hash]; 104 | 105 | for (size_t i = 0; i < vocab_.size (); i++) { 106 | if (word.compare (vocab_[i].word) == 0) { 107 | vocab_hash_[hash] = i; 108 | return i; 109 | } 110 | } 111 | return -1; 112 | } 113 | 114 | int GetWordId (std::string& word) const { 115 | size_t hash = HashWord (word); 116 | if (vocab_hash_[hash] == -1) 117 | return -1; 118 | return vocab_hash_[hash]; 119 | } 120 | 121 | int AddWordToVocab (std::string& word, int cn = 1) { 122 | vocab_.push_back (VocabWord (word, cn)); 123 | size_t hash = HashWord (word); 124 | vocab_hash_[hash] = 
vocab_.size () - 1; 125 | return vocab_.size () - 1; 126 | } 127 | 128 | void SortVocab () { 129 | // Just sorts based on Class 130 | for (int i = 1; i < vocab_.size (); i++) { 131 | int max = i; 132 | for (int j = i + 1; j < vocab_.size (); j++) 133 | if (vocab_[max].cn < vocab_[j].cn) 134 | max = j; 135 | VocabWord swap = vocab_[max]; 136 | vocab_[max] = vocab_[i]; 137 | vocab_[i] = swap; 138 | } 139 | } 140 | 141 | void SetClasses () { 142 | double df = 0; 143 | double dd = 0; 144 | int a = 0; 145 | int b = 0; 146 | 147 | for (int i = 0; i < vocab_.size (); i++) 148 | b += vocab_[i].cn; 149 | for (int i = 0; i < vocab_.size (); i++) 150 | dd += sqrt (vocab_[i].cn / static_cast (b)); 151 | for (int i = 0; i < vocab_.size (); i++) { 152 | df += sqrt (vocab_[i].cn / static_cast (b)) / dd; 153 | if (df > 1) 154 | df = 1; 155 | if (df > (a + 1) / static_cast (class_size_)) { 156 | vocab_[i].class_index = a; 157 | if (a < class_size_ - 1) 158 | a++; 159 | } else { 160 | vocab_[i].class_index = a; 161 | } 162 | } 163 | 164 | class_sizes_.resize (class_size_); 165 | int c = 0; 166 | for (int i = 0; i < vocab_.size (); i++) { 167 | if (i == 0) { 168 | class_sizes_[c].begin = i; 169 | } 170 | 171 | if (i + 1 == vocab_.size ()) { 172 | class_sizes_[c].end = i; 173 | } else if (vocab_[i].class_index < vocab_[i + 1].class_index) { 174 | class_sizes_[c].end = i; 175 | c++; 176 | class_sizes_[c].begin = i + 1; 177 | } 178 | } 179 | } 180 | 181 | std::vector vocab_hash_; 182 | std::vector vocab_; 183 | std::vector class_sizes_; 184 | std::unordered_map > imap; 185 | // std::unordered_map > omap; 186 | fst::SymbolTable isyms; 187 | int class_size_; 188 | const char* g_delim_; 189 | const char* gp_delim_; 190 | }; 191 | 192 | const std::vector LegacyRnnLMHash::primes_ = { 193 | 108641969, 116049371, 125925907, 133333309, 194 | 145678979, 175308587, 197530793, 234567803, 195 | 251851741, 264197411, 330864029, 399999781, 196 | 407407183, 459258997, 479012069, 545678687, 197 | 
560493491, 607407037, 629629243, 656789717, 198 | 716048933, 718518067, 725925469, 733332871, 199 | 753085943, 755555077, 782715551, 790122953, 200 | 812345159, 814814293, 893826581, 923456189, 201 | 940740127, 953085797, 985184539, 990122807 202 | }; 203 | 204 | // const char* LegacyRnnLMHash::g_delim = "|"; 205 | // const char* LegacyRnnLMHash::gp_delim = "}"; 206 | 207 | #endif // SRC_INCLUDE_LEGACYRNNLMHASH_H_ 208 | -------------------------------------------------------------------------------- /src/include/LegacyRnnLMReader.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_INCLUDE_LEGACYRNNLMREADER_H_ 2 | #define SRC_INCLUDE_LEGACYRNNLMREADER_H_ 3 | #include 4 | #include "./rnnlmlib.h" 5 | using std::string; 6 | 7 | template 8 | class LegacyRnnLMReader { 9 | public: 10 | typedef D Decodable; 11 | typedef H Hasher; 12 | 13 | explicit LegacyRnnLMReader (const string& rnnlm_file) { 14 | srand (1); 15 | // We don't actually need or use any of this 16 | rnnlm_.setLambda (0.75); 17 | rnnlm_.setRegularization (0.0000001); 18 | rnnlm_.setDynamic (false); 19 | rnnlm_.setRnnLMFile (const_cast (rnnlm_file.c_str ())); 20 | rnnlm_.setRandSeed (1); 21 | rnnlm_.useLMProb (false); 22 | rnnlm_.setDebugMode (1); 23 | // This will actually load the thing 24 | rnnlm_.restoreNet (); 25 | } 26 | 27 | Decodable CopyLegacyRnnLM (Hasher& h, int max_order = 5) { 28 | // Copy static data that can be shared by all tokens 29 | Decodable d (h, rnnlm_.layer0_size, rnnlm_.layer1_size, 30 | rnnlm_.layer2_size, rnnlm_.direct_order, 31 | max_order); 32 | for (int i = 0; i < rnnlm_.layer0_size * rnnlm_.layer1_size; i++) 33 | d.syn0.push_back (static_cast (rnnlm_.syn0 [i].weight)); 34 | 35 | for (int i = 0; i < rnnlm_.layer1_size * rnnlm_.layer2_size; i++) 36 | d.syn1.push_back (static_cast (rnnlm_.syn1 [i].weight)); 37 | 38 | for (int i = 0; i < rnnlm_.direct_size; i++) 39 | d.synd.push_back (static_cast (rnnlm_.syn_d [i])); 40 | 41 | return d; 
42 | } 43 | 44 | Hasher CopyVocabHash (const string g_delim, const string gp_delim) { 45 | Hasher h (rnnlm_.class_size, g_delim, gp_delim); 46 | for (int i = 0; i < rnnlm_.vocab_size; i++) { 47 | string word = rnnlm_.vocab [i].word; 48 | h.AddWordToVocab (word, rnnlm_.vocab [i].cn); 49 | } 50 | h.SortVocab (); 51 | h.SetClasses (); 52 | for (int i = 0; i < h.vocab_.size (); i++) 53 | h.MapToken (h.vocab_[i].word); 54 | 55 | return h; 56 | } 57 | 58 | Hasher CopyVocabHash () { 59 | Hasher h (rnnlm_.class_size); 60 | for (int i = 0; i < rnnlm_.vocab_size; i++) { 61 | string word = rnnlm_.vocab [i].word; 62 | h.AddWordToVocab (word, rnnlm_.vocab [i].cn); 63 | } 64 | h.SortVocab (); 65 | h.SetClasses (); 66 | for (int i = 0; i < h.vocab_.size (); i++) 67 | h.MapToken (h.vocab_[i].word); 68 | 69 | return h; 70 | } 71 | 72 | private: 73 | CRnnLM rnnlm_; // The actual model 74 | }; 75 | #endif // SRC_INCLUDE_LEGACYRNNLMREADER_H_ 76 | -------------------------------------------------------------------------------- /src/include/M2MFstAligner.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_INCLUDE_M2MFSTALIGNER_H_ 2 | #define SRC_INCLUDE_M2MFSTALIGNER_H_ 3 | /* 4 | M2MFstAligner.hpp 5 | 6 | Copyright (c) [2012-], Josef Robert Novak 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted #provided that the following conditions 11 | are met: 12 | 13 | * Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | * Redistributions in binary form must reproduce the above 16 | copyright notice, this list of #conditions and the following 17 | disclaimer in the documentation and/or other materials provided 18 | with the distribution. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 23 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 24 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 25 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 29 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 31 | OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include "./util.h" 40 | using namespace std; 41 | 42 | 43 | namespace fst{ 44 | class M2MFstAligner { 45 | /* 46 | Read in pairs of sequences of the form SEQ1 and SEQ2 and 47 | transform them into an FST that encodes all possible 48 | alignments between the symbols in the two sequences. 49 | Note that this may include a combination of multi-symbol 50 | subsequences depending on user specifications. 51 | 52 | This is achieved by simply generating the entire alignment 53 | graph during a single nested loop through the two input 54 | sequences that are to be aligned. 55 | 56 | The user may optionally specify whether to allow deletions 57 | for SEQ1 or SEQ2, as well as a maximum subsequence length 58 | for each sequence. 59 | 60 | This class does not implement any lattice pruning or printing 61 | methods. A combination of the LatticePruner and FstPathFinder 62 | classes may be used to achieve this a-la phonetisaurus-align.cpp. 
63 | */ 64 | public: 65 | // Basics declarations 66 | bool seq1_del; 67 | bool seq2_del; 68 | unsigned int seq1_max; 69 | unsigned int seq2_max; 70 | string seq1_sep; 71 | string seq2_sep; 72 | string s1s2_sep; 73 | string eps; 74 | string skip; 75 | bool penalize; 76 | bool penalize_em; 77 | bool restrict; 78 | bool grow; 79 | 80 | // vector alpha, beta; 81 | // This will be used during decoding to clean the paths 82 | set skipSeqs; 83 | // OpenFst stuff 84 | // These will be overwritten after each FST construction 85 | vector > fsas; 86 | 87 | // This will be maintained for the life of object 88 | // These symbol tables will be maintained entire life of 89 | // the object. This will ensure that any resulting 'corpus' 90 | // shares the same symbol tables. 91 | SymbolTable *isyms; 92 | map alignment_model; 93 | map prev_alignment_model; 94 | LabelData penalties; 95 | LogWeight total; 96 | LogWeight prevTotal; 97 | 98 | // Constructors 99 | M2MFstAligner (); 100 | // Train from scratch using a dictionary 101 | M2MFstAligner (bool seq1_del, bool seq2_del, unsigned int seq1_max, 102 | unsigned int seq2_max, 103 | string seq1_sep, string seq2_sep, string s1s2_sep, 104 | string eps, string _skip, bool _penalize, 105 | bool penalize_em, bool restrict, bool grow); 106 | // We've already got a model to go on 107 | M2MFstAligner (string model_file, bool penalize, bool penalize_em, 108 | bool restrict); 109 | 110 | // Write an aligner model to disk. Critical info is stored in the 111 | // the symbol table so that it can be restored when the model is loaded. 
112 | void write_model (string model_name); 113 | 114 | // Transform a sequence pair into an equivalent multiple-to-multiple FST, 115 | // encoding all possible alignments between the two sequences 116 | void Sequences2FST (VectorFst* fst, vector* seq1, 117 | vector* seq2); 118 | void Sequences2FST (VectorFst* fst, int s1m, int s2m, 119 | vector* seq1, vector* seq2); 120 | void Sequences2FSTNoInit (VectorFst* fst, vector* seq1, 121 | vector* seq2); 122 | 123 | // Initialize all of the training data 124 | void entry2alignfst (vector seq1, vector seq2); 125 | void entry2alignfstnoinit (vector seq1, vector seq2, 126 | int nbest, string lattice = ""); 127 | void _conditional_max (bool x_given_y); 128 | // The expectation routines 129 | void expectation (); 130 | 131 | // The maximization routine. Returns the change since the last iteration 132 | float maximization (bool lastiter); 133 | 134 | // Precompute the label and subsequence lengths for all possible alignment 135 | // units this helps speedup the penalization and decoding routines. 136 | void _compute_penalties (LogArc::Label label, int lhs, int rhs, 137 | bool lhsE, bool rhsE); 138 | }; 139 | } // namespace fst 140 | #endif // SRC_INCLUDE_M2MFSTALIGNER_H_ 141 | -------------------------------------------------------------------------------- /src/include/PhonetisaurusScript.h: -------------------------------------------------------------------------------- 1 | /* 2 | PhonetisaurusPy.h 3 | 4 | Copyright (c) [2012-], Josef Robert Novak 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted #provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 
13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of #conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 27 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 29 | OF THE POSSIBILITY OF SUCH DAMAGE. 30 | */ 31 | // \file 32 | // This implements the scripting interface for the FST-based 33 | // decoder. The associated classes are suitable for 34 | // construction of command-line utilities and bindings for 35 | // scripting languages such as Python. 36 | // 37 | #ifndef SRC_INCLUDE_PHONETISAURUSSCRIPT_H_ 38 | #define SRC_INCLUDE_PHONETISAURUSSCRIPT_H_ 39 | #include "PhonetisaurusRex.h" 40 | #include 41 | #include 42 | #include 43 | #include 44 | /*! \struct PathData 45 | \brief Response data. 46 | 47 | The PathData structure is used to encapsulate 48 | a single FST G2P result. 
49 | */ 50 | struct PathData { 51 | PathData () {} 52 | PathData (float PathWeight_, const vector& PathWeights_, 53 | const vector& ILabels_, const vector& OLabels_, 54 | const vector& Uniques_) 55 | : PathWeight (PathWeight_), PathWeights (PathWeights_), 56 | ILabels (ILabels_), OLabels (OLabels_), Uniques(Uniques_) {} 57 | 58 | float PathWeight; 59 | vector PathWeights; 60 | vector ILabels; 61 | vector OLabels; 62 | // Contains only 'interesting' phone labels 63 | vector Uniques; 64 | }; 65 | 66 | /*! \class PhonetisaurusScript 67 | \brief A wrapper class encapsulating the FST G2P decoder. 68 | 69 | A wrapper class for the FST G2P decoder. Suitable for 70 | incorporation into commandline binaries and bindings 71 | for various scripting languages. 72 | */ 73 | class PhonetisaurusScript { 74 | private: 75 | void normalizeModel() { 76 | ArcSort (&model_, ILabelCompare ()); 77 | isyms_ = model_.InputSymbols (); 78 | osyms_ = model_.OutputSymbols (); 79 | imax_ = LoadClusters (isyms_, &imap_, &invimap_); 80 | omax_ = LoadClusters (osyms_, &omap_, &invomap_); 81 | veto_set_.insert (0); 82 | veto_set_.insert (1); 83 | veto_set_.insert (2); 84 | } 85 | public: 86 | explicit PhonetisaurusScript (const VectorFst model, string delim="") : delim_(delim) { 87 | model_ = model; 88 | normalizeModel(); 89 | } 90 | 91 | explicit PhonetisaurusScript(string model, string delim="") : delim_(delim) { 92 | struct stat buffer; 93 | if (!(stat (model.c_str(), &buffer) == 0)) 94 | throw std::exception(); 95 | 96 | // this is solving the memory leak problem 97 | VectorFst* model_temp{nullptr}; 98 | model_temp = (VectorFst::Read(model)); 99 | if(!model_temp) { throw std::exception(); } 100 | model_ = *model_temp; 101 | delete model_temp; 102 | 103 | normalizeModel(); 104 | } 105 | 106 | // The actual phoneticizer routine 107 | vector Phoneticize (const string& word, int nbest = 1, 108 | int beam = 10000, float threshold = 99, 109 | bool write_fsts = false, 110 | bool accumulate = false, 
111 | double pmass = 99.0) { 112 | VectorFst* fst = new VectorFst (); 113 | vector entry = tokenize2ints ( 114 | const_cast (&word), 115 | &delim_, isyms_ 116 | ); 117 | Entry2FSA (entry, fst, imax_, invimap_); 118 | 119 | fst->SetInputSymbols (isyms_); 120 | fst->SetOutputSymbols (isyms_); 121 | 122 | // Useful for debugging; print the input word machine 123 | if (write_fsts) 124 | fst->Write (word + ".fst"); 125 | 126 | VectorFst ofst; 127 | 128 | StdArc::Weight weight_threshold = threshold; 129 | StdArc::StateId state_threshold = kNoStateId; 130 | AnyArcFilter arc_filter; 131 | vector distance; 132 | 133 | VectorFst* ifst = new VectorFst(); 134 | Compose(*fst, model_, ifst); 135 | 136 | // Useful for debugging; print the g2p lattice 137 | if (write_fsts) 138 | ifst->Write (word+".lat.fst"); 139 | 140 | AutoQueue state_queue (*ifst, &distance, arc_filter); 141 | 142 | M2MPathFilter path_filter (omap_, veto_set_); 143 | 144 | ShortestPathOptions, 145 | AnyArcFilter > 146 | opts (&state_queue, arc_filter, nbest, false, false, 147 | kDelta, false, weight_threshold, 148 | state_threshold); 149 | 150 | ShortestPathSpecialized (*ifst, &ofst, &distance, 151 | &path_filter, beam, opts, accumulate); 152 | 153 | vector paths; 154 | float total = 99.0; 155 | if (pmass < 99.0) { 156 | for (size_t i = 0; i < path_filter.ordered_paths.size(); i++) { 157 | const vector& u = path_filter.ordered_paths [i]; 158 | const Path& orig = path_filter.path_map [u]; 159 | total = Plus (LogWeight (total), LogWeight (orig.PathWeight)).Value (); 160 | } 161 | } 162 | 163 | LogWeight nbest_pmass = 99.0; 164 | for (size_t i = 0; i < path_filter.ordered_paths.size(); i++) { 165 | const vector& u = path_filter.ordered_paths [i]; 166 | const Path& orig = path_filter.path_map [u]; 167 | float pweight = orig.PathWeight; 168 | if (pmass < 99.0) { 169 | pweight = pweight - total; 170 | nbest_pmass = Plus ( 171 | LogWeight (nbest_pmass), 172 | LogWeight (pweight) 173 | ).Value (); 174 | } 175 | 176 | 
PathData path = PathData ( 177 | pweight, orig.PathWeights, 178 | orig.ILabels, orig.OLabels, orig.unique_olabels 179 | ); 180 | paths.push_back (path); 181 | 182 | // We are greedy with this, in order to ensure that if pmass =~ -log (.8), 183 | // and we have h1 = -log (.5), and h2 = -log (.4) that we get both. 184 | if (pmass < 99.0 && nbest_pmass.Value () < pmass) 185 | break; 186 | } 187 | 188 | // Make sure that we clean up 189 | delete fst; 190 | delete ifst; 191 | return paths; 192 | } 193 | 194 | // Helper functions for the bindings 195 | string FindIsym (int symbol_id) { 196 | return isyms_->Find (symbol_id); 197 | } 198 | 199 | int FindIsym (const string& symbol) { 200 | return isyms_->Find (symbol); 201 | } 202 | 203 | string FindOsym (int symbol_id) { 204 | return osyms_->Find (symbol_id); 205 | } 206 | 207 | int FindOsym (const string& symbol) { 208 | return osyms_->Find (symbol); 209 | } 210 | 211 | const SymbolTable* isyms_; 212 | const SymbolTable* osyms_; 213 | 214 | private: 215 | VectorFst model_; 216 | SymbolMap12M imap_, omap_; 217 | SymbolMapM21 invimap_, invomap_; 218 | int imax_; 219 | int omax_; 220 | VetoSet veto_set_; 221 | string delim_; 222 | }; 223 | #endif // SRC_INCLUDE_PHONETISAURUSSCRIPT_H_ 224 | -------------------------------------------------------------------------------- /src/include/RnnLMDecoder.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_INCLUDE_RNNLMDECODER_H_ 2 | #define SRC_INCLUDE_RNNLMDECODER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using fst::VectorFst; 13 | using fst::ArcIterator; 14 | using fst::StateIterator; 15 | using fst::StdArc; 16 | using fst::Heap; 17 | using std::vector; 18 | using std::unordered_set; 19 | 20 | 21 | class Token { 22 | public: 23 | Token (int hsize, int max_order) 24 | : word (0), weight (0.0), total (0.0), 25 | g (0.0), prev (NULL), state (0), key (-1) { 26 | 
hlayer.resize (hsize, 1.0); 27 | history.resize (max_order, 0); 28 | 29 | HashHistory (); 30 | } 31 | 32 | Token (Token* tok, int w, int s) 33 | : word (w), weight (0.0), total (0.0), 34 | g (0.0), prev (tok), state (s), key (-1) { 35 | // Copy an existing token and update the 36 | // various layers as needed 37 | hlayer.resize (tok->hlayer.size(), 0.0); 38 | history.resize (tok->history.size (), 0); 39 | 40 | // Would it be more efficient to perform the hash 41 | // by iterating back throug the parent tokens? 42 | for (int i = tok->history.size () - 1; i > 0; i--) 43 | history [i] = tok->history [i - 1]; 44 | history [0] = tok->word; 45 | 46 | HashHistory (); 47 | } 48 | 49 | void HashHistory () { 50 | hhash = state * 7853; 51 | for (int i = 0; i < history.size (); i++) 52 | hhash = hhash * 7877 + history [i]; 53 | } 54 | 55 | int word; 56 | mutable double weight; 57 | mutable double total; 58 | mutable double g; 59 | mutable Token* prev; 60 | int state; 61 | mutable int key; 62 | mutable vector hlayer; 63 | mutable vector history; 64 | size_t hhash; 65 | }; 66 | 67 | class TokenCompare { 68 | public: 69 | bool operator () (const Token& t1, const Token& t2) const { 70 | return (t1.state == t2.state && 71 | t1.word == t2.word && 72 | t1.hhash == t2.hhash); 73 | /* 74 | return (t1.state == t2.state && 75 | t1.word == t2.word); 76 | */ 77 | } 78 | }; 79 | 80 | class TokenHash { 81 | public: 82 | size_t operator () (const Token& t) const { 83 | return t.state * kPrime0 + t.word * kPrime1 + t.hhash * kPrime2; 84 | // return t.state * kPrime0 + t.word * kPrime1; 85 | } 86 | private: 87 | static const size_t kPrime0; 88 | static const size_t kPrime1; 89 | static const size_t kPrime2; 90 | }; 91 | const size_t TokenHash::kPrime0 = 7853; 92 | const size_t TokenHash::kPrime1 = 7867; 93 | const size_t TokenHash::kPrime2 = 7873; 94 | 95 | 96 | class TokenPointerCompare { 97 | public: 98 | bool operator () (const Token* t1, const Token* t2) const { 99 | return (t1->g < t2->g); 
100 | } 101 | }; 102 | 103 | class Chunk { 104 | public: 105 | Chunk (int word, double cost, double total) 106 | : w (word), c (cost), t (total) { } 107 | int w; 108 | double c; 109 | double t; 110 | template 111 | vector Tokenize (char gpdelim, char gdelim, H& h, 112 | bool graphemes = false) const { 113 | vector gp_elems; 114 | Split (h.vocab_[w].word, gpdelim, gp_elems); 115 | vector elems; 116 | if (graphemes == true) 117 | Split (gp_elems [0], gdelim, elems); 118 | else if (gp_elems.size () == 2) 119 | Split (gp_elems [1], gdelim, elems); 120 | return elems; 121 | } 122 | }; 123 | 124 | class SimpleResult { 125 | public: 126 | SimpleResult (string word, vector scores, 127 | vector pronunciations) 128 | : word (word), scores (scores), pronunciations (pronunciations) { } 129 | 130 | SimpleResult () { } 131 | 132 | string word; 133 | vector scores; 134 | vector pronunciations; 135 | }; 136 | 137 | /* Standalone function for convenience */ 138 | template 139 | VectorFst WordToRnnLMFst (const vector& word, H& h) { 140 | VectorFst fst; 141 | fst.AddState (); 142 | fst.SetStart (0); 143 | for (int i = 0; i < word.size (); i++) { 144 | int hash = h.HashInput (word.begin () + i, 145 | word.begin () + i + 1); 146 | fst.AddState (); 147 | fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One(), i + 1)); 148 | } 149 | 150 | for (int i = 0; i < word.size (); i++) { 151 | for (int j = 2; j <= 3; j++) { 152 | if (i + j <= word.size ()) { 153 | int hash = h.HashInput (word.begin () + i, word.begin () + i + j); 154 | if (h.imap.find (hash) != h.imap.end ()) 155 | fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One (), i + j)); 156 | } 157 | } 158 | } 159 | fst.SetFinal (word.size (), StdArc::Weight::One ()); 160 | 161 | return fst; 162 | } 163 | 164 | template 165 | class RnnLMDecoder { 166 | public: 167 | typedef D Decodable; 168 | typedef vector > RawResults; 169 | typedef Heap Queue; 170 | typedef unordered_set TokenSet; 171 | 172 | explicit RnnLMDecoder (Decodable& 
decodable) 173 | : d (decodable) { } 174 | 175 | double Heuristic (int nstate, int nstates, double hcost) { 176 | int factor = nstates - nstate - 1; 177 | if (factor > 0) 178 | return factor * hcost; 179 | return 0.0; 180 | } 181 | 182 | VectorFst WordToRnnLMFst (const vector& word) { 183 | VectorFst fst; 184 | fst.AddState (); 185 | fst.SetStart (0); 186 | for (int i = 0; i < word.size (); i++) { 187 | int hash = d.h.HashInput (word.begin () + i, 188 | word.begin () + i + 1); 189 | fst.AddState (); 190 | fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One(), i + 1)); 191 | } 192 | 193 | for (int i = 0; i < word.size (); i++) { 194 | for (int j = 2; j <= 3; j++) { 195 | if (i + j <= word.size ()) { 196 | int hash = d.h.HashInput (word.begin () + i, word.begin () + i + j); 197 | if (d.h.imap.find (hash) != d.h.imap.end ()) 198 | fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One (), i + j)); 199 | } 200 | } 201 | } 202 | fst.SetFinal (word.size (), StdArc::Weight::One ()); 203 | 204 | return fst; 205 | } 206 | 207 | SimpleResult Decode (const vector& word, int beam, int kMax, 208 | int nbest, double thresh, const string& gpdelim, 209 | const string& gdelim, const string& skip) { 210 | RawResults raw_results = DecodeRaw (word, beam, kMax, nbest, thresh); 211 | SimpleResult simple_result; 212 | stringstream word_ss; 213 | for (int i = 0; i < word.size (); i++) 214 | if (i != word.size () - 1) 215 | word_ss << word [i]; 216 | simple_result.word = word_ss.str (); 217 | 218 | for (int i = 0; i < raw_results.size (); i++) { 219 | const vector& result = raw_results [i]; 220 | stringstream pronunciation_ss; 221 | for (vector::const_iterator it = result.begin (); 222 | it != result.end (); ++it) { 223 | vector chunk_vec = \ 224 | it->Tokenize (static_cast(*gpdelim.c_str ()), 225 | static_cast(*gdelim.c_str ()), 226 | d.h); 227 | for (int j = 0; j < chunk_vec.size (); j++) { 228 | if (chunk_vec [j].compare (skip) != 0) 229 | pronunciation_ss << chunk_vec [j]; 230 | 
else 231 | continue; 232 | 233 | if (!(it == result.end () && j != chunk_vec.size () - 1)) 234 | pronunciation_ss << " "; 235 | } 236 | if (it+1 == result.end ()) 237 | simple_result.scores.push_back (it->t); 238 | } 239 | simple_result.pronunciations.push_back (pronunciation_ss.str ()); 240 | } 241 | 242 | return simple_result; 243 | } 244 | 245 | RawResults DecodeRaw (const vector& word, int beam, int kMax, 246 | int nbest, double thresh = 0.0) { 247 | VectorFst fst = WordToRnnLMFst (word); 248 | for (int i = 0; i < sQueue.size (); i++) 249 | sQueue [i].Clear (); 250 | sQueue.resize (fst.NumStates () + 1); 251 | 252 | Initialize (); 253 | int n = 0; 254 | for (StateIterator > siter (fst); 255 | !siter.Done(); siter.Next ()) { 256 | int s = siter.Value (); 257 | int k = 0; 258 | while (!sQueue [s].Empty () && k < kMax && n < nbest) { 259 | Token* top = sQueue [s].Pop (); 260 | if (fst.Final (top->state) != StdArc::Weight::Zero ()) { 261 | // Token* a = (Token*)&(*top); 262 | Token* a = reinterpret_cast(top); 263 | if (n > 0 && thresh > 0.0) 264 | if (a->total - results [0][results [0].size () - 1].t > thresh) 265 | break; 266 | 267 | vector result; 268 | while (a->prev != NULL) { 269 | result.push_back (Chunk (a->word, a->weight, a->total)); 270 | a = reinterpret_cast (a->prev); 271 | } 272 | reverse (result.begin (), result.end ()); 273 | results.push_back (result); 274 | n++; 275 | continue; 276 | } 277 | 278 | for (ArcIterator > aiter (fst, top->state); 279 | !aiter.Done (); aiter.Next ()) { 280 | const StdArc& arc = aiter.Value (); 281 | const vector& map = d.h.imap [arc.ilabel]; 282 | 283 | for (int i = 0; i < map.size (); i++) { 284 | Token ntoken (reinterpret_cast(top), map [i], 285 | arc.nextstate); 286 | ntoken.weight = -log (d.ComputeNet ((*top), &ntoken)); 287 | if (ntoken.weight > beam) 288 | continue; 289 | 290 | ntoken.total += top->total + ntoken.weight; 291 | // Heuristic here if we use one (we don't) 292 | ntoken.g = ntoken.total; 293 | 294 | 
TokenSet::iterator niterator = pool.find (ntoken); 295 | 296 | if (niterator == pool.end ()) { 297 | pool.insert (ntoken); 298 | Token* npointer = (Token*)&(*pool.find (ntoken)); 299 | sQueue [arc.nextstate].Insert (npointer); 300 | } else { 301 | if (ntoken.g < niterator->g) { 302 | niterator->weight = ntoken.weight; 303 | niterator->total = ntoken.total; 304 | niterator->prev = ntoken.prev; 305 | niterator->history = ntoken.history; 306 | niterator->g = ntoken.g; 307 | niterator->hlayer = ntoken.hlayer; 308 | sQueue [arc.nextstate].Insert ((Token*)&(*niterator)); 309 | } 310 | } 311 | } 312 | } 313 | k++; 314 | } 315 | } 316 | return results; 317 | } 318 | 319 | RawResults results; 320 | 321 | 322 | private: 323 | void Initialize () { 324 | pool.clear (); 325 | results.clear (); 326 | 327 | Token start (d.hsize, d.max_order); 328 | pool.insert (start); 329 | TokenSet::iterator prev = pool.find (start); 330 | prev->key = sQueue [0].Insert (reinterpret_cast(&prev)); 331 | return; 332 | } 333 | 334 | Decodable& d; 335 | vector sQueue; 336 | TokenSet pool; 337 | }; 338 | #endif // SRC_INCLUDE_RNNLMDECODER_H_ 339 | -------------------------------------------------------------------------------- /src/include/RnnLMPy.h: -------------------------------------------------------------------------------- 1 | // RnnLMWrapper.h 2 | // 3 | // Copyright (c) [2013-], Yandex, LLC 4 | // Author: jorono@yandex-team.ru (Josef Robert Novak) 5 | // All rights reserved. 6 | /* 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted #provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 
13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 27 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 29 | OF THE POSSIBILITY OF SUCH DAMAGE. 30 | * 31 | */ 32 | /// \file 33 | /// Python bindings for RnnLM. These only correspond 34 | /// to basic evaluation functions, not training. By default 35 | /// the evaluation utilizes the -independent convention from 36 | /// the original rnnlm tool. This is all we are interested in 37 | /// for G2P evaluations. 
38 | #ifndef SRC_INCLUDE_RNNLMPY_H_ 39 | #define SRC_INCLUDE_RNNLMPY_H_ 40 | 41 | #include 42 | #include 43 | #include 44 | #include "./rnnlmlib.h" 45 | 46 | using namespace fst; 47 | 48 | typedef struct UttResult { 49 | UttResult () : sent_prob(0.0) {} 50 | double sent_prob; 51 | vector word_probs; 52 | vector words; 53 | } UttResult; 54 | 55 | class RnnLMPy { 56 | public: 57 | explicit RnnLMPy (string rnnlm_file) { 58 | srand (1); 59 | rnnlm_.setLambda (0.75); 60 | rnnlm_.setRegularization (0.0000001); 61 | rnnlm_.setDynamic (false); 62 | rnnlm_.setRnnLMFile (const_cast (rnnlm_file.c_str())); 63 | rnnlm_.setRandSeed (1); 64 | rnnlm_.useLMProb (false); 65 | rnnlm_.setDebugMode (1); 66 | rnnlm_.restoreNet (); 67 | } 68 | 69 | vector GetJointVocab (string& token) { 70 | return rnnlm_.SearchJointVocab (token); 71 | } 72 | 73 | string GetString (int id) { 74 | return rnnlm_.token_map[id]; 75 | } 76 | 77 | UttResult EvaluateSentence (vector words) { 78 | /* 79 | Note that the user is responsible for explicitly 80 | providing the sentence-end token in the words vector! 
81 | */ 82 | int a, word, last_word; 83 | UttResult result; 84 | string delim = "}"; 85 | 86 | last_word = 0; 87 | rnnlm_.copyHiddenLayerToInput (); 88 | if (rnnlm_.bptt > 0) { 89 | for (a = 0; a < rnnlm_.bptt + rnnlm_.bptt_block; a++) 90 | rnnlm_.bptt_history[a] = 0; 91 | } 92 | for (a = 0; a < MAX_NGRAM_ORDER; a++) 93 | rnnlm_.history[a] = 0; 94 | rnnlm_.netReset(); 95 | 96 | // Check the G2P tokens 97 | for (size_t i = 0; i < words.size(); i++) { 98 | word = rnnlm_.searchVocab (const_cast (words[i].c_str())); 99 | /* 100 | vector toks = tokenize_utf8_string (&words[i], &delim); 101 | cout << toks[0] << endl; 102 | vector& tokens = rnnlm_.SearchJointVocab (toks[0]); 103 | float tscore = -999; 104 | for (int j = 0; j < tokens.size(); j++) { 105 | cout << " " << tokens[j] << "\t" 106 | << rnnlm_.token_map[tokens[j]] << "\t"; 107 | rnnlm_.computeNet (last_word, tokens[j]); 108 | float tval = log10 (rnnlm_.neu2[rnnlm_.vocab[tokens[j]].class_index 109 | + rnnlm_.vocab_size].ac 110 | * rnnlm_.neu2[tokens[j]].ac); 111 | if (tval > tscore) { 112 | tscore = tval; 113 | word = tokens[j]; 114 | } 115 | cout << tval << endl; 116 | } 117 | ///////////////////// 118 | */ 119 | result.words.push_back (rnnlm_.token_map[word]); 120 | rnnlm_.computeNet (last_word, word); 121 | 122 | 123 | if (word != -1) { 124 | result.word_probs.push_back ( 125 | log10 (rnnlm_.neu2[rnnlm_.vocab[word].class_index 126 | + rnnlm_.vocab_size].ac 127 | * rnnlm_.neu2[word].ac)); 128 | result.sent_prob += result.word_probs.back (); 129 | } else { 130 | // cout << "-1\t0\tOOV" << endl; 131 | result.word_probs.push_back (0.0); 132 | } 133 | 134 | rnnlm_.copyHiddenLayerToInput (); 135 | if (last_word != -1) 136 | rnnlm_.neu0[last_word].ac = 0; 137 | 138 | last_word = word; 139 | for (a = MAX_NGRAM_ORDER - 1; a > 0; a--) 140 | rnnlm_.history[a] = rnnlm_.history[a-1]; 141 | rnnlm_.history[0] = last_word; 142 | } 143 | 144 | return result; 145 | } 146 | 147 | private: 148 | CRnnLM rnnlm_; // The actual rnnlm 
149 | }; 150 | 151 | #endif // SRC_INCLUDE_RNNLMPY_H_ 152 | -------------------------------------------------------------------------------- /src/include/util.h: -------------------------------------------------------------------------------- 1 | #ifndef SRC_INCLUDE_UTIL_H_ 2 | #define SRC_INCLUDE_UTIL_H_ 3 | /* 4 | Copyright (c) [2012-], Josef Robert Novak 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted #provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of #conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 27 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 29 | OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | * 31 | */ 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #ifdef __MACH__ 38 | #include 39 | #include 40 | #endif 41 | using namespace fst; 42 | 43 | typedef struct LabelDatum {int max, tot, lhs, rhs; bool lhsE, rhsE;} LabelDatum; 44 | typedef unordered_map LabelData; 45 | 46 | string vec2str (vector vec, string sep); 47 | 48 | string itoas (int i); 49 | 50 | vector tokenize_utf8_string (string* utf8_string, string* delimiter); 51 | 52 | vector tokenize_entry (string* testword, string* sep, 53 | SymbolTable* syms); 54 | 55 | vector tokenize2ints (string* word, string* sep, const SymbolTable* syms); 56 | 57 | timespec get_time( ); 58 | 59 | timespec diff (timespec start, timespec end); 60 | 61 | void PhonetisaurusSetFlags (const char* usage, int* argc, char*** argv, 62 | bool remove_flags); 63 | 64 | void LoadWordList (const std::string& filename, 65 | std::vector* corpus); 66 | 67 | void Split (const std::string& s, char delim, std::vector& elems); 68 | 69 | #endif // SRC_INCLUDE_UTIL_H_ 70 | -------------------------------------------------------------------------------- /src/lib/LatticePruner.cc: -------------------------------------------------------------------------------- 1 | /* 2 | LatticePruner.cpp 3 | 4 | Copyright (c) [2012-], Josef Robert Novak 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted #provided that the following conditions 9 | are met: 10 | 11 | * Redistributions of source code must retain the above copyright 12 | notice, this list of conditions and the following disclaimer. 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of #conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 21 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 22 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 27 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 29 | OF THE POSSIBILITY OF SUCH DAMAGE. 30 | * 31 | */ 32 | using namespace std; 33 | #include "include/LatticePruner.h" 34 | 35 | 36 | LatticePruner::LatticePruner( ){ 37 | //Default constructor 38 | } 39 | 40 | LatticePruner::LatticePruner( LabelData _penalties, TropicalWeight _beam, int _nbest, bool _fb, bool _penalize ) { 41 | penalties = _penalties; 42 | penalize = _penalize; 43 | beam = _beam; 44 | nbest = _nbest; 45 | fb = _fb; 46 | } 47 | 48 | LatticePruner::LatticePruner( TropicalWeight _beam, int _nbest, bool _fb ) { 49 | //TODO 50 | beam = _beam; 51 | nbest = _nbest; 52 | fb = _fb; 53 | penalize = false; 54 | } 55 | 56 | void LatticePruner::prune_fst( VectorFst* fst ){ 57 | /* 58 | Apply several optional pruning heuristics to the lattice. 59 | */ 60 | if( penalize==true ) 61 | _penalize_arcs( fst ); 62 | 63 | if( fb==true ) 64 | _forward_backward( fst ); 65 | 66 | if( nbest==1 ){ 67 | //If N=1 then all the remaining stuff is a waste of time. 68 | //This is because the pruning heuristics are all computed 69 | // *relative* to the 1-best hypothesis. 70 | //This is in contrast LMBR and arc penalization. 
71 | *fst = _nbest_prune( fst ); 72 | return; 73 | } 74 | 75 | 76 | if( beam.Value() != LogWeight::Zero() ) 77 | Prune( fst, beam ); 78 | 79 | if( nbest > 1 ) 80 | *fst = _nbest_prune( fst ); 81 | 82 | return; 83 | } 84 | 85 | VectorFst LatticePruner::_nbest_prune( VectorFst* fst ){ 86 | /* 87 | This is just a destructive wrapper for the OpenFst ShortestPath 88 | implementation. I wish they'd implement desctructive versions of 89 | all the algos in the library... 90 | */ 91 | VectorFst sfst; 92 | 93 | ShortestPath( *fst, &sfst, nbest ); 94 | 95 | return sfst; 96 | } 97 | 98 | void LatticePruner::_forward_backward( VectorFst* fst ){ 99 | /* 100 | OpenFst-based implementation of forward-backward lattice pruning based on, 101 | Sixtus and Ortmanns, "HIGH QUALITY WORD GRAPHS USING FORWARD-BACKWARD PRUNING", 1999 102 | 103 | Note-to-self: It seems to give consistent WER and PER improvements so I guess I 104 | got the implementation right, but it seems like maybe it was too easy. 105 | */ 106 | //Setup 107 | VectorFst* pfst = new VectorFst(); 108 | VectorFst* lfst = new VectorFst(); 109 | vector alpha, beta; 110 | 111 | Map(*fst, lfst, StdToLogMapper()); 112 | 113 | //Normalize so that subsequent operations don't go crazy 114 | Push(*lfst, pfst, kPushWeights); 115 | for( StateIterator > siter(*pfst); !siter.Done(); siter.Next() ){ 116 | size_t i = siter.Value(); 117 | if( pfst->Final(i)!=LogArc::Weight::Zero() ){ 118 | pfst->SetFinal(i,LogArc::Weight::One()); 119 | } 120 | } 121 | 122 | //Compute Forward and Backward probabilities 123 | ShortestDistance( *pfst, &alpha ); 124 | ShortestDistance( *pfst, &beta, true ); 125 | 126 | //Compute arc posteriors. This is the same as the Expectation step. 
127 | for( StateIterator > siter(*pfst); !siter.Done(); siter.Next() ){ 128 | LogArc::StateId q = siter.Value(); 129 | for( MutableArcIterator > aiter(pfst,q); !aiter.Done(); aiter.Next() ){ 130 | LogArc arc = aiter.Value(); 131 | LogWeight gamma = Divide(Times(Times(alpha[q], arc.weight), beta[arc.nextstate]), beta[0]); 132 | 133 | if( gamma.Value()==gamma.Value() ){ 134 | arc.weight = gamma; 135 | aiter.SetValue(arc); 136 | } 137 | } 138 | } 139 | 140 | Map(*pfst, fst, LogToStdMapper()); 141 | 142 | delete lfst; 143 | delete pfst; 144 | return; 145 | } 146 | 147 | void LatticePruner::_penalize_arcs( VectorFst* fst ){ 148 | 149 | for( StateIterator > siter(*fst); !siter.Done(); siter.Next() ){ 150 | StdArc::StateId q = siter.Value(); 151 | for( MutableArcIterator > aiter(fst,q); !aiter.Done(); aiter.Next() ){ 152 | StdArc arc = aiter.Value(); 153 | LabelDatum* ld = &penalties[arc.ilabel]; 154 | 155 | if( ld->lhs>1 && ld->rhs>1 ){ 156 | arc.weight = 999; 157 | }else{ 158 | arc.weight = arc.weight.Value() * ld->max; 159 | } 160 | if( arc.weight == LogWeight::Zero() ) 161 | arc.weight = 999; 162 | if( arc.weight != arc.weight ) 163 | arc.weight = 999; 164 | aiter.SetValue(arc); 165 | } 166 | } 167 | 168 | return; 169 | } 170 | 171 | -------------------------------------------------------------------------------- /src/lib/feature-reader.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "LegacyRnnLMHash.h" 3 | #include "RnnLMDecoder.h" 4 | #include "LegacyRnnLMDecodable.h" 5 | #include "LegacyRnnLMReader.h" 6 | using namespace fst; 7 | 8 | //typedef std::unordered_map > FMAP; 9 | typedef std::unordered_map > FMAP; 10 | 11 | template 12 | void LoadFeatureConf (const H&h, FMAP* fmap, std::string& featurefilename) { 13 | std::ifstream ifp (featurefilename.c_str ()); 14 | std::string prefix = "#"; 15 | std::string line; 16 | 17 | if (ifp.is_open ()) { 18 | while (ifp.good ()) { 19 | getline (ifp, line); 20 | if 
(line.empty ()) 21 | continue; 22 | 23 | std::vector ids; 24 | int id; 25 | std::string word; 26 | if (!line.compare (0, prefix.size (), prefix)) 27 | continue; 28 | 29 | std::stringstream ss (line); 30 | ss >> word; 31 | while (ss >> id) 32 | ids.push_back (id); 33 | cout << "Item: " << word << " " << h.GetWordId (word) << endl; 34 | (*fmap) [h.GetWordId (word)] = ids; 35 | } 36 | ifp.close (); 37 | } 38 | } 39 | 40 | typedef LegacyRnnLMDecodable Decodable; 41 | DEFINE_string (rnnlm, "", "The input RnnLM model."); 42 | DEFINE_string (feats, "", "Auxiliary features conf file."); 43 | 44 | int main (int argc, char* argv []) { 45 | string usage = "feature-reader --rnnlm=test.rnnlm --feats=features.conf\n\n Usage: "; 46 | set_new_handler (FailedNewHandler); 47 | SetFlags (usage.c_str (), &argc, &argv, false); 48 | 49 | LegacyRnnLMReader reader (FLAGS_rnnlm); 50 | LegacyRnnLMHash h = reader.CopyVocabHash (); 51 | 52 | FMAP fmap; 53 | 54 | LoadFeatureConf (h, &fmap, FLAGS_feats); 55 | 56 | for (FMAP::iterator it = fmap.begin (); it != fmap.end (); ++it) { 57 | std::cout << it->first << "\t"; 58 | const std::vector& feats = (*it).second; 59 | for (int i = 0; i < feats.size (); i++) 60 | cout << feats [i] << ((i == feats.size ()) ? "" : " "); 61 | cout << endl; 62 | } 63 | 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /src/lib/util.cc: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) [2012-], Josef Robert Novak 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted #provided that the following conditions 7 | are met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 
11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of #conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19 | FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20 | COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 21 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 25 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 27 | OF THE POSSIBILITY OF SUCH DAMAGE. 28 | * 29 | */ 30 | using namespace std; 31 | #include 32 | using namespace fst; 33 | 34 | 35 | string vec2str( vector vec, string sep ){ 36 | string ss; 37 | for(size_t i = 0; i < vec.size(); ++i){ 38 | if(i != 0) 39 | ss += sep; 40 | ss += vec[i]; 41 | } 42 | return ss; 43 | } 44 | 45 | string itoas( int i ){ 46 | std::stringstream ostring; 47 | ostring << i; 48 | return ostring.str(); 49 | } 50 | 51 | vector tokenize_utf8_string (string* utf8_string, string* delimiter) { 52 | /* 53 | Support for tokenizing a utf-8 string. Adapted to also 54 | support a delimiter. Note that leading, trailing or multiple 55 | consecutive delimiters will result in empty vector elements. 56 | Normally should not be a problem but just in case. Also note 57 | that any tokens that cannot be found in the model symbol table will be 58 | deleted from the input word prior to grapheme-to-phoneme conversion. 
59 | 60 | http://stackoverflow.com/questions/2852895/c-iterate-or-split-\ 61 | utf-8-string-into-array-of-symbols#2856241 62 | */ 63 | char* str = (char*) utf8_string->c_str (); // utf-8 string 64 | char* str_i = str; // string iterator 65 | char* str_j = str; 66 | char* end = str + strlen (str) + 1; // end iterator 67 | vector string_vec; 68 | if (delimiter->compare ("") != 0) 69 | string_vec.push_back (""); 70 | 71 | do { 72 | str_j = str_i; 73 | utf8::uint32_t code = utf8::next (str_i, end); // get 32 bit code 74 | if (code == 0) 75 | continue; 76 | int start = strlen (str) - strlen (str_j); 77 | int end = strlen (str) - strlen (str_i); 78 | int len = end - start; 79 | 80 | if (delimiter->compare ("") == 0) { 81 | string_vec.push_back (utf8_string->substr (start,len)); 82 | } else { 83 | if (delimiter->compare (utf8_string->substr (start, len)) == 0) 84 | string_vec.push_back (""); 85 | else 86 | string_vec [string_vec.size () - 1] += utf8_string->substr (start, len); 87 | } 88 | } while (str_i < end); 89 | 90 | return string_vec; 91 | } 92 | 93 | 94 | vector tokenize_entry (string* testword, string* sep, 95 | SymbolTable* syms) { 96 | vector tokens = tokenize_utf8_string (testword, sep); 97 | vector entry; 98 | for (unsigned int i=0; iFind (tokens.at (i)) != -1) { 100 | entry.push_back (tokens.at (i)); 101 | }else{ 102 | cerr << "Symbol: '" << tokens.at (i) 103 | << "' not found in input symbols table." << endl 104 | << "Mapping to null..." << endl; 105 | } 106 | } 107 | 108 | return entry; 109 | } 110 | 111 | vector tokenize2ints (string* testword, string* sep, 112 | const SymbolTable* syms) { 113 | vector tokens = tokenize_utf8_string (testword, sep); 114 | vector entry; 115 | for (unsigned int i=0; iFind (tokens[i]); 117 | if (label == -1) 118 | cerr << "Symbol: '" << tokens[i] 119 | << "' not found in input symbols table." << endl 120 | << "Mapping to null..." 
<< endl; 121 | else 122 | entry.push_back (label); 123 | } 124 | 125 | return entry; 126 | } 127 | 128 | #ifdef __MACH__ 129 | timespec get_time( ){ 130 | clock_serv_t cclock; 131 | mach_timespec_t mts; 132 | host_get_clock_service(mach_host_self(), REALTIME_CLOCK, &cclock); 133 | clock_get_time(cclock, &mts); 134 | 135 | timespec ts = {mts.tv_sec, mts.tv_nsec}; 136 | return ts; 137 | } 138 | #else 139 | timespec get_time( ){ 140 | timespec ts; 141 | clock_gettime(CLOCK_REALTIME, &ts); 142 | return ts; 143 | } 144 | #endif 145 | 146 | timespec diff(timespec start, timespec end){ 147 | timespec temp; 148 | if ((end.tv_nsec-start.tv_nsec)<0) { 149 | temp.tv_sec = end.tv_sec-start.tv_sec-1; 150 | temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec; 151 | } else { 152 | temp.tv_sec = end.tv_sec-start.tv_sec; 153 | temp.tv_nsec = end.tv_nsec-start.tv_nsec; 154 | } 155 | return temp; 156 | } 157 | 158 | DEFINE_bool (help, false, "show usage information"); 159 | void PhonetisaurusSetFlags (const char* usage, int* argc, char*** argv, 160 | bool remove_flags) { 161 | //Workaround for Apple's. It just skips all the options processing. 
162 | #if defined(__APPLE__) && defined(__MACH__) 163 | SetFlags (usage, argc, argv, remove_flags); 164 | #else 165 | int index = 1; 166 | for (; index < *argc; ++index) { 167 | string argval = (*argv)[index]; 168 | 169 | if (argval[0] != '-' || argval == "-") 170 | break; 171 | while (argval[0] == '-') 172 | argval = argval.substr(1); // remove initial '-'s 173 | 174 | string arg = argval; 175 | string val = ""; 176 | 177 | // split argval (arg=val) into arg and val 178 | size_t pos = argval.find("="); 179 | if (pos != string::npos) { 180 | arg = argval.substr(0, pos); 181 | val = argval.substr(pos + 1); 182 | } 183 | 184 | 185 | FlagRegister *bool_register = 186 | FlagRegister::GetRegister(); 187 | if (bool_register->SetFlag(arg, val)) 188 | continue; 189 | FlagRegister *string_register = 190 | FlagRegister::GetRegister(); 191 | if (string_register->SetFlag(arg, val)) 192 | continue; 193 | FlagRegister *int32_register = 194 | FlagRegister::GetRegister(); 195 | if (int32_register->SetFlag(arg, val)) 196 | continue; 197 | FlagRegister *int64_register = 198 | FlagRegister::GetRegister(); 199 | if (int64_register->SetFlag(arg, val)) 200 | continue; 201 | FlagRegister *double_register = 202 | FlagRegister::GetRegister(); 203 | if (double_register->SetFlag(arg, val)) 204 | continue; 205 | 206 | LOG(FATAL) << "SetFlags: Bad option: " << (*argv)[index]; 207 | } 208 | 209 | if (FLAGS_help) { 210 | //Just show program flags - NOT general OpenFst flags 211 | // There are too many and they are just confusing. 
212 | std::set< pair > usage_set; 213 | 214 | cout << usage << "\n"; 215 | 216 | FlagRegister *bool_register = FlagRegister::GetRegister(); 217 | bool_register->GetUsage(&usage_set); 218 | FlagRegister *string_register = FlagRegister::GetRegister(); 219 | string_register->GetUsage(&usage_set); 220 | FlagRegister *int32_register = FlagRegister::GetRegister(); 221 | int32_register->GetUsage(&usage_set); 222 | FlagRegister *int64_register = FlagRegister::GetRegister(); 223 | int64_register->GetUsage(&usage_set); 224 | FlagRegister *double_register = FlagRegister::GetRegister(); 225 | double_register->GetUsage(&usage_set); 226 | 227 | for (std::set< pair >::const_iterator it = 228 | usage_set.begin(); 229 | it != usage_set.end(); 230 | ++it) { 231 | const string &file = it->first; 232 | const string &usage = it->second; 233 | 234 | //if (file.compare ("flags.cc") == 0 || file.compare ("fst.cc") == 0 235 | if (file.compare ("fst.cc") == 0 \ 236 | || file.compare ("symbol-table.cc") == 0 || \ 237 | file.compare ("util.cc") == 0) 238 | continue; 239 | 240 | //Else print out the args - they are from the actual program 241 | cout << usage << endl; 242 | } 243 | //Fake this 244 | cout << " --help: type = bool, default = false" << endl; 245 | cout << " show usage information" << endl; 246 | exit (0); 247 | } 248 | #endif 249 | } 250 | 251 | void LoadWordList (const std::string& filename, 252 | std::vector* corpus) { 253 | std::ifstream ifp (filename.c_str ()); 254 | std::string line; 255 | 256 | if (ifp.is_open ()) { 257 | while (ifp.good ()) { 258 | getline (ifp, line); 259 | if (line.empty ()) 260 | continue; 261 | 262 | corpus->push_back (line); 263 | } 264 | ifp.close (); 265 | } 266 | } 267 | 268 | 269 | void Split (const std::string& s, char delim, std::vector& elems) { 270 | std::stringstream ss (s); 271 | std::string item; 272 | while (getline (ss, item, delim)) 273 | elems.push_back (item); 274 | } 275 | 
#!/usr/bin/env python
"""Regression check for n-best G2P accuracy (test/check-nbest-wer.py).

Builds a standard and a "grow" model from ``<prefix>/<prefix>.train`` by
shelling out to ``phonetisaurus-align``, ``estimate-ngram``,
``phonetisaurus-arpa2wfst`` and ``phonetisaurus-g2pfst``, decodes
``<prefix>/<prefix>.words`` 5-best with each model, and prints word accuracy
and word error rate against ``<prefix>/<prefix>.ref``.  A word counts as
correct when at least one of its hypotheses equals one of its reference
pronunciations.

The code avoids Python-2-only syntax so that it runs under both Python 2.7
and Python 3.
"""
import argparse
import io
import os
import sys
from collections import defaultdict

# Shell filter applied to the decoder output: replaces every "|" on each
# line with a space.  Kept as a raw string so the backslash reaches perl
# unchanged.
_PIPE_TO_SPACE = r"perl -e'while(<>){s/\|/ /g; print $_;}'"


def _RunCommand(command):
    """Run *command* through the shell and warn if it reports failure.

    The pipeline stays best-effort (a failing step does not abort the run),
    but a non-zero status is no longer discarded silently.

    Returns:
        The raw value returned by ``os.system``.
    """
    status = os.system(command)
    if status != 0:
        sys.stderr.write(
            "WARNING: command returned status {0}: {1}\n".format(
                status, command))
    return status


def _ReadFields(path):
    """Yield the tab-separated fields of every non-blank line of *path*.

    The file is decoded as UTF-8.  Each field is stripped of surrounding
    whitespace individually, so an empty trailing column survives as ``""``
    instead of vanishing together with the tab that introduced it.
    """
    with io.open(path, "r", encoding="utf8") as ifp:
        for line in ifp:
            fields = [field.strip() for field in line.split("\t")]
            if any(fields):
                yield fields


def RunRegressionPrep(prefix="g014b2b"):
    """Align, train, convert and decode the standard and "grow" models.

    Args:
        prefix: Name of the data directory and of the files inside it
            (``<prefix>/<prefix>.train``, ``.words``, ...).  It is pasted
            into shell command lines unquoted, so it must not contain shell
            metacharacters.

    Every step is attempted even if an earlier one failed; failures are
    reported on stderr.
    """
    base = "{0}/{0}".format(prefix)

    def decode(model, hyp_file):
        # Plain concatenation: the perl filter contains literal braces,
        # which str.format would try to interpret.
        return ("phonetisaurus-g2pfst --model=" + model
                + " --wordlist=" + base + ".words --nbest=5 | "
                + _PIPE_TO_SPACE + " > " + hyp_file)

    steps = [
        ("Standard alignment",
         "phonetisaurus-align --input={0}.train --ofile={0}.corpus"
         " --seq1_del=false --grow=false".format(base)),
        ("Alignment with support for growing",
         "phonetisaurus-align --input={0}.train --ofile={0}.grow.corpus"
         " --seq1_del=false --grow=true".format(base)),
        ("\nTraining standard ARPA",
         "estimate-ngram -o 8 -t {0}.corpus"
         " -wl {0}.o8.arpa".format(base)),
        ("Training grow-supported ARPA",
         "estimate-ngram -o 8 -t {0}.grow.corpus"
         " -wl {0}.grow.o8.arpa".format(base)),
        ("\nConverting stanard model to Fst",
         "phonetisaurus-arpa2wfst --lm={0}.o8.arpa"
         " --ofile={0}.o8.fst".format(base)),
        ("Converting grow-reported stanard model to Fst",
         "phonetisaurus-arpa2wfst --lm={0}.grow.o8.arpa"
         " --ofile={0}.grow.o8.fst".format(base)),
        ("\nTesting 5-best standard",
         decode(base + ".o8.fst", base + "-n5.hyp")),
        ("Testing 5-best grow-supported standard",
         decode(base + ".grow.o8.fst", base + "-grow-n5.hyp")),
    ]

    for message, command in steps:
        print(message)
        _RunCommand(command)


def LoadRefs(refs_file):
    """Load reference pronunciations.

    Args:
        refs_file: Path of a UTF-8 file with one ``word<TAB>pron[<TAB>pron...]``
            entry per line.

    Returns:
        dict mapping each word to the list of its non-empty pronunciations.
        Blank lines and lines with an empty word column are ignored, so they
        no longer add a phantom entry that is scored as an error.
    """
    refs = {}
    for fields in _ReadFields(refs_file):
        word = fields[0]
        if not word:
            continue
        refs[word] = [pron for pron in fields[1:] if pron]
    return refs


def LoadNbestHyps(hyps_file):
    """Load n-best hypotheses.

    Args:
        hyps_file: Path of a UTF-8 file with tab-separated lines whose first
            column is the word and whose last column is the pronunciation.

    Returns:
        ``defaultdict(list)`` mapping each word to its pronunciations in
        file order.  Lines without a pronunciation column, or with an empty
        one, are skipped.
    """
    hyps = defaultdict(list)
    for fields in _ReadFields(hyps_file):
        word, pron = fields[0], fields[-1]
        if len(fields) < 2 or not word or not pron:
            continue
        hyps[word].append(pron)
    return hyps


def ComputeEval(hyps, refs_file="g014b2b/g014b2b.ref"):
    """Print word accuracy and word error rate for one hypothesis file.

    Args:
        hyps: Path of the n-best hypothesis file.
        refs_file: Path of the reference file; defaults to the path that
            used to be hard-coded.

    Prints one ``Corr/Err/WACC/WER`` summary line.  With no reference words
    both rates are reported as 0.00% instead of dividing by zero.
    """
    refs = LoadRefs(refs_file)
    nbest = LoadNbestHyps(hyps)

    # Counts are kept as floats so the summary line keeps its "N.0" form.
    total = float(len(refs))
    # .get() avoids inserting empty lists into the defaultdict for words
    # that have no hypothesis.
    corr = float(sum(
        1 for word, prons in refs.items()
        if set(prons).intersection(nbest.get(word, ()))
    ))

    if total:
        wacc = corr / total * 100
        wer = (1.0 - (corr / total)) * 100
    else:
        wacc = wer = 0.0

    print("Corr: {0}, Err: {1}, WACC: {2:0.2f}%, WER: {3:0.2f}%".format(
        corr, total - corr, wacc, wer))


def main():
    """Parse the command line, build the models and score both runs."""
    example = "{0} --prefix g014b2b".format(sys.argv[0])
    parser = argparse.ArgumentParser(description=example)
    parser.add_argument("--prefix", "-p", help="Prefix.",
                        default="g014b2b")
    args = parser.parse_args()

    refs_file = "{0}/{0}.ref".format(args.prefix)
    RunRegressionPrep(args.prefix)
    ComputeEval("{0}/{0}-n5.hyp".format(args.prefix), refs_file)
    ComputeEval("{0}/{0}-grow-n5.hyp".format(args.prefix), refs_file)


if __name__ == "__main__":
    main()