├── .autoconf
    ├── ar-lib
    ├── compile
    ├── config.guess
    ├── config.sub
    ├── depcomp
    ├── install-sh
    ├── ltmain.sh
    ├── m4
    │   ├── libtool.m4
    │   ├── ltoptions.m4
    │   ├── ltsugar.m4
    │   ├── ltversion.m4
    │   └── lt~obsolete.m4
    └── missing
├── .dockerignore
├── .gitattributes
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── Makefile.am
├── Makefile.in
├── README.md
├── aclocal.m4
├── configure
├── configure.ac
├── docs
    ├── doxygen.cfg
    └── mainpage.dox
├── python
    ├── phonetisaurus-module.py
    ├── phonetisaurus
    │   └── __init__.py
    ├── script
    │   ├── demo.html
    │   ├── g2pserver.py
    │   ├── phoneticize.py
    │   └── words.list
    └── setup.py
├── src
    ├── 3rdparty
    │   ├── rnnlm
    │   │   ├── COPYRIGHT.txt
    │   │   ├── rnnlmlib.cpp
    │   │   └── rnnlmlib.h
    │   └── utfcpp
    │   │   ├── utf8.h
    │   │   └── utf8
    │   │       ├── checked.h
    │   │       ├── core.h
    │   │       └── unchecked.h
    ├── bin
    │   ├── phonetisaurus-align.cc
    │   ├── phonetisaurus-arpa2wfst.cc
    │   ├── phonetisaurus-g2pfst.cc
    │   ├── phonetisaurus-g2prnn.cc
    │   └── rnnlm.cc
    ├── include
    │   ├── ARPA2WFST.h
    │   ├── LatticePruner.h
    │   ├── LegacyRnnLMDecodable.h
    │   ├── LegacyRnnLMHash.h
    │   ├── LegacyRnnLMReader.h
    │   ├── M2MFstAligner.h
    │   ├── PhonetisaurusRex.h
    │   ├── PhonetisaurusScript.h
    │   ├── RnnLMDecoder.h
    │   ├── RnnLMPy.h
    │   └── util.h
    ├── lib
    │   ├── LatticePruner.cc
    │   ├── M2MFstAligner.cc
    │   ├── feature-reader.cc
    │   └── util.cc
    └── scripts
    │   ├── phonetisaurus-apply
    │   └── phonetisaurus-train
└── test
    ├── check-nbest-wer.py
    └── g014b2b
        ├── g014b2b.ref
        ├── g014b2b.train
        └── g014b2b.words


/.autoconf/ar-lib:
--------------------------------------------------------------------------------
  1 | #! /bin/sh
  2 | # Wrapper for Microsoft lib.exe
  3 | 
  4 | me=ar-lib
  5 | scriptversion=2012-03-01.08; # UTC
  6 | 
  7 | # Copyright (C) 2010-2017 Free Software Foundation, Inc.
  8 | # Written by Peter Rosin <peda@lysator.liu.se>.
  9 | #
 10 | # This program is free software; you can redistribute it and/or modify
 11 | # it under the terms of the GNU General Public License as published by
 12 | # the Free Software Foundation; either version 2, or (at your option)
 13 | # any later version.
 14 | #
 15 | # This program is distributed in the hope that it will be useful,
 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 18 | # GNU General Public License for more details.
 19 | #
 20 | # You should have received a copy of the GNU General Public License
 21 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 22 | 
 23 | # As a special exception to the GNU General Public License, if you
 24 | # distribute this file as part of a program that contains a
 25 | # configuration script generated by Autoconf, you may include it under
 26 | # the same distribution terms that you use for the rest of that program.
 27 | 
 28 | # This file is maintained in Automake, please report
 29 | # bugs to <bug-automake@gnu.org> or send patches to
 30 | # <automake-patches@gnu.org>.
 31 | 
 32 | 
 33 | # func_error message -- print "$me: MESSAGE" on stderr and terminate the script with status 1
 34 | func_error ()
 35 | {
 36 |   echo "$me: $1" 1>&2  # diagnostics go to stderr, prefixed with the script name
 37 |   exit 1               # never returns to the caller
 38 | }
 39 | 
 40 | file_conv=
 41 | 
 42 | # func_file_conv build_file
 43 | # Convert a $build file to $host form and store it in $file
 44 | # Currently only supports Windows hosts.  Names that are not absolute are stored in $file unchanged.
 45 | func_file_conv ()
 46 | {
 47 |   file=$1  # result variable; stays equal to the input unless a conversion below applies
 48 |   case $file in
 49 |     / | /[!/]*) # absolute file, and not a UNC file
 50 |       if test -z "$file_conv"; then
 51 | 	# lazily determine how to convert abs files; the answer is cached in $file_conv for later calls
 52 | 	case `uname -s` in
 53 | 	  MINGW*)
 54 | 	    file_conv=mingw
 55 | 	    ;;
 56 | 	  CYGWIN*)
 57 | 	    file_conv=cygwin
 58 | 	    ;;
 59 | 	  *)  # anything that is neither MinGW nor Cygwin is treated as Wine
 60 | 	    file_conv=wine
 61 | 	    ;;
 62 | 	esac
 63 |       fi
 64 |       case $file_conv in
 65 | 	mingw)
 66 | 	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`  # sed strips the quotes and padding blank from the echoed text
 67 | 	  ;;
 68 | 	cygwin)
 69 | 	  file=`cygpath -m "$file" || echo "$file"`  # keep the original name if cygpath fails
 70 | 	  ;;
 71 | 	wine)
 72 | 	  file=`winepath -w "$file" || echo "$file"`  # keep the original name if winepath fails
 73 | 	  ;;
 74 |       esac
 75 |       ;;
 76 |   esac
 77 | }
 78 | 
 79 | # func_at_file at_file operation archive
 80 | # Iterate over all members in AT_FILE performing OPERATION on ARCHIVE
 81 | # for each of them.  Exits the script with the failing status if $AR fails.
 82 | # When interpreting the content of the @FILE, do NOT use func_file_conv,
 83 | # since the user would need to supply preconverted file names to
 84 | # binutils ar, at least for MinGW.
 85 | func_at_file ()
 86 | {
 87 |   operation=$2
 88 |   archive=$3
 89 |   at_file_contents=`cat "$1"`
 90 |   eval set x "$at_file_contents"  # NOTE: the file text is re-parsed by the shell, so quoting (and any other shell syntax) in it is interpreted
 91 |   shift                           # drop the placeholder "x"; the members are now $1, $2, ...
 92 | 
 93 |   for member
 94 |   do
 95 |     $AR -NOLOGO $operation:"$member" "$archive" || exit $?  # one $AR invocation per member
 96 |   done
 97 | }
 98 | 
 99 | case $1 in  # handle the cases that need no PROGRAM: empty invocation, --help, --version
100 |   '')  # no first argument (or an empty one)
101 |      func_error "no command.  Try '$0 --help' for more information."
102 |      ;;
103 |   -h | --h*)
104 |     cat <<EOF
105 | Usage: $me [--help] [--version] PROGRAM ACTION ARCHIVE [MEMBER...]
106 | 
107 | Members may be specified in a file named with @FILE.
108 | EOF
109 |     exit $?  # status of the cat above
110 |     ;;
111 |   -v | --v*)
112 |     echo "$me, version $scriptversion"
113 |     exit $?  # status of the echo above
114 |     ;;
115 | esac
116 | 
117 | if test $# -lt 3; then  # PROGRAM, ACTION and ARCHIVE are all mandatory
118 |   func_error "you must specify a program, an action and an archive"
119 | fi
120 | 
121 | AR=$1  # PROGRAM from the usage line; later expanded unquoted so appended options split into words
122 | shift
123 | while :
124 | do
125 |   if test $# -lt 2; then  # an ACTION and an ARCHIVE must still remain
126 |     func_error "you must specify a program, an action and an archive"
127 |   fi
128 |   case $1 in
129 |     -lib | -LIB \
130 |     | -ltcg | -LTCG \
131 |     | -machine* | -MACHINE* \
132 |     | -subsystem* | -SUBSYSTEM* \
133 |     | -verbose | -VERBOSE \
134 |     | -wx* | -WX* )
135 |       AR="$AR $1"  # recognised pass-through option: make it part of the command
136 |       shift
137 |       ;;
138 |     *)
139 |       action=$1  # first word that is not a pass-through option is the ACTION
140 |       shift
141 |       break
142 |       ;;
143 |   esac
144 | done
145 | orig_archive=$1  # archive name exactly as given; used for the existence checks
146 | shift            # whatever remains in "$@" is the member list
147 | func_file_conv "$orig_archive"
148 | archive=$file    # archive name after func_file_conv; this is what $AR receives
149 | 
150 | # strip leading dash in $action
151 | action=${action#-}
152 | 
153 | delete=
154 | extract=
155 | list=
156 | quick=
157 | replace=
158 | index=
159 | create=
160 | 
161 | while test -n "$action"  # consume $action one letter at a time, setting the flags above
162 | do
163 |   case $action in
164 |     d*) delete=yes  ;;
165 |     x*) extract=yes ;;
166 |     t*) list=yes    ;;
167 |     q*) quick=yes   ;;
168 |     r*) replace=yes ;;
169 |     s*) index=yes   ;;
170 |     S*)             ;; # the index is always updated implicitly
171 |     c*) create=yes  ;;
172 |     u*)             ;; # TODO: don't ignore the update modifier
173 |     v*)             ;; # TODO: don't ignore the verbose modifier
174 |     *)
175 |       func_error "unknown action specified"
176 |       ;;
177 |   esac
178 |   action=${action#?}  # drop the letter just handled
179 | done
180 | 
181 | case $delete$extract$list$quick$replace,$index in  # each flag is "" or "yes", so the text before the comma counts the actions
182 |   yes,* | ,yes)  # exactly one action, or no action but 's' was given
183 |     ;;
184 |   yesyes*)
185 |     func_error "more than one action specified"
186 |     ;;
187 |   *)
188 |     func_error "no action specified"
189 |     ;;
190 | esac
191 | 
192 | if test -n "$delete"; then  # d: remove every listed member (or every member named in an @FILE)
193 |   if test ! -f "$orig_archive"; then
194 |     func_error "archive not found"
195 |   fi
196 |   for member
197 |   do
198 |     case $member in  # must be the loop variable: nothing shifts here, so $1 would stay on the first member
199 |       @*)
200 |         func_at_file "${member#@}" -REMOVE "$archive"
201 |         ;;
202 |       *)
203 |         func_file_conv "$member"
204 |         $AR -NOLOGO -REMOVE:"$file" "$archive" || exit $?
205 |         ;;
206 |     esac
207 |   done
208 | 
209 | elif test -n "$extract"; then  # x: extract the listed members, or every member when none is listed
210 |   if test ! -f "$orig_archive"; then
211 |     func_error "archive not found"
212 |   fi
213 |   if test $# -gt 0; then
214 |     for member
215 |     do
216 |       case $member in  # loop variable, for the same reason as in the delete branch
217 |         @*)
218 |           func_at_file "${member#@}" -EXTRACT "$archive"
219 |           ;;
220 |         *)
221 |           func_file_conv "$member"
222 |           $AR -NOLOGO -EXTRACT:"$file" "$archive" || exit $?
223 |           ;;
224 |       esac
225 |     done
226 |   else
227 |     $AR -NOLOGO -LIST "$archive" | sed -e 's/\\/\\\\/g' | while read member  # backslashes are doubled so that read does not consume them
228 |     do
229 |       $AR -NOLOGO -EXTRACT:"$member" "$archive" || exit $?
230 |     done
231 |   fi
232 | 
233 | elif test -n "$quick$replace"; then  # q / r: (re)build the archive from the old archive plus the members
234 |   if test ! -f "$orig_archive"; then
235 |     if test -z "$create"; then  # without the 'c' modifier, announce that a new archive is created
236 |       echo "$me: creating $orig_archive"
237 |     fi
238 |     orig_archive=  # empty: there is no existing archive to feed back in
239 |   else
240 |     orig_archive=$archive
241 |   fi
242 | 
243 |   for member  # rotate "$@": append each converted name, then drop the original from the front
244 |   do
245 |     case $1 in
246 |     @*)
247 |       func_file_conv "${1#@}"
248 |       set x "$@" "@$file"
249 |       ;;
250 |     *)
251 |       func_file_conv "$1"
252 |       set x "$@" "$file"
253 |       ;;
254 |     esac
255 |     shift  # drops the placeholder "x"
256 |     shift  # drops the unconverted name that was just handled
257 |   done
258 | 
259 |   if test -n "$orig_archive"; then
260 |     $AR -NOLOGO -OUT:"$archive" "$orig_archive" "$@" || exit $?
261 |   else
262 |     $AR -NOLOGO -OUT:"$archive" "$@" || exit $?
263 |   fi
264 | 
265 | elif test -n "$list"; then  # t: list the archive's members
266 |   if test ! -f "$orig_archive"; then
267 |     func_error "archive not found"
268 |   fi
269 |   $AR -NOLOGO -LIST "$archive" || exit $?
270 | fi
271 | 


--------------------------------------------------------------------------------
/.autoconf/compile:
--------------------------------------------------------------------------------
  1 | #! /bin/sh
  2 | # Wrapper for compilers which do not understand '-c -o'.
  3 | 
  4 | scriptversion=2012-10-14.11; # UTC
  5 | 
  6 | # Copyright (C) 1999-2014 Free Software Foundation, Inc.
  7 | # Written by Tom Tromey <tromey@cygnus.com>.
  8 | #
  9 | # This program is free software; you can redistribute it and/or modify
 10 | # it under the terms of the GNU General Public License as published by
 11 | # the Free Software Foundation; either version 2, or (at your option)
 12 | # any later version.
 13 | #
 14 | # This program is distributed in the hope that it will be useful,
 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17 | # GNU General Public License for more details.
 18 | #
 19 | # You should have received a copy of the GNU General Public License
 20 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 21 | 
 22 | # As a special exception to the GNU General Public License, if you
 23 | # distribute this file as part of a program that contains a
 24 | # configuration script generated by Autoconf, you may include it under
 25 | # the same distribution terms that you use for the rest of that program.
 26 | 
 27 | # This file is maintained in Automake, please report
 28 | # bugs to <bug-automake@gnu.org> or send patches to
 29 | # <automake-patches@gnu.org>.
 30 | 
 31 | nl='
 32 | '
 33 | 
 34 | # We need space, tab and new line, in precisely that order.  Quoting is
 35 | # there to prevent tools from complaining about whitespace usage.
 36 | IFS=" ""	$nl"
 37 | 
 38 | file_conv=
 39 | 
 40 | # func_file_conv build_file lazy
 41 | # Convert a $build file to $host form and store it in $file
 42 | # Currently only supports Windows hosts. If the determined conversion
 43 | # type is listed in (the comma separated) LAZY, no conversion will
 44 | # take place.  Names that are not absolute are stored in $file unchanged.
 45 | func_file_conv ()
 46 | {
 47 |   file=$1  # result variable; stays equal to the input unless a conversion below applies
 48 |   case $file in
 49 |     / | /[!/]*) # absolute file, and not a UNC file
 50 |       if test -z "$file_conv"; then
 51 | 	# lazily determine how to convert abs files; the answer is cached in $file_conv for later calls
 52 | 	case `uname -s` in
 53 | 	  MINGW*)
 54 | 	    file_conv=mingw
 55 | 	    ;;
 56 | 	  CYGWIN*)
 57 | 	    file_conv=cygwin
 58 | 	    ;;
 59 | 	  *)  # anything that is neither MinGW nor Cygwin is treated as Wine
 60 | 	    file_conv=wine
 61 | 	    ;;
 62 | 	esac
 63 |       fi
 64 |       case $file_conv/,$2, in  # "TYPE/,LAZY," -- the surrounding commas allow a whole-word match of TYPE inside LAZY
 65 | 	*,$file_conv,*)  # conversion type is listed in LAZY: leave $file as it is
 66 | 	  ;;
 67 | 	mingw/*)
 68 | 	  file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'`  # sed strips the quotes and padding blank from the echoed text
 69 | 	  ;;
 70 | 	cygwin/*)
 71 | 	  file=`cygpath -m "$file" || echo "$file"`  # keep the original name if cygpath fails
 72 | 	  ;;
 73 | 	wine/*)
 74 | 	  file=`winepath -w "$file" || echo "$file"`  # keep the original name if winepath fails
 75 | 	  ;;
 76 |       esac
 77 |       ;;
 78 |   esac
 79 | }
 80 | 
 81 | # func_cl_dashL linkdir
 82 | # Make cl look for libraries in LINKDIR: record it in $lib_path and add a -LIBPATH: linker option
 83 | func_cl_dashL ()
 84 | {
 85 |   func_file_conv "$1"  # converted directory name is left in $file
 86 |   if test -z "$lib_path"; then
 87 |     lib_path=$file
 88 |   else
 89 |     lib_path="$lib_path;$file"  # ';'-separated list, split again with IFS=';' in func_cl_dashl
 90 |   fi
 91 |   linker_opts="$linker_opts -LIBPATH:$file"
 92 | }
 93 | 
 94 | # func_cl_dashl library
 95 | # Do a library search-path lookup for cl; the resulting file name is stored in $lib
 96 | func_cl_dashl ()
 97 | {
 98 |   lib=$1
 99 |   found=no
100 |   save_IFS=$IFS
101 |   IFS=';'  # $lib_path and $LIB are split on ';' by the unquoted expansion below
102 |   for dir in $lib_path $LIB
103 |   do
104 |     IFS=$save_IFS  # normal word splitting inside the loop body
105 |     if $shared && test -f "$dir/$lib.dll.lib"; then  # NAME.dll.lib is only considered while $shared is true (i.e. no -static seen)
106 |       found=yes
107 |       lib=$dir/$lib.dll.lib
108 |       break
109 |     fi
110 |     if test -f "$dir/$lib.lib"; then
111 |       found=yes
112 |       lib=$dir/$lib.lib
113 |       break
114 |     fi
115 |     if test -f "$dir/lib$lib.a"; then
116 |       found=yes
117 |       lib=$dir/lib$lib.a
118 |       break
119 |     fi
120 |   done
121 |   IFS=$save_IFS  # also needed when the loop ran zero times
122 | 
123 |   if test "$found" != yes; then
124 |     lib=$lib.lib  # not found in any directory: fall back to plain NAME.lib
125 |   fi
126 | }
127 | 
128 | # func_cl_wrapper cl arg...
129 | # Adjust compile command to suit cl, then exec it; does not return on success
130 | func_cl_wrapper ()
131 | {
132 |   # Assume a capable shell
133 |   lib_path=
134 |   shared=:  # run as a command by func_cl_dashl: ':' succeeds, 'false' (set by -static) fails
135 |   linker_opts=
136 |   for arg  # rotate "$@": each pass looks at $1, appends its translation, and the final shift drops the original
137 |   do
138 |     if test -n "$eat"; then
139 |       eat=  # this word was the operand of the previous option and is already handled
140 |     else
141 |       case $1 in
142 | 	-o)
143 | 	  # configure might choose to run compile as 'compile cc -o foo foo.c'.
144 | 	  eat=1
145 | 	  case $2 in
146 | 	    *.o | *.[oO][bB][jJ])
147 | 	      func_file_conv "$2"
148 | 	      set x "$@" -Fo"$file"  # object output is passed as -Fo
149 | 	      shift
150 | 	      ;;
151 | 	    *)
152 | 	      func_file_conv "$2"
153 | 	      set x "$@" -Fe"$file"  # any other output name is passed as -Fe
154 | 	      shift
155 | 	      ;;
156 | 	  esac
157 | 	  ;;
158 | 	-I)
159 | 	  eat=1
160 | 	  func_file_conv "$2" mingw
161 | 	  set x "$@" -I"$file"
162 | 	  shift
163 | 	  ;;
164 | 	-I*)
165 | 	  func_file_conv "${1#-I}" mingw
166 | 	  set x "$@" -I"$file"
167 | 	  shift
168 | 	  ;;
169 | 	-l)
170 | 	  eat=1
171 | 	  func_cl_dashl "$2"
172 | 	  set x "$@" "$lib"  # -l NAME is replaced by the library file name found by func_cl_dashl
173 | 	  shift
174 | 	  ;;
175 | 	-l*)
176 | 	  func_cl_dashl "${1#-l}"
177 | 	  set x "$@" "$lib"
178 | 	  shift
179 | 	  ;;
180 | 	-L)
181 | 	  eat=1
182 | 	  func_cl_dashL "$2"  # nothing is appended to "$@": the directory goes into $linker_opts instead
183 | 	  ;;
184 | 	-L*)
185 | 	  func_cl_dashL "${1#-L}"
186 | 	  ;;
187 | 	-static)
188 | 	  shared=false  # dropped from the command line; only affects later -l lookups
189 | 	  ;;
190 | 	-Wl,*)
191 | 	  arg=${1#-Wl,}
192 | 	  save_ifs="$IFS"; IFS=','
193 | 	  for flag in $arg; do  # each comma-separated piece becomes its own linker option
194 | 	    IFS="$save_ifs"
195 | 	    linker_opts="$linker_opts $flag"
196 | 	  done
197 | 	  IFS="$save_ifs"
198 | 	  ;;
199 | 	-Xlinker)
200 | 	  eat=1
201 | 	  linker_opts="$linker_opts $2"
202 | 	  ;;
203 | 	-*)
204 | 	  set x "$@" "$1"  # any other option is passed through unchanged
205 | 	  shift
206 | 	  ;;
207 | 	*.cc | *.CC | *.cxx | *.CXX | *.[cC]++)
208 | 	  func_file_conv "$1"
209 | 	  set x "$@" -Tp"$file"  # these source suffixes are passed with an explicit -Tp prefix
210 | 	  shift
211 | 	  ;;
212 | 	*.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO])
213 | 	  func_file_conv "$1" mingw
214 | 	  set x "$@" "$file"
215 | 	  shift
216 | 	  ;;
217 | 	*)
218 | 	  set x "$@" "$1"  # includes the compiler name itself, which is the first word
219 | 	  shift
220 | 	  ;;
221 |       esac
222 |     fi
223 |     shift  # drop the original word that this pass examined
224 |   done
225 |   if test -n "$linker_opts"; then
226 |     linker_opts="-link$linker_opts"  # $linker_opts starts with a blank, so this yields "-link OPT..."
227 |   fi
228 |   exec "$@" $linker_opts  # $linker_opts is deliberately unquoted so it splits into separate words
229 |   exit 1  # only reached if the exec itself failed
230 | }
231 | 
232 | eat=  # shared "skip the next word" flag used by the argument loops
233 | 
234 | case $1 in  # handle empty invocation, --help, --version, and hand cl-style compilers to func_cl_wrapper
235 |   '')
236 |      echo "$0: No command.  Try '$0 --help' for more information." 1>&2
237 |      exit 1;
238 |      ;;
239 |   -h | --h*)
240 |     cat <<\EOF
241 | Usage: compile [--help] [--version] PROGRAM [ARGS]
242 | 
243 | Wrapper for compilers which do not understand '-c -o'.
244 | Remove '-o dest.o' from ARGS, run PROGRAM with the remaining
245 | arguments, and rename the output as expected.
246 | 
247 | If you are trying to build a whole package this is not the
248 | right script to run: please start by reading the file 'INSTALL'.
249 | 
250 | Report bugs to <bug-automake@gnu.org>.
251 | EOF
252 |     exit $?  # status of the cat above
253 |     ;;
254 |   -v | --v*)
255 |     echo "compile $scriptversion"
256 |     exit $?  # status of the echo above
257 |     ;;
258 |   cl | *[/\\]cl | cl.exe | *[/\\]cl.exe )  # PROGRAM is cl or cl.exe, optionally behind a / or \ path
259 |     func_cl_wrapper "$@"      # Doesn't return...
260 |     ;;
261 | esac
262 | 
263 | ofile=  # object name requested with -o, if it ends in .o or .obj
264 | cfile=  # last *.c argument seen
265 | 
266 | for arg  # rotate "$@": every kept word is re-appended, then the original is shifted off the front
267 | do
268 |   if test -n "$eat"; then
269 |     eat=  # this word was the operand of a preceding -o and is already handled
270 |   else
271 |     case $1 in
272 |       -o)
273 | 	# configure might choose to run compile as 'compile cc -o foo foo.c'.
274 | 	# So we strip '-o arg' only if arg is an object.
275 | 	eat=1
276 | 	case $2 in
277 | 	  *.o | *.obj)
278 | 	    ofile=$2  # remembered and removed from the command line
279 | 	    ;;
280 | 	  *)
281 | 	    set x "$@" -o "$2"  # not an object: keep '-o arg' in the command
282 | 	    shift
283 | 	    ;;
284 | 	esac
285 | 	;;
286 |       *.c)
287 | 	cfile=$1
288 | 	set x "$@" "$1"
289 | 	shift
290 | 	;;
291 |       *)
292 | 	set x "$@" "$1"
293 | 	shift
294 | 	;;
295 |     esac
296 |   fi
297 |   shift  # drop the original word that this pass examined
298 | done
299 | 
300 | if test -z "$ofile" || test -z "$cfile"; then
301 |   # If no '-o' option was seen then we might have been invoked from a
302 |   # pattern rule where we don't need one.  That is ok -- this is a
303 |   # normal compilation that the losing compiler can handle.  If no
304 |   # '.c' file was seen then we are probably linking.  That is also
305 |   # ok.
306 |   exec "$@"
307 | fi
308 | 
309 | # Name of file we expect compiler to create: $cfile without directory or drive letter, with .c replaced by .o.
310 | cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'`
311 | 
312 | # Create the lock directory.
313 | # Note: use '[/\\:.-]' here to ensure that we don't use the same name
314 | # that we are using for the .o file.  Also, base the name on the expected
315 | # object file name, since that is what matters with a parallel build.
316 | lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d
317 | while true; do  # retry once a second until mkdir succeeds, i.e. until no other holder of the lock remains
318 |   if mkdir "$lockdir" >/dev/null 2>&1; then
319 |     break
320 |   fi
321 |   sleep 1
322 | done
323 | # FIXME: race condition here if user kills between mkdir and trap.
324 | trap "rmdir '$lockdir'; exit 1" 1 2 15
325 | 
326 | # Run the compile.
327 | "$@"
328 | ret=$?  # saved now so the rename and cleanup below cannot overwrite it
329 | 
330 | if test -f "$cofile"; then
331 |   test "$cofile" = "$ofile" || mv "$cofile" "$ofile"
332 | elif test -f "${cofile}bj"; then  # "${cofile}bj" is the same name with an .obj suffix
333 |   test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile"
334 | fi
335 | 
336 | rmdir "$lockdir"
337 | exit $ret  # report the compiler's status, not that of mv or rmdir
338 | 
339 | # Local Variables:
340 | # mode: shell-script
341 | # sh-indentation: 2
342 | # eval: (add-hook 'write-file-hooks 'time-stamp)
343 | # time-stamp-start: "scriptversion="
344 | # time-stamp-format: "%:y-%02m-%02d.%02H"
345 | # time-stamp-time-zone: "UTC"
346 | # time-stamp-end: "; # UTC"
347 | # End:
348 | 


--------------------------------------------------------------------------------
/.autoconf/m4/ltsugar.m4:
--------------------------------------------------------------------------------
  1 | # ltsugar.m4 -- libtool m4 base layer.                         -*-Autoconf-*-
  2 | #
  3 | # Copyright (C) 2004-2005, 2007-2008, 2011-2015 Free Software
  4 | # Foundation, Inc.
  5 | # Written by Gary V. Vaughan, 2004
  6 | #
  7 | # This file is free software; the Free Software Foundation gives
  8 | # unlimited permission to copy and/or distribute it, with or without
  9 | # modifications, as long as this notice is preserved.
 10 | 
 11 | # serial 6 ltsugar.m4
 12 | 
 13 | # This is to help aclocal find these macros, as it can't see m4_define.
 14 | AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])])
 15 | 
 16 | 
 17 | # lt_join(SEP, ARG1, [ARG2...])
 18 | # -----------------------------
 19 | # Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their
 20 | # associated separator.
 21 | # Needed until we can rely on m4_join from Autoconf 2.62, since all earlier
 22 | # versions in m4sugar had bugs.
 23 | m4_define([lt_join],
 24 | [m4_if([$#], [1], [],
 25 |        [$#], [2], [[$2]],
 26 |        [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])])
 27 | m4_define([_lt_join],
 28 | [m4_if([$#$2], [2], [],
 29 |        [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])])
 30 | 
 31 | 
 32 | # lt_car(LIST)
 33 | # lt_cdr(LIST)
 34 | # ------------
 35 | # Manipulate m4 lists.
 36 | # These macros are necessary as long as we still need to support
 37 | # Autoconf-2.59, which quotes differently.
 38 | m4_define([lt_car], [[$1]])
 39 | m4_define([lt_cdr],
 40 | [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])],
 41 |        [$#], 1, [],
 42 |        [m4_dquote(m4_shift($@))])])
 43 | m4_define([lt_unquote], $1)
 44 | 
 45 | 
 46 | # lt_append(MACRO-NAME, STRING, [SEPARATOR])
 47 | # ------------------------------------------
 48 | # Redefine MACRO-NAME to hold its former content plus 'SEPARATOR''STRING'.
 49 | # Note that neither SEPARATOR nor STRING are expanded; they are appended
 50 | # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked).
 51 | # No SEPARATOR is output if MACRO-NAME was previously undefined (different
 52 | # than defined and empty).
 53 | #
 54 | # This macro is needed until we can rely on Autoconf 2.62, since earlier
 55 | # versions of m4sugar mistakenly expanded SEPARATOR but not STRING.
 56 | m4_define([lt_append],
 57 | [m4_define([$1],
 58 | 	   m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])])
 59 | 
 60 | 
 61 | 
 62 | # lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...])
 63 | # ----------------------------------------------------------
 64 | # Produce a SEP delimited list of all paired combinations of elements of
 65 | # PREFIX-LIST with SUFFIX1 through SUFFIXn.  Each element of the list
 66 | # has the form PREFIXmINFIXSUFFIXn.
 67 | # Needed until we can rely on m4_combine added in Autoconf 2.62.
 68 | m4_define([lt_combine],
 69 | [m4_if(m4_eval([$# > 3]), [1],
 70 |        [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl
 71 | [[m4_foreach([_Lt_prefix], [$2],
 72 | 	     [m4_foreach([_Lt_suffix],
 73 | 		]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[,
 74 | 	[_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])])
 75 | 
 76 | 
 77 | # lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ])
 78 | # -----------------------------------------------------------------------
 79 | # Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited
 80 | # by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ.
 81 | m4_define([lt_if_append_uniq],
 82 | [m4_ifdef([$1],
 83 | 	  [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1],
 84 | 		 [lt_append([$1], [$2], [$3])$4],
 85 | 		 [$5])],
 86 | 	  [lt_append([$1], [$2], [$3])$4])])
 87 | 
 88 | 
 89 | # lt_dict_add(DICT, KEY, VALUE)
 90 | # -----------------------------
 91 | m4_define([lt_dict_add],
 92 | [m4_define([$1($2)], [$3])])
 93 | 
 94 | 
 95 | # lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE)
 96 | # --------------------------------------------
 97 | m4_define([lt_dict_add_subkey],
 98 | [m4_define([$1($2:$3)], [$4])])
 99 | 
100 | 
101 | # lt_dict_fetch(DICT, KEY, [SUBKEY])
102 | # ----------------------------------
103 | m4_define([lt_dict_fetch],
104 | [m4_ifval([$3],
105 | 	m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]),
106 |     m4_ifdef([$1($2)], [m4_defn([$1($2)])]))])
107 | 
108 | 
109 | # lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE])
110 | # -----------------------------------------------------------------
111 | m4_define([lt_if_dict_fetch],
112 | [m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4],
113 | 	[$5],
114 |     [$6])])
115 | 
116 | 
117 | # lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...])
118 | # --------------------------------------------------------------
119 | m4_define([lt_dict_filter],
120 | [m4_if([$5], [], [],
121 |   [lt_join(m4_quote(m4_default([$4], [[, ]])),
122 |            lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]),
123 | 		      [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl
124 | ])
125 | 


--------------------------------------------------------------------------------
/.autoconf/m4/ltversion.m4:
--------------------------------------------------------------------------------
 1 | # ltversion.m4 -- version numbers			-*- Autoconf -*-
 2 | #
 3 | #   Copyright (C) 2004, 2011-2015 Free Software Foundation, Inc.
 4 | #   Written by Scott James Remnant, 2004
 5 | #
 6 | # This file is free software; the Free Software Foundation gives
 7 | # unlimited permission to copy and/or distribute it, with or without
 8 | # modifications, as long as this notice is preserved.
 9 | 
10 | # @configure_input@
11 | 
12 | # serial 4179 ltversion.m4
13 | # This file is part of GNU Libtool
14 | 
15 | m4_define([LT_PACKAGE_VERSION], [2.4.6])
16 | m4_define([LT_PACKAGE_REVISION], [2.4.6])
17 | 
18 | AC_DEFUN([LTVERSION_VERSION],
19 | [macro_version='2.4.6'
20 | macro_revision='2.4.6'
21 | _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?])
22 | _LT_DECL(, macro_revision, 0)
23 | ])
24 | 


--------------------------------------------------------------------------------
/.autoconf/m4/lt~obsolete.m4:
--------------------------------------------------------------------------------
  1 | # lt~obsolete.m4 -- aclocal satisfying obsolete definitions.    -*-Autoconf-*-
  2 | #
  3 | #   Copyright (C) 2004-2005, 2007, 2009, 2011-2015 Free Software
  4 | #   Foundation, Inc.
  5 | #   Written by Scott James Remnant, 2004.
  6 | #
  7 | # This file is free software; the Free Software Foundation gives
  8 | # unlimited permission to copy and/or distribute it, with or without
  9 | # modifications, as long as this notice is preserved.
 10 | 
 11 | # serial 5 lt~obsolete.m4
 12 | 
 13 | # These exist entirely to fool aclocal when bootstrapping libtool.
 14 | #
 15 | # In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN),
 16 | # which have later been changed to m4_define as they aren't part of the
 17 | # exported API, or moved to Autoconf or Automake where they belong.
 18 | #
 19 | # The trouble is, aclocal is a bit thick.  It'll see the old AC_DEFUN
 20 | # in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us
 21 | # using a macro with the same name in our local m4/libtool.m4 it'll
 22 | # pull the old libtool.m4 in (it doesn't see our shiny new m4_define
 23 | # and doesn't know about Autoconf macros at all.)
 24 | #
 25 | # So we provide this file, which has a silly filename so it's always
 26 | # included after everything else.  This provides aclocal with the
 27 | # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything
 28 | # because those macros already exist, or will be overwritten later.
 29 | # We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6.
 30 | #
 31 | # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here.
 32 | # Yes, that means every name once taken will need to remain here until
 33 | # we give up compatibility with versions before 1.7, at which point
 34 | # we need to keep only those names which we still refer to.
 35 | 
 36 | # This is to help aclocal find these macros, as it can't see m4_define.
 37 | AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])])
 38 | 
 39 | m4_ifndef([AC_LIBTOOL_LINKER_OPTION],	[AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])])
 40 | m4_ifndef([AC_PROG_EGREP],		[AC_DEFUN([AC_PROG_EGREP])])
 41 | m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])])
 42 | m4_ifndef([_LT_AC_SHELL_INIT],		[AC_DEFUN([_LT_AC_SHELL_INIT])])
 43 | m4_ifndef([_LT_AC_SYS_LIBPATH_AIX],	[AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])])
 44 | m4_ifndef([_LT_PROG_LTMAIN],		[AC_DEFUN([_LT_PROG_LTMAIN])])
 45 | m4_ifndef([_LT_AC_TAGVAR],		[AC_DEFUN([_LT_AC_TAGVAR])])
 46 | m4_ifndef([AC_LTDL_ENABLE_INSTALL],	[AC_DEFUN([AC_LTDL_ENABLE_INSTALL])])
 47 | m4_ifndef([AC_LTDL_PREOPEN],		[AC_DEFUN([AC_LTDL_PREOPEN])])
 48 | m4_ifndef([_LT_AC_SYS_COMPILER],	[AC_DEFUN([_LT_AC_SYS_COMPILER])])
 49 | m4_ifndef([_LT_AC_LOCK],		[AC_DEFUN([_LT_AC_LOCK])])
 50 | m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE],	[AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])])
 51 | m4_ifndef([_LT_AC_TRY_DLOPEN_SELF],	[AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])])
 52 | m4_ifndef([AC_LIBTOOL_PROG_CC_C_O],	[AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])])
 53 | m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])])
 54 | m4_ifndef([AC_LIBTOOL_OBJDIR],		[AC_DEFUN([AC_LIBTOOL_OBJDIR])])
 55 | m4_ifndef([AC_LTDL_OBJDIR],		[AC_DEFUN([AC_LTDL_OBJDIR])])
 56 | m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])])
 57 | m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP],	[AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])])
 58 | m4_ifndef([AC_PATH_MAGIC],		[AC_DEFUN([AC_PATH_MAGIC])])
 59 | m4_ifndef([AC_PROG_LD_GNU],		[AC_DEFUN([AC_PROG_LD_GNU])])
 60 | m4_ifndef([AC_PROG_LD_RELOAD_FLAG],	[AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])])
 61 | m4_ifndef([AC_DEPLIBS_CHECK_METHOD],	[AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])])
 62 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])])
 63 | m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])])
 64 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])])
 65 | m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS],	[AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])])
 66 | m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP],	[AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])])
 67 | m4_ifndef([LT_AC_PROG_EGREP],		[AC_DEFUN([LT_AC_PROG_EGREP])])
 68 | m4_ifndef([LT_AC_PROG_SED],		[AC_DEFUN([LT_AC_PROG_SED])])
 69 | m4_ifndef([_LT_CC_BASENAME],		[AC_DEFUN([_LT_CC_BASENAME])])
 70 | m4_ifndef([_LT_COMPILER_BOILERPLATE],	[AC_DEFUN([_LT_COMPILER_BOILERPLATE])])
 71 | m4_ifndef([_LT_LINKER_BOILERPLATE],	[AC_DEFUN([_LT_LINKER_BOILERPLATE])])
 72 | m4_ifndef([_AC_PROG_LIBTOOL],		[AC_DEFUN([_AC_PROG_LIBTOOL])])
 73 | m4_ifndef([AC_LIBTOOL_SETUP],		[AC_DEFUN([AC_LIBTOOL_SETUP])])
 74 | m4_ifndef([_LT_AC_CHECK_DLFCN],		[AC_DEFUN([_LT_AC_CHECK_DLFCN])])
 75 | m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER],	[AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])])
 76 | m4_ifndef([_LT_AC_TAGCONFIG],		[AC_DEFUN([_LT_AC_TAGCONFIG])])
 77 | m4_ifndef([AC_DISABLE_FAST_INSTALL],	[AC_DEFUN([AC_DISABLE_FAST_INSTALL])])
 78 | m4_ifndef([_LT_AC_LANG_CXX],		[AC_DEFUN([_LT_AC_LANG_CXX])])
 79 | m4_ifndef([_LT_AC_LANG_F77],		[AC_DEFUN([_LT_AC_LANG_F77])])
 80 | m4_ifndef([_LT_AC_LANG_GCJ],		[AC_DEFUN([_LT_AC_LANG_GCJ])])
 81 | m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])])
 82 | m4_ifndef([_LT_AC_LANG_C_CONFIG],	[AC_DEFUN([_LT_AC_LANG_C_CONFIG])])
 83 | m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])])
 84 | m4_ifndef([_LT_AC_LANG_CXX_CONFIG],	[AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])])
 85 | m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])])
 86 | m4_ifndef([_LT_AC_LANG_F77_CONFIG],	[AC_DEFUN([_LT_AC_LANG_F77_CONFIG])])
 87 | m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])])
 88 | m4_ifndef([_LT_AC_LANG_GCJ_CONFIG],	[AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])])
 89 | m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG],	[AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])])
 90 | m4_ifndef([_LT_AC_LANG_RC_CONFIG],	[AC_DEFUN([_LT_AC_LANG_RC_CONFIG])])
 91 | m4_ifndef([AC_LIBTOOL_CONFIG],		[AC_DEFUN([AC_LIBTOOL_CONFIG])])
 92 | m4_ifndef([_LT_AC_FILE_LTDLL_C],	[AC_DEFUN([_LT_AC_FILE_LTDLL_C])])
 93 | m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS],	[AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])])
 94 | m4_ifndef([_LT_AC_PROG_CXXCPP],		[AC_DEFUN([_LT_AC_PROG_CXXCPP])])
 95 | m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS],	[AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])])
 96 | m4_ifndef([_LT_PROG_ECHO_BACKSLASH],	[AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])])
 97 | m4_ifndef([_LT_PROG_F77],		[AC_DEFUN([_LT_PROG_F77])])
 98 | m4_ifndef([_LT_PROG_FC],		[AC_DEFUN([_LT_PROG_FC])])
 99 | m4_ifndef([_LT_PROG_CXX],		[AC_DEFUN([_LT_PROG_CXX])])
100 | 


--------------------------------------------------------------------------------
/.autoconf/missing:
--------------------------------------------------------------------------------
  1 | #! /bin/sh
  2 | # Common wrapper for a few potentially missing GNU programs.
  3 | 
  4 | scriptversion=2013-10-28.13; # UTC
  5 | 
  6 | # Copyright (C) 1996-2014 Free Software Foundation, Inc.
  7 | # Originally written by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
  8 | 
  9 | # This program is free software; you can redistribute it and/or modify
 10 | # it under the terms of the GNU General Public License as published by
 11 | # the Free Software Foundation; either version 2, or (at your option)
 12 | # any later version.
 13 | 
 14 | # This program is distributed in the hope that it will be useful,
 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17 | # GNU General Public License for more details.
 18 | 
 19 | # You should have received a copy of the GNU General Public License
 20 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 21 | 
 22 | # As a special exception to the GNU General Public License, if you
 23 | # distribute this file as part of a program that contains a
 24 | # configuration script generated by Autoconf, you may include it under
 25 | # the same distribution terms that you use for the rest of that program.
 26 | 
 27 | if test $# -eq 0; then
 28 |   echo 1>&2 "Try '$0 --help' for more information"
 29 |   exit 1
 30 | fi
 31 | 
 32 | case $1 in
 33 | 
 34 |   --is-lightweight)
 35 |     # Used by our autoconf macros to check whether the available missing
 36 |     # script is modern enough.
 37 |     exit 0
 38 |     ;;
 39 | 
 40 |   --run)
 41 |     # Back-compat with the calling convention used by older automake.
 42 |     shift
 43 |     ;;
 44 | 
 45 |   -h|--h|--he|--hel|--help)
 46 |     echo "\
 47 | $0 [OPTION]... PROGRAM [ARGUMENT]...
 48 | 
 49 | Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due
 50 | to PROGRAM being missing or too old.
 51 | 
 52 | Options:
 53 |   -h, --help      display this help and exit
 54 |   -v, --version   output version information and exit
 55 | 
 56 | Supported PROGRAM values:
 57 |   aclocal   autoconf  autoheader   autom4te  automake  makeinfo
 58 |   bison     yacc      flex         lex       help2man
 59 | 
 60 | Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and
 61 | 'g' are ignored when checking the name.
 62 | 
 63 | Send bug reports to <bug-automake@gnu.org>."
 64 |     exit $?
 65 |     ;;
 66 | 
 67 |   -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
 68 |     echo "missing $scriptversion (GNU Automake)"
 69 |     exit $?
 70 |     ;;
 71 | 
 72 |   -*)
 73 |     echo 1>&2 "$0: unknown '$1' option"
 74 |     echo 1>&2 "Try '$0 --help' for more information"
 75 |     exit 1
 76 |     ;;
 77 | 
 78 | esac
 79 | 
 80 | # Run the given program, remember its exit status.
 81 | "$@"; st=$?
 82 | 
 83 | # If it succeeded, we are done.
 84 | test $st -eq 0 && exit 0
 85 | 
 86 | # Also exit now if it failed (or wasn't found), and '--version' was
 87 | # passed; such an option is passed most likely to detect whether the
 88 | # program is present and works.
 89 | case $2 in --version|--help) exit $st;; esac
 90 | 
 91 | # Exit code 63 means version mismatch.  This often happens when the user
 92 | # tries to use an ancient version of a tool on a file that requires a
 93 | # minimum version.
 94 | if test $st -eq 63; then
 95 |   msg="probably too old"
 96 | elif test $st -eq 127; then
 97 |   # Program was missing.
 98 |   msg="missing on your system"
 99 | else
100 |   # Program was found and executed, but failed.  Give up.
101 |   exit $st
102 | fi
103 | 
104 | perl_URL=http://www.perl.org/
105 | flex_URL=http://flex.sourceforge.net/
106 | gnu_software_URL=http://www.gnu.org/software
107 | 
108 | program_details ()
109 | {
110 |   case $1 in
111 |     aclocal|automake)
112 |       echo "The '$1' program is part of the GNU Automake package:"
113 |       echo "<$gnu_software_URL/automake>"
114 |       echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:"
115 |       echo "<$gnu_software_URL/autoconf>"
116 |       echo "<$gnu_software_URL/m4/>"
117 |       echo "<$perl_URL>"
118 |       ;;
119 |     autoconf|autom4te|autoheader)
120 |       echo "The '$1' program is part of the GNU Autoconf package:"
121 |       echo "<$gnu_software_URL/autoconf/>"
122 |       echo "It also requires GNU m4 and Perl in order to run:"
123 |       echo "<$gnu_software_URL/m4/>"
124 |       echo "<$perl_URL>"
125 |       ;;
126 |   esac
127 | }
128 | 
129 | give_advice ()
130 | {
131 |   # Normalize program name to check for.
132 |   normalized_program=`echo "$1" | sed '
133 |     s/^gnu-//; t
134 |     s/^gnu//; t
135 |     s/^g//; t'`
136 | 
137 |   printf '%s\n' "'$1' is $msg."
138 | 
139 |   configure_deps="'configure.ac' or m4 files included by 'configure.ac'"
140 |   case $normalized_program in
141 |     autoconf*)
142 |       echo "You should only need it if you modified 'configure.ac',"
143 |       echo "or m4 files included by it."
144 |       program_details 'autoconf'
145 |       ;;
146 |     autoheader*)
147 |       echo "You should only need it if you modified 'acconfig.h' or"
148 |       echo "$configure_deps."
149 |       program_details 'autoheader'
150 |       ;;
151 |     automake*)
152 |       echo "You should only need it if you modified 'Makefile.am' or"
153 |       echo "$configure_deps."
154 |       program_details 'automake'
155 |       ;;
156 |     aclocal*)
157 |       echo "You should only need it if you modified 'acinclude.m4' or"
158 |       echo "$configure_deps."
159 |       program_details 'aclocal'
160 |       ;;
161 |    autom4te*)
162 |       echo "You might have modified some maintainer files that require"
163 |       echo "the 'autom4te' program to be rebuilt."
164 |       program_details 'autom4te'
165 |       ;;
166 |     bison*|yacc*)
167 |       echo "You should only need it if you modified a '.y' file."
168 |       echo "You may want to install the GNU Bison package:"
169 |       echo "<$gnu_software_URL/bison/>"
170 |       ;;
171 |     lex*|flex*)
172 |       echo "You should only need it if you modified a '.l' file."
173 |       echo "You may want to install the Fast Lexical Analyzer package:"
174 |       echo "<$flex_URL>"
175 |       ;;
176 |     help2man*)
177 |       echo "You should only need it if you modified a dependency" \
178 |            "of a man page."
179 |       echo "You may want to install the GNU Help2man package:"
180 |       echo "<$gnu_software_URL/help2man/>"
181 |     ;;
182 |     makeinfo*)
183 |       echo "You should only need it if you modified a '.texi' file, or"
184 |       echo "any other file indirectly affecting the aspect of the manual."
185 |       echo "You might want to install the Texinfo package:"
186 |       echo "<$gnu_software_URL/texinfo/>"
187 |       echo "The spurious makeinfo call might also be the consequence of"
188 |       echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might"
189 |       echo "want to install GNU make:"
190 |       echo "<$gnu_software_URL/make/>"
191 |       ;;
192 |     *)
193 |       echo "You might have modified some files without having the proper"
194 |       echo "tools for further handling them.  Check the 'README' file, it"
195 |       echo "often tells you about the needed prerequisites for installing"
196 |       echo "this package.  You may also peek at any GNU archive site, in"
197 |       echo "case some other package contains this missing '$1' program."
198 |       ;;
199 |   esac
200 | }
201 | 
202 | give_advice "$1" | sed -e '1s/^/WARNING: /' \
203 |                        -e '2,$s/^/         /' >&2
204 | 
205 | # Propagate the correct exit status (expected to be 127 for a program
206 | # not found, 63 for a program that failed due to version mismatch).
207 | exit $st
208 | 
209 | # Local variables:
210 | # eval: (add-hook 'write-file-hooks 'time-stamp)
211 | # time-stamp-start: "scriptversion="
212 | # time-stamp-format: "%:y-%02m-%02d.%02H"
213 | # time-stamp-time-zone: "UTC"
214 | # time-stamp-end: "; # UTC"
215 | # End:
216 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AdolfVonKleist/Phonetisaurus/f08d3dfb10b8d619e665a9581d2a327bcc2504f7/.gitattributes


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | .*\.o$
 3 | .*\.arpa$
 4 | .*\.fst$
 5 | .*\.fst.txt$
 6 | .*\.dylib$
 7 | .*\.so
 8 | .*-binding.cc
 9 | src/bin/phonetisaurus-align
10 | src/bin/phonetisaurus-arpa2wfst
11 | src/bin/phonetisaurus-g2pfst
12 | src/bin/phonetisaurus-g2prnn
13 | src/bin/rnnlm
14 | .*\.pyc$
15 | models/
16 | .*\.egg-info/
17 | build/
18 | dist/
19 | src/data
20 | src/rnnlm.direct$
21 | *.pyc
22 | phonetisaurus.egg-info/
23 | rnnlm.egg-info/
24 | *.fst
25 | *.o
26 | *.so
27 | *-binding.cc
28 | nohup.out
29 | src/.autoconf/autom4te.cache
30 | src/.autoconf/config.log
31 | src/.autoconf/config.status
32 | src/.autoconf/configure
33 | src/Makefile
34 | html/
35 | exp/
36 | phonetisaurus.egg-info/
37 | config.log
38 | config.status
39 | autom4te.cache
40 | .libs/
41 | Makefile
42 | Phonetisaurus.la
43 | libtool
44 | phonetisaurus-align
45 | phonetisaurus-arpa2wfst
46 | phonetisaurus-g2pfst
47 | phonetisaurus-g2prnn
48 | python/.deps/
49 | python/.dirstamp
50 | python/Phonetisaurus_la-Phonetisaurus-binding.lo
51 | rnnlm
52 | src/3rdparty/rnnlm/.deps/
53 | src/3rdparty/rnnlm/.dirstamp
54 | src/bin/.deps/
55 | src/bin/.dirstamp
56 | src/lib/.deps/
57 | src/lib/.dirstamp
58 | src/lib/Phonetisaurus_la-util.lo
59 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: cpp
 2 | 
 3 | branches:
 4 |   only:
 5 |     - master
 6 | 
 7 | sudo: required
 8 | dist: trusty
 9 | group: edge
10 | 
11 | os:
12 |   - linux
13 | 
14 | compiler:
15 |   - gcc
16 | 
17 | env:
18 |   - CROSSCOMPILE=native
19 | 
20 | addons:
21 |   apt:
22 |     packages:
23 |       - zlib1g-dev
24 |       - wget
25 |       - python-dev
26 | 
27 | matrix:
28 |   include:
29 |     - os: linux
30 |       compiler: gcc
31 |       addons:
32 |         apt:
33 |           sources: ['ubuntu-toolchain-r-test']
34 |           packages: ['g++-4.9','zlib1g-dev','wget','python-dev', 'python-pip']
35 |       env:
36 |         - CXXCOMPILER=g++-4.9
37 | 
38 | before_install:
39 |   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update          ; fi
40 |   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install openfst ; fi
41 |   - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then wget https://github.com/AdolfVonKleist/packages/raw/master/Ubuntu-14.04/debs/openfst_1.6.2_amd64-trusty.deb ; fi
42 |   - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then sudo dpkg -i openfst_1.6.2_amd64-trusty.deb ; fi
43 |   - if [ ! -z "$CXXCOMPILER" ]; then export CXX="$CXXCOMPILER"; fi
44 | 
45 | install:
46 |   - pip install --user pybindgen
47 | 
48 | script:
49 |   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then ./configure --enable-python && make -j2 && ( ./phonetisaurus-g2pfst --help || echo ) && cd python && cp ../.libs/Phonetisaurus.so . && python setup.py build ; fi
50 |   - if [[ "$TRAVIS_OS_NAME" != "osx" ]]; then ./configure --enable-python && make -j2 && ./phonetisaurus-g2pfst --help && cd python && cp ../.libs/Phonetisaurus.so . && python setup.py build ; fi
51 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3 as build
 2 | 
 3 | WORKDIR /build
 4 | 
 5 | RUN apt-get -y update && apt-get -y install git g++ autoconf-archive make libtool gfortran tar gawk
 6 | 
 7 | RUN wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.6.2.tar.gz && \
 8 |     tar -xvzf openfst-1.6.2.tar.gz && \
 9 |     cd openfst-1.6.2 && \
10 |     ./configure --enable-static --enable-shared --enable-far --enable-ngram-fsts && \
11 |     make -j $(nproc) && \
12 |     make install && \
13 |     ldconfig
14 | 
15 | RUN git clone https://github.com/mitlm/mitlm && \
16 | 	cd mitlm && \
17 | 	autoreconf -i && \
18 | 	./configure && \
19 | 	make -j $(nproc) && \
20 | 	make install
21 | 
22 | WORKDIR /build/phonetisaurus
23 | 
24 | COPY . ./
25 | 
26 | RUN pip3 install pybindgen
27 | 
28 | RUN ./configure --enable-python && \
29 |     make -j $(nproc) && \
30 |     make install 
31 | 
32 | FROM python:3-slim
33 | 
34 | RUN apt-get -y update && apt-get -y install gfortran && apt-get -y clean && apt-get -y autoclean
35 | 
36 | WORKDIR /setup
37 | 
38 | COPY --from=build /build/phonetisaurus/python ./
39 | COPY --from=build /build/phonetisaurus/.libs/Phonetisaurus.so ./
40 | 
41 | RUN python setup.py install
42 | 
43 | COPY --from=build /usr/local/lib/fst /usr/local/lib/fst
44 | COPY --from=build /usr/local/lib/libfst*so*0 /usr/local/lib/
45 | COPY --from=build /usr/local/bin/phonetisaurus* /usr/local/bin/
46 | COPY --from=build /build/phonetisaurus/src/scripts/* /usr/local/bin/
47 | COPY --from=build /usr/local/bin/rnnlm /usr/local/bin/
48 | COPY --from=build /usr/local/bin/estimate-ngram /usr/local/bin/
49 | COPY --from=build /usr/local/lib/libmitlm.so.1.0.0 /usr/local/lib
50 | 
51 | RUN ldconfig
52 | 
53 | WORKDIR /work
54 | 
55 | ENTRYPOINT [ "/bin/bash" , "-c" ]
56 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, Josef Novak
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |     and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |     this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/Makefile.am:
--------------------------------------------------------------------------------
  1 | AUTOMAKE_OPTIONS = subdir-objects
  2 | ACLOCAL_AMFLAGS = -I .autoconf/m4
  3 | 
  4 | EXTRA_DIST=               	                       			  \
  5 | 	$(srcdir)/LICENSE						  \
  6 | 	$(srcdir)/README.md						  \
  7 | 	$(srcdir)/docs/doxygen.cfg					  \
  8 | 	$(srcdir)/docs/mainpage.dox					  \
  9 | 	$(srcdir)/src/3rdparty/rnnlm/COPYRIGHT.txt			  \
 10 | 	$(srcdir)/src/3rdparty/utfcpp/utf8/checked.h			  \
 11 | 	$(srcdir)/src/3rdparty/utfcpp/utf8/core.h			  \
 12 | 	$(srcdir)/src/3rdparty/utfcpp/utf8/unchecked.h			  \
 13 | 	$(srcdir)/src/3rdparty/utfcpp/utf8.h				  \
 14 | 	$(srcdir)/src/configure						  \
 15 | 	$(srcdir)/src/lib/feature-reader.cc				  \
 16 | 	$(srcdir)/src/include/RnnLMPy.h					  \
 17 | 	$(srcdir)/python/phonetisaurus-module.py			  \
 18 | 	$(srcdir)/python/script/phoneticize.py				  \
 19 | 	$(srcdir)/python/script/demo.html				  \
 20 | 	$(srcdir)/python/script/words.list				  \
 21 | 	$(srcdir)/python/script/g2pserver.py				  \
 22 | 	$(srcdir)/python/phonetisaurus/__init__.py			  \
 23 | 	$(srcdir)/python/pybindgen/__init__.py				  \
 24 | 	$(srcdir)/python/pybindgen/cppclass_typehandlers.py		  \
 25 | 	$(srcdir)/python/pybindgen/cppexception.py			  \
 26 | 	$(srcdir)/python/pybindgen/cppmethod.py				  \
 27 | 	$(srcdir)/python/pybindgen/enum.py				  \
 28 | 	$(srcdir)/python/pybindgen/wscript				  \
 29 | 	$(srcdir)/python/pybindgen/cppclass.py				  \
 30 | 	$(srcdir)/python/pybindgen/cppclass_container.py		  \
 31 | 	$(srcdir)/python/pybindgen/settings.py				  \
 32 | 	$(srcdir)/python/pybindgen/function.py				  \
 33 | 	$(srcdir)/python/pybindgen/utils.py				  \
 34 | 	$(srcdir)/python/pybindgen/module.py				  \
 35 | 	$(srcdir)/python/pybindgen/typehandlers/__init__.py		  \
 36 | 	$(srcdir)/python/pybindgen/typehandlers/codesink.py		  \
 37 | 	$(srcdir)/python/pybindgen/typehandlers/ctypeparser/__init__.py	  \
 38 | 	$(srcdir)/python/pybindgen/typehandlers/ctypeparser/tokenizer.py  \
 39 | 	$(srcdir)/python/pybindgen/typehandlers/stringtype.py		  \
 40 | 	$(srcdir)/python/pybindgen/typehandlers/pyobjecttype.py		  \
 41 | 	$(srcdir)/python/pybindgen/typehandlers/inttype.py		  \
 42 | 	$(srcdir)/python/pybindgen/typehandlers/doubletype.py		  \
 43 | 	$(srcdir)/python/pybindgen/typehandlers/voidtype.py		  \
 44 | 	$(srcdir)/python/pybindgen/typehandlers/floattype.py		  \
 45 | 	$(srcdir)/python/pybindgen/typehandlers/base.py			  \
 46 | 	$(srcdir)/python/pybindgen/typehandlers/booltype.py		  \
 47 | 	$(srcdir)/python/pybindgen/cppattribute.py			  \
 48 | 	$(srcdir)/python/pybindgen/gccxmlparser.py			  \
 49 | 	$(srcdir)/python/pybindgen/overloading.py			  \
 50 | 	$(srcdir)/python/pybindgen/pytypeobject.py			  \
 51 | 	$(srcdir)/python/pybindgen/container.py				  \
 52 | 	$(srcdir)/python/pybindgen/converter_functions.py		  \
 53 | 	$(srcdir)/python/pybindgen/version.py				  \
 54 | 	$(srcdir)/python/pybindgen/wrapper_registry.py			  \
 55 | 	$(srcdir)/python/setup.py					  \
 56 |         $(srcdir)/test/g014b2b/g014b2b.ref				  \
 57 |         $(srcdir)/test/g014b2b/g014b2b.train				  \
 58 |         $(srcdir)/test/g014b2b/g014b2b.words				  \
 59 |         $(srcdir)/test/check-nbest-wer.py
 60 | 
 61 | 
 62 | 
 63 | dist_bin_SCRIPTS =                          \
 64 | 	src/scripts/phonetisaurus-apply     \
 65 | 	src/scripts/phonetisaurus-train
 66 | 
 67 | bin_PROGRAMS =                     \
 68 | 	phonetisaurus-align	   \
 69 | 	phonetisaurus-arpa2wfst    \
 70 | 	phonetisaurus-g2pfst       \
 71 | 	phonetisaurus-g2prnn       \
 72 | 	rnnlm
 73 | 
 74 | AM_CPPFLAGS = -I$(top_srcdir)/src -DGIT_REVISION=\"$(GIT_REVISION)\" -std=c++0x -Wall -Wno-sign-compare -Wno-unused-local-typedefs
 75 | 
 76 | phonetisaurus_align_SOURCES = src/bin/phonetisaurus-align.cc src/include/PhonetisaurusRex.h src/lib/util.cc src/include/util.h src/lib/LatticePruner.cc src/include/LatticePruner.h src/lib/M2MFstAligner.cc src/include/M2MFstAligner.h
 77 | phonetisaurus_align_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS)
 78 | phonetisaurus_align_LDADD = $(OPENFST_LDFLAGS)
 79 | 
 80 | phonetisaurus_arpa2wfst_SOURCES = src/bin/phonetisaurus-arpa2wfst.cc src/include/ARPA2WFST.h src/lib/util.cc src/include/util.h
 81 | phonetisaurus_arpa2wfst_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS)
 82 | phonetisaurus_arpa2wfst_LDADD = $(OPENFST_LDFLAGS)
 83 | 
 84 | phonetisaurus_g2pfst_SOURCES = src/bin/phonetisaurus-g2pfst.cc src/include/PhonetisaurusScript.h src/include/PhonetisaurusRex.h src/lib/util.cc src/include/util.h
 85 | phonetisaurus_g2pfst_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS) -funroll-loops -ffast-math
 86 | phonetisaurus_g2pfst_LDADD = $(OPENFST_LDFLAGS)
 87 | 
 88 | phonetisaurus_g2prnn_SOURCES = src/bin/phonetisaurus-g2prnn.cc src/include/LegacyRnnLMHash.h src/include/LegacyRnnLMDecodable.h src/include/LegacyRnnLMReader.h src/include/RnnLMDecoder.h src/lib/util.cc src/include/util.h src/3rdparty/rnnlm/rnnlmlib.cpp src/3rdparty/rnnlm/rnnlmlib.h
 89 | phonetisaurus_g2prnn_CXXFLAGS = $(OPENFST_CXXFLAGS) $(UTFCPP_CXXFLAGS) -I$(top_srcdir)/src/3rdparty/rnnlm -funroll-loops -ffast-math
 90 | phonetisaurus_g2prnn_LDADD = $(OPENFST_LDFLAGS)
 91 | if WANT_OPENMP
 92 | if OPENMP
 93 |   phonetisaurus_g2prnn_CXXFLAGS += $(OPENMP_CXXFLAGS)
 94 |   phonetisaurus_g2prnn_LDADD += $(OPENMP_LDFLAGS)
 95 | endif
 96 | endif
 97 | 
 98 | rnnlm_SOURCES = src/bin/rnnlm.cc src/3rdparty/rnnlm/rnnlmlib.cpp src/3rdparty/rnnlm/rnnlmlib.h
 99 | rnnlm_CXXFLAGS = $(OPENFST_CXXFLAGS) -I$(top_srcdir)/src/3rdparty/rnnlm -funroll-loops -ffast-math
100 | rnnlm_LDADD = $(OPENFST_LDFLAGS)
101 | 
102 | if WANT_PYTHON
103 | if HAVE_PYTHON
104 | 
105 | python/Phonetisaurus-binding.cc: $(top_srcdir)/python/phonetisaurus-module.py
106 | 	mkdir -p python
107 | 	$(PYTHON) -B $(top_srcdir)/python/phonetisaurus-module.py > python/Phonetisaurus-binding.cc
108 | 
109 | CLEANFILES = python/Phonetisaurus-binding.cc
110 | 
111 | if HAVE_PYTHON_DEV
112 | pyexec_LTLIBRARIES = Phonetisaurus.la
113 | nodist_Phonetisaurus_la_SOURCES = python/Phonetisaurus-binding.cc src/lib/util.cc src/include/util.h
114 | Phonetisaurus_la_CXXFLAGS = $(OPENFST_CXXFLAGS) $(PYTHON_CPPFLAGS) $(UTFCPP_CXXFLAGS) -funroll-loops -ffast-math
115 | Phonetisaurus_la_LIBADD = $(OPENFST_LDFLAGS) $(PYTHON_LIBS)
116 | Phonetisaurus_la_LDFLAGS = -avoid-version -module
117 | endif
118 | endif
119 | endif
120 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ## Phonetisaurus G2P ##
  2 | [![Build Status](https://travis-ci.org/AdolfVonKleist/Phonetisaurus.svg?branch=master)](https://travis-ci.org/AdolfVonKleist/Phonetisaurus)
  3 | 
  4 | This repository contains scripts suitable for training, evaluating and using grapheme-to-phoneme
  5 | models for speech recognition using the OpenFst framework.  The current build requires OpenFst
  6 | version 1.6.0 or later, and the examples below use version 1.7.2.
  7 | 
  8 | The repository includes C++ binaries suitable for training, compiling, and evaluating G2P models.
  9 | It also includes some simple python bindings which may be used to extract individual
 10 | multigram scores, alignments, and to dump the raw lattices in .fst format for each word.
 11 | 
 12 | The python scripts and bindings were tested most recently with python v3.8.5.
 13 | 
 14 | Standalone distributions related to previous INTERSPEECH papers, as well as the complete, exported
 15 | final version of the old google-code repository are available via ```git-lfs``` in a separate
 16 | repository:
 17 |   * https://github.com/AdolfVonKleist/phonetisaurus-downloads
 18 | 
 19 | #### Contact: ####
 20 |   * phonetisaurus@gmail.com
 21 | 
 22 | #### Scratch Build for OpenFst v1.7.2 and Ubuntu 20.04 ####
 23 | This build was tested via AWS EC2 with a fresh Ubuntu 20.04 base, and m4.large instance.
 24 | 
 25 | ```
 26 | $ sudo apt-get update
 27 | # Basics
 28 | $ sudo apt-get install git g++ autoconf-archive make libtool
 29 | # Python bindings
 30 | $ sudo apt-get install python-setuptools python-dev
 31 | # mitlm (to build a quick play model)
 32 | $ sudo apt-get install gfortran
 33 | ```
 34 | 
 35 | Create a work directory of your choice:
 36 | ```
 37 | $ mkdir g2p
 38 | $ cd g2p/
 39 | ```
 40 | 
 41 | Next grab and install OpenFst-1.7.2:
 42 | ```
 43 | $ wget http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-1.7.2.tar.gz
 44 | $ tar -xvzf openfst-1.7.2.tar.gz
 45 | $ cd openfst-1.7.2
 46 | # Minimal configure, compatible with current defaults for Kaldi
 47 | $ ./configure --enable-static --enable-shared --enable-far --enable-ngram-fsts
 48 | $ make -j 
 49 | # Now wait a while...
 50 | $ sudo make install
 51 | # Extend your LD_LIBRARY_PATH in .bashrc (assumes OpenFst installed to default location):
 52 | $ echo 'export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib:/usr/local/lib/fst' \
 53 |      >> ~/.bashrc
 54 | $ source ~/.bashrc
 55 | $ cd ..
 56 | ```
 57 | 
 58 | Checkout the latest Phonetisaurus from master and compile without bindings:
 59 | ```
 60 | $ git clone https://github.com/AdolfVonKleist/Phonetisaurus.git
 61 | $ cd Phonetisaurus
 62 | # if OpenFst is installed in the default location:
 63 | $ ./configure
 64 | # if OpenFst is installed in a special location:
 65 | $ ./configure \
 66 |       --with-openfst-includes=${OFST_PATH}/openfst-1.7.2/include \
 67 |       --with-openfst-libs=${OFST_PATH}/openfst-1.7.2/lib
 68 | $ make
 69 | $ sudo make install
 70 | $ cd ..
 71 | ```
 72 | 
 73 | Checkout the latest Phonetisaurus from master and compile with python3 bindings:
 74 | ```
 75 | $ git clone https://github.com/AdolfVonKleist/Phonetisaurus.git
 76 | $ cd Phonetisaurus
 77 | $ sudo pip3 install pybindgen
 78 | # if OpenFst is installed in the default location:
 79 | $ PYTHON=python3 ./configure --enable-python
 80 | # if OpenFst is installed in a special location:
 81 | $ PYTHON=python3 ./configure \
 82 |       --with-openfst-includes=${OFST_PATH}/openfst-1.7.2/include \
 83 |       --with-openfst-libs=${OFST_PATH}/openfst-1.7.2/lib \
 84 |       --enable-python
 85 | $ make
 86 | $ sudo make install
 87 | $ cd python
 88 | $ cp ../.libs/Phonetisaurus.so .
 89 | $ sudo python3 setup.py install
 90 | $ cd ../..
 91 | ```
 92 | 
 93 | Grab and install mitlm to build a quick test model with the cmudict (5m):
 94 | ```
 95 | $ git clone https://github.com/mitlm/mitlm.git
 96 | $ cd mitlm/
 97 | $ ./autogen.sh
 98 | $ make
 99 | $ sudo make install
100 | $ cd ..
101 | ```
102 | 
103 | Grab a copy of the latest version of CMUdict and clean it up a bit:
104 | ```
105 | $ mkdir example
106 | $ cd example
107 | $ wget https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict
108 | # Clean it up a bit and reformat:
109 | $ cat cmudict.dict \
110 |   | perl -pe 's/\([0-9]+\)//;
111 |               s/\s+/ /g; s/^\s+//;
112 |               s/\s+$//; @_ = split (/\s+/);
113 |               $w = shift (@_);
114 |               $_ = $w."\t".join (" ", @_)."\n";' \
115 |   > cmudict.formatted.dict
116 | ```
117 | 
118 | Train a complete model with default parameters using the wrapper script.
119 | NOTE: this assumes the tool was compiled with the python3 bindings:
120 | ```
121 | $ phonetisaurus-train --lexicon cmudict.formatted.dict --seq2_del
122 | INFO:phonetisaurus-train:2017-07-09 16:35:31:  Checking command configuration...
123 | INFO:phonetisaurus-train:2017-07-09 16:35:31:  Checking lexicon for reserved characters: '}', '|', '_'...
124 | INFO:phonetisaurus-train:2017-07-09 16:35:31:  Aligning lexicon...
125 | INFO:phonetisaurus-train:2017-07-09 16:37:44:  Training joint ngram model...
126 | INFO:phonetisaurus-train:2017-07-09 16:37:46:  Converting ARPA format joint n-gram model to WFST format...
127 | INFO:phonetisaurus-train:2017-07-09 16:37:59:  G2P training succeeded: train/model.fst
128 | ```
129 | 
130 | Generate pronunciations for a word list using the wrapper script:
131 | ```
132 | $ phonetisaurus-apply --model train/model.fst --word_list test.wlist
133 | test  T EH1 S T
134 | jumbotron  JH AH1 M B OW0 T R AA0 N
135 | excellent  EH1 K S AH0 L AH0 N T
136 | eggselent  EH1 G S L AH0 N T
137 | ```
138 | 
139 | Generate pronunciations for a word list using the wrapper script.
140 | Filter against a reference lexicon, add n-best, enable greedy mode,
141 | and run in verbose mode:
142 | ```
143 | $ phonetisaurus-apply --model train/model.fst --word_list test.wlist -n 2 -g -v -l cmudict.formatted.dict
144 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  Checking command configuration...
145 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  beam:  10000
146 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  greedy:  True
147 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  lexicon_file:  cmudict.formatted.dict
148 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  model:  train/model.fst
149 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  nbest:  2
150 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  thresh:  99.0
151 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  verbose:  True
152 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  Loading lexicon from file...
153 | DEBUG:phonetisaurus-apply:2017-07-09 16:48:22:  Applying G2P model...
154 | GitRevision: kaldi-1-g5028ba-dirty
155 | eggselent  26.85  EH1 G S L AH0 N T
156 | eggselent  28.12  EH1 G Z L AH0 N T
157 | excellent  0.00  EH1 K S AH0 L AH0 N T
158 | excellent  19.28  EH1 K S L EH1 N T
159 | jumbotron  0.00  JH AH1 M B OW0 T R AA0 N
160 | jumbotron  17.30  JH AH1 M B OW0 T R AA2 N
161 | test  0.00  T EH1 S T
162 | test  11.56  T EH2 S T
163 | ```
164 | 
165 | Generate pronunciations using the alternative % of total probability mass constraint,
166 | and print the resulting scores as human readable, normalized probabilities rather than
167 | raw negative log scores:
168 | ```
169 | $ phonetisaurus-apply --model train/model.fst --word_list Phonetisaurus/script/words.list -v -a -p 0.85 -pr
170 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  Checking command configuration...
171 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  accumulate:  True
172 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  beam:  10000
173 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  greedy:  False
174 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  lexicon_file:  None
175 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  logger:  <logging.Logger object at 0x7fdaa93d2410>
176 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  model:  train/model.fst
177 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  nbest:  100
178 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  pmass:  0.85
179 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  probs:  True
180 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  thresh:  99.0
181 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  verbose:  True
182 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  phonetisaurus-g2pfst --model=train/model.fst --nbest=100 --beam=10000 --thresh=99.0 --accumulate=true --pmass=0.85 --nlog_probs=false --wordlist=Phonetisaurus/script/words.list
183 | DEBUG:phonetisaurus-apply:2017-07-30 11:55:58:  Applying G2P model...
184 | GitRevision: kaldi-2-g6e7c04-dirty
185 | test  0.68  T EH1 S T
186 | test  0.21  T EH2 S T
187 | right  0.81  R AY1 T
188 | right  0.13  R AY0 T
189 | junkify  0.64  JH AH1 NG K AH0 F AY2
190 | junkify  0.23  JH AH1 NG K IH0 F AY2
191 | ```
192 | 
193 | Align, estimate, and convert a joint n-gram model step-by-step:
194 | ```
195 | # Align the dictionary (5m-10m)
196 | $ phonetisaurus-align --input=cmudict.formatted.dict \
197 |   --ofile=cmudict.formatted.corpus --seq1_del=false
198 | # Train an n-gram model (5s-10s):
199 | $ estimate-ngram -o 8 -t cmudict.formatted.corpus \
200 |   -wl cmudict.o8.arpa
201 | # Convert to OpenFst format (10s-20s):
202 | $ phonetisaurus-arpa2wfst --lm=cmudict.o8.arpa --ofile=cmudict.o8.fst
203 | $ cd
204 | ```
205 | 
206 | Test the manual model with the wrapper script:
207 | ```
208 | $ cd Phonetisaurus/script
209 | $ ./phoneticize.py -m ~/example/cmudict.o8.fst -w testing
210 |   11.24   T EH1 S T IH0 NG
211 |   -------
212 |   t:T:3.31
213 |   e:EH1:2.26
214 |   s:S:2.61
215 |   t:T:0.21
216 |   i:IH0:2.66
217 |   n|g:NG:0.16
218 |   <eps>:<eps>:0.01
219 | ```
220 | 
221 | Test the G2P servlet [requires compilation of bindings and module install]:
222 | ```
223 | $ nohup script/g2pserver.py -m ~/train/model.fst -l ~/cmudict.formatted.dict &
224 | $ curl -s -F "wordlist=@words.list" http://localhost:8080/phoneticize/list
225 | test    T EH1 S T
226 | right   R AY1 T
227 | junkify JH AH1 NG K AH0 F AY2
228 | junkify JH AH1 NG K IH0 F AY2
229 | ```
230 | 
231 | Use a special location for OpenFst, and run a parallel build with 2 cores:
232 | ```
233 |  $ ./configure --with-openfst-libs=/home/ubuntu/openfst-1.6.2/lib \
234 |           --with-openfst-includes=/home/ubuntu/openfst-1.6.2/include
235 |  $ make -j 2 all
236 | ```
237 | 
238 | Use a custom g++ under OSX (Note: OpenFst must also be compiled with this
239 | custom g++ alternative [untested with v1.6.2]):
240 | ```
241 |  $ ./configure --with-openfst-libs=/home/osx/openfst-1.6.2gcc/lib \
242 |           --with-openfst-includes=/home/osx/openfst-1.6.2gcc/include \
243 |           CXX=g++-4.9
244 |  $ make -j 2 all
245 | ```
246 | 
247 | #### Rebuild configure ####
248 | If you need to rebuild the configure script you can do so:
249 | ```
250 |  $ autoreconf -i
251 | ```
252 | 
253 | ### Install [Linux]: ###
254 | ```
255 |  $ sudo make install
256 | ```
257 | 
258 | ### Uninstall [Linux]: ###
259 | ```
260 |  $ sudo make uninstall
261 | ```
262 | 
263 | ### Usage: ###
264 | #### phonetisaurus-align ####
265 | ```
266 |  $ bin/phonetisaurus-align --help
267 | ```
268 | #### phonetisaurus-arpa2wfst ####
269 | ```
270 |  $ bin/phonetisaurus-arpa2wfst --help
271 | ```
272 | #### phonetisaurus-g2prnn ####
273 | ```
274 |  $ bin/phonetisaurus-g2prnn --help
275 | ```
276 | #### phonetisaurus-g2pfst ####
277 | ```
278 |  $ bin/phonetisaurus-g2pfst --help
279 | ```
280 | 
281 | ### Docker: ###
282 | 
283 | Docker images are hosted on: https://hub.docker.com/r/phonetisaurus/phonetisaurus
284 | 
285 | The images can be used in one of 3 ways:
286 | 
287 |   * directly, to process files on your computer without needing to install/compile anything (apart from docker)
288 |   * as a base image for another project (using the `FROM` statement)
289 |   * to copy portions of the binaries or libraries into a new image (using the `COPY --from=` statement) - most of the files are in `/usr/local/bin` and `/usr/local/lib`
290 | 
291 | To use the program directly, you need to mount the local folder with the required files (e.g. models, word lists, etc.) into the Docker container under the `/work` path, as this is the default workdir in the image. Then you can call the programs directly after the name of the image, for example:
292 | ```
293 | docker run --rm -it -v $PWD:/work phonetisaurus/phonetisaurus "phonetisaurus-apply -m model.fst -wl test.wlist"
294 | ```
295 | 
296 | You can also use the `bash` program to simply enter the interactive shell and run everything from there.
297 | 
298 | ### Misc: ###
299 | cpplint command:
300 | ```
301 |  $ ./cpplint.py --filter=-whitespace/parens,-whitespace/braces,\
302 |       -legal/copyright,-build/namespaces,-runtime/references\
303 |       src/include/util.h
304 | ```
305 | 


--------------------------------------------------------------------------------
/configure.ac:
--------------------------------------------------------------------------------
  1 | #                                               -*- Autoconf -*-
  2 | # Process this file with autoconf to produce a configure script.
  3 | AC_PREREQ([2.69])
  4 | AC_INIT([phonetisaurus], [0.8.1], [phonetisaurus@gmail.com])
  5 | AC_CONFIG_MACRO_DIR([.autoconf/m4])
  6 | AC_CONFIG_AUX_DIR([.autoconf])
  7 | AC_CONFIG_SRCDIR(configure.ac)
  8 | 
  9 | GIT_REVISION=`git describe --abbrev=6 --dirty --always --tags 2>/dev/null || echo package`
 10 | AC_SUBST([GIT_REVISION])
 11 | 
 12 | AM_INIT_AUTOMAKE([foreign -Wall])
 13 | AM_MAINTAINER_MODE
 14 | 
 15 | AM_PROG_AR
 16 | LT_INIT
 17 | 
 18 | AC_ARG_ENABLE(python,
 19 |         AS_HELP_STRING([--enable-python], [Compile with Python support]),
 20 |         [case "${enableval}" in
 21 |           yes) enable_python=true ;;
 22 |           no) enable_python=false ;;
 23 |           *) AC_MSG_ERROR(bad value ${enableval} for --enable-python) ;;
 24 |         esac], [enable_python=false])
 25 | AM_CONDITIONAL(WANT_PYTHON, test x"$enable_python" = xtrue)
 26 | 
 27 | AS_IF([test x"$enable_python" = xtrue], [
 28 | AM_PATH_PYTHON([2.7],, [:])
 29 | ])
 30 | AM_CONDITIONAL([HAVE_PYTHON], [test "$PYTHON" != :])
 31 | AC_SUBST([PYTHON])
 32 | if test x"$PYTHON" != x && test "$PYTHON" != ":"; then
 33 |    PYTHON_CPPFLAGS=
 34 |    PYTHON_LIBS=
 35 |    AX_PYTHON_DEVEL([>= '$PYTHON_VERSION'])
 36 | fi
 37 | AM_CONDITIONAL([HAVE_PYTHON_DEV], [test x"$PYTHON" != x && test "$PYTHON" != :])
 38 | AC_SUBST([PYTHON_CPPFLAGS])
 39 | AC_SUBST([PYTHON_LIBS])
 40 | 
 41 | AC_LANG(C++)
 42 | # Checks for programs.
 43 | AC_PROG_CXX
 44 | AX_CXX_COMPILE_STDCXX_11([], [mandatory])
 45 | #Python stuff not currently needed.  Will return.
 46 | #AX_PYTHON_DEVEL([>= '2.7'])
 47 | AC_PROG_CC
 48 | AC_PROG_CPP
 49 | #AC_PROG_INSTALL
 50 | AC_PROG_LN_S
 51 | 
 52 | # Checks for libraries.
 53 | AC_CHECK_LIB([c], [getgid],[AC_DEFINE([HAVE_GETGID],[1],[libc includes getgid])])
 54 | AC_CHECK_LIB([dl], [dlopen], [], [echo "dl library not found. Weird."; exit -1])
 55 | AC_CHECK_LIB([m], [cos], [], [echo "m library not found.  Please install m library before proceeding"; exit -1])
 56 | AC_CHECK_LIB([pthread], [pthread_mutex_init], [], [echo "pthreads not found.  Please install pthread library before proceeding"; exit -1])
 57 | 
 58 | # Checks for header files.
 59 | AC_CHECK_HEADERS([stddef.h stdlib.h string.h])
 60 | 
 61 | # Checks for typedefs, structures, and compiler characteristics.
 62 | AC_CHECK_HEADER_STDBOOL
 63 | AC_C_INLINE
 64 | AC_TYPE_SIZE_T
 65 | AC_TYPE_SSIZE_T
 66 | AC_CHECK_TYPES([ptrdiff_t])
 67 | 
 68 | # Checks for library functions.
 69 | AC_FUNC_STRTOD
 70 | AC_CHECK_FUNCS([memmove strchr strrchr strspn])
 71 | AC_CHECK_MATH_FUNC(exp10)
 72 | 
 73 | AC_ARG_WITH([openfst-includes],
 74 | 	[AS_HELP_STRING([--with-openfst-includes],
 75 | 	[Location of the OpenFst headers.])],
 76 | 	[user_openfst_headers_path="$withval"],
 77 | 	[])
 78 | AC_ARG_WITH([openfst-libs],
 79 | 	[AS_HELP_STRING([--with-openfst-libs],
 80 | 	[Location of the OpenFst shared libraries.])],
 81 | 	[user_openfst_libs_path="$withval"],
 82 | 	[])
 83 | 
 84 | saved_cppflags="${CPPFLAGS}"
 85 | if [[ "$user_openfst_headers_path" != "" ]]; then
 86 |       CPPFLAGS="-I$user_openfst_headers_path"
 87 |       AC_CHECK_HEADERS([fst/fst.h], [openfst_headers_found=1], [openfst_headers_found=0], [])
 88 |       if [[ $openfst_headers_found -eq 1 ]]; then
 89 |          OPENFST_CXXFLAGS="$CPPFLAGS"
 90 |       fi
 91 | else
 92 |    CPPFLAGS=""
 93 |    AC_CHECK_HEADERS([fst/fst.h], [openfst_headers_found=1], [openfst_headers_found=0], [])
 94 |    if [[ $openfst_headers_found -eq 1 ]]; then
 95 |       OPENFST_CXXFLAGS="$CPPFLAGS"
 96 |    else
 97 |       CPPFLAGS="-I/usr/local/include"
 98 |       AC_CHECK_HEADERS([fst/fst.h], [openfst_headers_found=1], [openfst_headers_found=0], [])
 99 |       if [[ $openfst_headers_found -eq 1 ]]; then
100 |          OPENFST_CXXFLAGS="$CPPFLAGS"
101 |       fi
102 |    fi
103 | fi
104 | CPPFLAGS="${saved_cppflags}"
105 | 
106 | AC_MSG_CHECKING([for openfst libraries])
107 | CHECK_LIBRARIES="-lfst -lfstfar -lfstngram"
108 | saved_ldflags="${LDFLAGS}"
109 | saved_cppflags="${CPPFLAGS}"
110 | CPPFLAGS="${OPENFST_CXXFLAGS}"
111 | if test x"$user_openfst_libs_path" != x; then
112 |    LDFLAGS="-L$user_openfst_libs_path ${CHECK_LIBRARIES}"
113 |    AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <fst/fst.h>], [std::unique_ptr<fst::FstHeader> ifst(fst::FstHeader());])],
114 |        [have_openfst=true],                                           
115 |        [have_openfst=false])
116 |    if test x"$have_openfst" = xtrue; then
117 |       OPENFST_LDFLAGS="$LDFLAGS"
118 |       AC_MSG_RESULT([$OPENFST_LDFLAGS])
119 |    fi
120 | else
121 |    LDFLAGS="${CHECK_LIBRARIES}"
122 |    AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <fst/fst.h>], [std::unique_ptr<fst::FstHeader> ifst(fst::FstHeader());])],
123 |        [have_openfst=true],                                           
124 |        [have_openfst=false])
125 |    if test x"$have_openfst" = xtrue; then
126 |       OPENFST_LDFLAGS="$LDFLAGS"
127 |       AC_MSG_RESULT([$OPENFST_LDFLAGS])
128 |    else
129 |       LDFLAGS="-L/usr/local/lib ${CHECK_LIBRARIES}"  # last resort: the default /usr/local prefix
130 |       AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <fst/fst.h>], [std::unique_ptr<fst::FstHeader> ifst(fst::FstHeader());])],
131 |           [have_openfst=true],
132 |           [have_openfst=false])
133 |       if test x"$have_openfst" = xtrue; then
134 |          OPENFST_LDFLAGS="$LDFLAGS"
135 |          AC_MSG_RESULT([$OPENFST_LDFLAGS])
136 |       else
137 |          AC_MSG_RESULT([no])
138 |       fi
139 |    fi
140 | fi
141 | LDFLAGS="${saved_ldflags}"
142 | CPPFLAGS="${saved_cppflags}"
143 | 
144 | if test x"$have_openfst" != xtrue; then
145 |    AC_MSG_ERROR([Can't find OpenFST or one or more of its extensions. Use --with-openfst-includes and --with-openfst-libs to specify where you have installed OpenFst. OpenFst should have been configured with the following flags: --enable-static --enable-shared --enable-far --enable-ngram-fsts])
146 | fi
147 | AC_SUBST([OPENFST_CXXFLAGS])
148 | AC_SUBST([OPENFST_LDFLAGS])
149 | AM_CONDITIONAL(OPENFST, [test x"$have_openfst" = xtrue])
150 | 
151 | AX_OPENMP([AC_DEFINE(HAVE_OPENMP,1,[Define if OpenMP is enabled])] [have_openmp=true])
152 | AM_CONDITIONAL(OPENMP, [test x"$have_openmp" = xtrue])
153 | AC_SUBST([OPENMP_CXXFLAGS])
154 | AC_SUBST([OPENMP_LDFLAGS])
155 | AC_ARG_ENABLE(openmp,
156 |         AS_HELP_STRING([--enable-openmp], [Compile with OpenMP support]),
157 |         [case "${enableval}" in
158 |           yes) enable_openmp=true ;;
159 |           no) enable_openmp=false ;;
160 |           *) AC_MSG_ERROR(bad value ${enableval} for --enable-openmp) ;;
161 |         esac], [enable_openmp=false])
162 | AM_CONDITIONAL(WANT_OPENMP, test x"$enable_openmp" = xtrue)
163 | 
164 | saved_cppflags="${CPPFLAGS}"
165 | CPPFLAGS=
166 | AC_CHECK_HEADERS([utf8.h], [utfcpp_headers_found=1], [utfcpp_headers_found=0], [])
167 | if [[ $utfcpp_headers_found -eq 1 ]]; then
168 |     UTFCPP_CXXFLAGS="$CPPFLAGS"
169 | else
170 |     UTFCPP_CXXFLAGS="-I \${top_srcdir}/src/3rdparty/utfcpp"
171 | fi
172 | CPPFLAGS="${saved_cppflags}"
173 | AC_SUBST([UTFCPP_CXXFLAGS])
174 | AM_CONDITIONAL(UTFCPP, [test x"$utfcpp_headers_found" = x1])
175 | 
176 | 
177 | AC_CONFIG_FILES([Makefile])
178 | AC_OUTPUT
179 | 


--------------------------------------------------------------------------------
/docs/mainpage.dox:
--------------------------------------------------------------------------------
 1 | /**
 2 | @brief Documentation file for Phonetisaurus G2P project
 3 | @author Josef R. Novak
 4 | @file
 5 | */
 6 | /** @defgroup Phonetisaurus Sources */
 7 | /** 
 8 | @mainpage Phonetisaurus G2P - WFST-based Grapheme-to-Phoneme conversion.
 9 | 
10 | GitHub project page: <a href="https://github.com/AdolfVonKleist/Phonetisaurus">Phonetisaurus</a>
11 | */
12 | 


--------------------------------------------------------------------------------
/python/phonetisaurus-module.py:
--------------------------------------------------------------------------------
 1 | import pybindgen
 2 | from pybindgen import param, retval
 3 | import sys
 4 | 
 5 | mod = pybindgen.Module ('Phonetisaurus')  # module name that python/phonetisaurus/__init__.py imports from
 6 | ################################################
 7 | # Bindings for the PhonetisaurusScript class declared in the header included below
 8 | mod.add_include ('"include/PhonetisaurusScript.h"')
 9 | 
10 | # Register the vector containers used as PathData attribute types below
11 | mod.add_container ('std::vector<int>', 'int', 'vector')
12 | mod.add_container ('std::vector<float>', 'float', 'vector')
13 | 
14 | # Register the PathData struct and the attributes exposed on it
15 | struct = mod.add_struct('PathData')
16 | struct.add_constructor([]) 
17 | struct.add_instance_attribute ('PathWeight', 'float')
18 | struct.add_instance_attribute ('PathWeights', 'std::vector<float>')
19 | struct.add_instance_attribute ('ILabels', 'std::vector<int>')
20 | struct.add_instance_attribute ('OLabels', 'std::vector<int>')
21 | struct.add_instance_attribute ('Uniques', 'std::vector<int>')
22 | 
23 | # Register the vector<PathData> container, the return type of Phoneticize
24 | mod.add_container ('std::vector<PathData>', 'PathData', 'vector' )
25 | 
26 | g2pklass = mod.add_class ('PhonetisaurusScript')
27 | std_exception = mod.add_exception ('exception',
28 |                                    foreign_cpp_namespace='std',
29 |                                    message_rvalue='%(EXC)s.what()')
30 | 
31 | g2pklass.add_constructor ([param ('std::string', 'model')],
32 |                           throw=[std_exception])  # the constructor is declared as able to throw std::exception
33 | 
34 | g2pklass.add_method ('Phoneticize', retval ('std::vector<PathData>'),
35 |                     [ param ('std::string', 'word'),
36 |                       param ('int', 'nbest'),
37 |                       param ('int', 'beam'),
38 |                       param ('float', 'threshold'),
39 |                       param ('bool', 'write_fsts'),
40 |                       param ('bool', 'accumulate'),
41 |                       param ('float', 'pmass')
42 |                   ]
43 |                 )
44 | 
45 | # Symbol lookup helpers: each name is registered twice, once for id -> symbol and once for symbol -> id
46 | g2pklass.add_method ('FindIsym', retval ('std::string'),
47 |                     [param ('int', 'symbol_id')])
48 | g2pklass.add_method ('FindIsym', retval('int'),
49 |                     [param ('std::string', 'symbol')])
50 | g2pklass.add_method ('FindOsym', retval('std::string'),
51 |                     [param ('int', 'symbol_id')])
52 | g2pklass.add_method ('FindOsym', retval('int'),
53 |                     [param ('std::string', 'symbol')])
54 | 
55 | 
56 | # Write the generated binding source to stdout
57 | mod.generate (sys.stdout)
58 | 


--------------------------------------------------------------------------------
/python/phonetisaurus/__init__.py:
--------------------------------------------------------------------------------
1 | from Phonetisaurus import PhonetisaurusScript as Phonetisaurus
2 | 


--------------------------------------------------------------------------------
/python/script/demo.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="pl">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <title>G2P</title>
 6 | 
 7 |     <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.4/jquery.min.js"></script>
 8 |     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css"
 9 |     integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
10 |     <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css"
11 |     integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
12 |     <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"
13 |     integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa"
14 |     crossorigin="anonymous"></script>
15 | 
16 |     <style>
17 |         .jumbotron {
18 |             background-color: darkgreen;
19 |             color: white;
20 |             box-shadow: 0 5px 20px rgba(0, 0, 0, 2);
21 |         }
22 |         #wordlist {
23 |             height: 500px;
24 |         }
25 |     </style>
26 | </head>
27 | <body>
28 |   <div class="jumbotron">
29 |     <div class="container">
30 |         <h1>
31 |             Grapheme-to-phoneme conversion
32 |         </h1>
33 |         <p>Using the <a href="https://github.com/AdolfVonKleist/Phonetisaurus">Phonetisaurus</a> toolkit</p>
34 |     </div>
35 | </div>
36 | 
37 | <div class="container">
38 |    <p>Enter a list of words (one per line) below and click "Convert":</p>
39 |    <textarea class="form-control" id="wordlist"></textarea>
40 |    <button class="btn btn-primary btn-lg col-md-6" type="submit" onclick="convert()" id="convertbtn">Convert</button>
41 |    <button class="btn btn-lg col-md-6" type="submit" onclick="undo()">Undo</button>
42 | </div>
43 | 
44 | <script type="text/javascript">
45 | 
46 |     var server='localhost:8080'
47 | 
48 |     var undo_text='';
49 | 
50 |     function convert() {
51 | 
52 |         $('#convertbtn').html('<span class="glyphicon glyphicon-refresh glyphicon-refresh-animate"></span> Converting...')
53 | 
54 |         var text=$('#wordlist').val();
55 |         undo_text=text;
56 |         var fd = new FormData();
57 |         var file = new Blob([text], {type: 'plain/text'});
58 |         fd.append('wordlist', file, 'fileName.txt');
59 | 
60 |         $.ajax({
61 |           url: 'http://'+server+'/phoneticize/list',
62 |           method: 'post',
63 |           data: fd,
64 |           processData: false,
65 |           contentType: false
66 |         }).done(function(data){
67 |             $('#wordlist').val(data);
68 |             $('#convertbtn').html('Convert');
69 |         });
70 |     }
71 | 
72 |     function undo() {
73 |         $('#wordlist').val(undo_text);
74 |     }
75 | </script>
76 | 
77 | </body>
78 | </html>
79 | 


--------------------------------------------------------------------------------
/python/script/g2pserver.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import os, re, phonetisaurus, json
  3 | from bottle import route, run, template, request, response
  4 | from itertools import izip
  5 | from collections import namedtuple, defaultdict
  6 | 
  7 | #Globals, oh no!
  8 | _g2pmodel = None
  9 | _lexicon  = defaultdict (list)
 10 | 
 11 | 
 12 | ###############################
 13 | # Utilities
 14 | def _phoneticize (model, args) :
 15 |     """
 16 |     Python wrapper function for g2p.
 17 |     """
 18 | 
 19 |     results = model.Phoneticize (
 20 |         args.token.encode ("utf8"),
 21 |         args.nbest,
 22 |         args.beam,
 23 |         args.thresh,
 24 |         args.write_fsts,
 25 |         args.accumulate,
 26 |         args.pmass
 27 |     )
 28 | 
 29 |     pronunciations = []
 30 |     for result in results :
 31 |         pronunciation = [model.FindOsym (u) for u in result.Uniques]
 32 |         yield u"{0}".format (u" ".join (pronunciation))
 33 | 
 34 | def _loadLexicon (lexiconfile) :
 35 |     with open (lexiconfile, "r") as ifp :
 36 |         for entry in ifp :
 37 |             word, pron = re.split (ur"\t", entry.decode ("utf8").strip ())
 38 |             _lexicon [word].append (pron)
 39 |     return
 40 | 
 41 | def _defaultArgs (userargs) :
 42 |     args = namedtuple ('args', [
 43 |         'token', 'nbest', 'beam', 'thresh', 'write_fsts',
 44 |         'accumulate', 'pmass'
 45 |     ])
 46 | 
 47 |     args.token  = ""
 48 |     args.nbest  = int (userargs.get ("nbest", 2))
 49 |     args.beam   = int (userargs.get ("beam", 500))
 50 |     args.thresh = float (userargs.get ("thresh", 10.))
 51 |     args.pmass = float (userargs.get ("pmass", 0.0))
 52 |     args.write_fsts = False
 53 |     args.accumulate = userargs.get (
 54 |         "accumulate",
 55 |         False
 56 |     )
 57 |     return args
 58 | ###############################
 59 | 
 60 | 
 61 | 
 62 | @route ('/phoneticize/list', method="POST")
 63 | def PhoneticizeList () :
 64 |     """Phoneticize a list of words.
 65 | 
 66 |     Phoneticize a list of words.  This will do a simple lookup for
 67 |     the word in the reference lexicon, and backoff to the G2P server
 68 |     in the event that it finds no entry.
 69 |     """
 70 |     default_args = _defaultArgs (request.forms)
 71 | 
 72 |     wlist  = request.files.get ("wordlist")
 73 | 
 74 |     words = re.split (ur"\n", wlist.file.read ().decode ("utf8"))
 75 | 
 76 |     lexicon = []
 77 |     for word in words :
 78 |         if re.match (ur"^\s*$", word) or u"<" in word or u"[" in word :
 79 |             continue
 80 |                      
 81 |         default_args.token = word.lower ()
 82 |         if default_args.token in _lexicon :
 83 |             for pronunciation in _lexicon [default_args.token] :
 84 |                 lexicon.append (u"{0}\t{1}".format (word, pronunciation))
 85 |         else :
 86 |             for pronunciation in _phoneticize (_g2pmodel, default_args) :
 87 |                 lexicon.append (u"{0}\t{1}".format (word, pronunciation))
 88 | 
 89 |     response.set_header('Access-Control-Allow-Origin', '*')
 90 | 
 91 |     return u"\n".join (lexicon).encode ("utf8")
 92 | 
 93 | 
 94 | 
 95 | if __name__ == '__main__':
 96 |     import sys, argparse
 97 | 
 98 |     example = "{0} --host localhost --port 8080"\
 99 |               "--model g2p.fst --lexicon ref.lexicon"
100 |     example = example.format (sys.argv [0])
101 |     parser  = argparse.ArgumentParser (description=example)
102 |     parser.add_argument ("--host", "-hs", help="IP to host the service on.",
103 |                          default="localhost")
104 |     parser.add_argument ("--port", "-p", help="Port to use for hosting.",
105 |                          default=8080, type=int)
106 |     parser.add_argument ("--model", "-m", help="Phonetisaurus G2P model.",
107 |                          required=True)
108 |     parser.add_argument ("--lexicon", "-l", help="Reference lexicon.",
109 |                          required=True)
110 |     parser.add_argument ("--verbose", "-v", help="Verbose mode.",
111 |                          default=False, action="store_true")
112 |     args = parser.parse_args ()
113 |     
114 |     if args.verbose :
115 |         for key,val in args.__dict__.iteritems () :
116 |             print >> sys.stderr, "{0}:\t{1}".format (key, val)
117 |             
118 |     _g2pmodel = phonetisaurus.Phonetisaurus (args.model)
119 |     _loadLexicon (args.lexicon)
120 | 
121 |     run (host=args.host, port=args.port, debug=False)
122 | 


--------------------------------------------------------------------------------
/python/script/phoneticize.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import phonetisaurus
 3 | from itertools import izip
 4 | 
 5 | def Phoneticize (model, args) :
 6 |     """Python wrapper function for g2p bindings.
 7 | 
 8 |     Python wrapper function for g2p bindings.  Most basic possible example.
 9 |     Intended as a template for doing something more useful.
10 | 
11 |     Args:
12 |         model (str): The g2p fst model to load.
13 |         args (obj): The argparse object with user specified options.
14 |     """
15 | 
16 |     results = model.Phoneticize (
17 |         args.token,
18 |         args.nbest,
19 |         args.beam,
20 |         args.thresh,
21 |         args.write_fsts,
22 |         args.accumulate,
23 |         args.pmass
24 |     )
25 | 
26 |     for result in results :
27 |         uniques = [model.FindOsym (u) for u in result.Uniques]
28 |         print ("{0:0.2f}\t{1}".format (result.PathWeight, " ".join (uniques)))
29 |         print ("-------")
30 | 
31 |         #Should always be equal length
32 |         for ilab, olab, weight in izip (result.ILabels,
33 |                                         result.OLabels,
34 |                                         result.PathWeights) :
35 |             print ("{0}:{1}:{2:0.2f}".format (
36 |                 model.FindIsym (ilab),
37 |                 model.FindOsym (olab),
38 |                 weight
39 |             ))
40 | 
41 |     return
42 | 
43 | 
44 | if __name__ == "__main__" :
45 |     import argparse, sys
46 | 
47 |     example = "{0} --model model.fst --word \"test\"".format (sys.argv [0])
48 |     parser  = argparse.ArgumentParser (description=example)
49 |     parser.add_argument ("--model", "-m", help="Phonetisaurus G2P model.",
50 |                          required=True)
51 |     group   = parser.add_mutually_exclusive_group (required=True)
52 |     group.add_argument ("--word", "-w", help="Input word in lower case.")
53 |     group.add_argument ("--wlist", "-wl", help="Provide a wordlist.")
54 |     parser.add_argument ("--nbest", "-n", help="NBest",
55 |                          default=1, type=int)
56 |     parser.add_argument ("--beam", "-b", help="Search beam",
57 |                          default=500, type=int)
58 |     parser.add_argument ("--thresh", "-t", help="NBest threshold.",
59 |                          default=10., type=float)
60 |     parser.add_argument ("--write_fsts", "-wf", help="Write decoded fsts "
61 |                          "to disk", default=False, action="store_true")
62 |     parser.add_argument ("--accumulate", "-a", help="Accumulate probs across "
63 |                          "unique pronunciations.", default=False,
64 |                          action="store_true")
65 |     parser.add_argument ("--pmass", "-p", help="Target probability mass.",
66 |                          default=0.0, type=float)
67 |     parser.add_argument ("--verbose", "-v", help="Verbose mode.",
68 |                          default=False, action="store_true")
69 |     args = parser.parse_args ()
70 | 
71 |     if args.verbose :
72 |         for key,val in args.__dict__.iteritems () :
73 |             print ("{0}:  {1}".format (key, val))
74 | 
75 |     model = phonetisaurus.Phonetisaurus (args.model)
76 | 
77 |     if args.word :
78 |         args.token = args.word
79 |         Phoneticize (model, args)
80 | 
81 |     else :
82 |         with open (args.wlist, "r") as ifp :
83 |             for word in ifp :
84 |                 word = word.decode ("utf8").strip ()
85 |                 args.token = word
86 |                 Phoneticize (model, args)
87 |                 print "-----------------------"
88 |                 print ""
89 | 


--------------------------------------------------------------------------------
/python/script/words.list:
--------------------------------------------------------------------------------
1 | test
2 | right
3 | junkify


--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | from setuptools import setup, find_packages
 3 | import glob
 4 | #Install phonetisaurus 
 5 | setup (
 6 |     name         = 'phonetisaurus',
 7 |     version      = '0.3',
 8 |     description  = 'Phonetisaurus G2P python package (OpenFst-1.6.x)',
 9 |     url          = 'http://code.google.com/p/phonetisaurus',
10 |     author       = 'Josef Novak',
11 |     author_email = 'josef.robert.novak@gmail.com',
12 |     license      = 'BSD',
13 |     packages=find_packages(),
14 |     data_files   = [
15 |         ('.', ['Phonetisaurus.so'])
16 |     ],
17 |     include_package_data = True,
18 |     install_requires = ["argparse", "bottle"],
19 |     zip_safe     = False
20 | )
21 | 


--------------------------------------------------------------------------------
/src/3rdparty/rnnlm/COPYRIGHT.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010-2012 Tomas Mikolov
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions
 6 | are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright
 9 | notice, this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright
12 | notice, this list of conditions and the following disclaimer in the
13 | documentation and/or other materials provided with the distribution.
14 | 
15 | 3. Neither name of copyright holders nor the names of its contributors
16 | may be used to endorse or promote products derived from this software
17 | without specific prior written permission.
18 | 
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR
24 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/src/3rdparty/rnnlm/rnnlmlib.h:
--------------------------------------------------------------------------------
  1 | ///////////////////////////////////////////////////////////////////////
  2 | //
  3 | // Recurrent neural network based statistical language modeling toolkit
  4 | // Version 0.3e
  5 | // (c) 2010-2012 Tomas Mikolov (tmikolov@gmail.com)
  6 | //
  7 | // 2014-04-13 - Josef Robert Novak
  8 | // Removed some protections to give bindings access!
  9 | ///////////////////////////////////////////////////////////////////////
 10 | 
 11 | #ifndef _RNNLMLIB_H_
 12 | #define _RNNLMLIB_H_
 13 | 
 14 | #define MAX_STRING 100
 15 | #ifndef HAVE_EXP10
 16 | #define exp10(n) pow((double)10,(n))	// fallback when HAVE_EXP10 is not defined: 10 raised to the power n
 17 | #endif 
 18 | 
 19 | //#include <fst/fstlib.h>
 20 | //#include <tr1/unordered_map>
 21 | #include <string>
 22 | #include <vector>
 23 | //#include "util.h"
 24 | //using namespace fst;
 25 | 
 26 | const int MAX_NGRAM_ORDER=20;
 27 | typedef double real;		// doubles for NN weights
 28 | typedef double direct_t;	// doubles for ME weights; TODO: check why floats are not enough for RNNME (convergence problems)
 29 | //typedef std::tr1::unordered_map<std::string, std::vector<int> > JointMap;
 30 | //typedef std::tr1::unordered_map<int, std::string> TokenMap;
 31 | 
 32 | struct neuron {		// one network unit: a stored value plus an error term
 33 |     real ac;		// actual value stored in the neuron (presumably its activation)
 34 |     real er;		// error value in the neuron, used by the learning algorithm
 35 | };
 36 |                 
 37 | struct synapse {	// one connection between units, holding only its weight
 38 |     real weight;	// weight of the synapse
 39 | };
 40 | 
 41 | struct vocab_word {	// one vocabulary entry
 42 |     int cn;		// NOTE(review): looks like an occurrence count - confirm in rnnlmlib.cpp
 43 |     char word[MAX_STRING];	// fixed-size buffer of MAX_STRING chars for the word text
 44 | 
 45 |     real prob;		// NOTE(review): presumably the probability assigned to this word - confirm
 46 |     int class_index;	// NOTE(review): presumably the index of the class this word belongs to - confirm
 47 | };
 48 | 
 49 | /*
 50 | struct RNNToken {
 51 |   RNNToken* parent;
 52 |   struct neuron* neu;
 53 |   int history[MAX_NGRAM_ORDER];
 54 |   vector<int> bptt_history;
 55 | };
 56 | */
 57 | //typedef std::tr1::unordered_map<std::string, RNNToken> NeuTokenMap;
 58 | 
// Table of large primes and its element count.
// NOTE(review): presumably used to hash n-gram histories into the direct
// (maximum-entropy) weight table -- the use site is not in this header.
const unsigned int PRIMES [] = {
  108641969, 116049371, 125925907, 133333309, 
  145678979, 175308587, 197530793, 234567803, 
  251851741, 264197411, 330864029, 399999781,
  407407183, 459258997, 479012069, 545678687, 
  560493491, 607407037, 629629243, 656789717, 
  716048933, 718518067, 725925469, 733332871, 
  753085943, 755555077, 782715551, 790122953, 
  812345159, 814814293, 893826581, 923456189, 
  940740127, 953085797, 985184539, 990122807
};

// Number of entries in PRIMES, computed at compile time.
const unsigned int PRIMES_SIZE=sizeof(PRIMES)/sizeof(PRIMES[0]);
 72 | 
// Values for CRnnLM::filetype (set through setFileType); the constructor
// selects TEXT by default.
enum FileTypeEnum {TEXT, BINARY, COMPRESSED};		//COMPRESSED not yet implemented
 74 | 
 75 | class CRnnLM {
 76 |  public:
 77 |   char train_file[MAX_STRING];
 78 |   char valid_file[MAX_STRING];
 79 |   char test_file[MAX_STRING];
 80 |   char rnnlm_file[MAX_STRING];
 81 |   char lmprob_file[MAX_STRING];
 82 |   bool joint;
 83 |   //JointMap joint_map;
 84 |   //TokenMap token_map;
 85 |   //NeuTokenMap NeuMap;
 86 | 
 87 |   int rand_seed;
 88 |     
 89 |   int debug_mode;
 90 |     
 91 |   int version;
 92 |   int filetype;
 93 |     
 94 |   int use_lmprob;
 95 |   real lambda;
 96 |   real gradient_cutoff;
 97 |     
 98 |   real dynamic;
 99 |     
100 |   real alpha;
101 |   real starting_alpha;
102 |   int alpha_divide;
103 |   double logp, llogp;
104 |   float min_improvement;
105 |   int iter;
106 |   int vocab_max_size;
107 |   int vocab_size;
108 |   int train_words;
109 |   int train_cur_pos;
110 |   int counter;
111 |     
112 |   int one_iter;
113 |   int anti_k;
114 |     
115 |   real beta;
116 |     
117 |   int class_size;
118 |   int **class_words;
119 |   int *class_cn;
120 |   int *class_max_cn;
121 |   int old_classes;
122 |     
123 |   struct vocab_word *vocab;
124 |   void sortVocab();
125 |   int *vocab_hash;
126 |   int vocab_hash_size;
127 |     
128 |   int layer0_size;
129 |   int layer1_size;
130 |   int layerc_size;
131 |   int layer2_size;
132 |     
133 |   long long direct_size;
134 |   int direct_order;
135 |   int history[MAX_NGRAM_ORDER];
136 |     
137 |   int bptt;
138 |   int bptt_block;
139 |   int *bptt_history;
140 |   neuron *bptt_hidden;
141 |   struct synapse *bptt_syn0;
142 |     
143 |   int gen;
144 | 
145 |   int independent;
146 |     
147 |   struct neuron *neu0;		//neurons in input layer
148 |   struct neuron *neu1;		//neurons in hidden layer
149 |   struct neuron *neuc;		//neurons in hidden layer
150 |   struct neuron *neu2;		//neurons in output layer
151 | 
152 |   struct synapse *syn0;		//weights between input and hidden layer
153 |   struct synapse *syn1;		//weights between hidden and output layer (or hidden and compression if compression>0)
154 |   struct synapse *sync;		//weights between hidden and compression layer
155 |   direct_t *syn_d;		//direct parameters between input and output layer (similar to Maximum Entropy model parameters)
156 |     
157 |   //backup used in training:
158 |   struct neuron *neu0b;
159 |   struct neuron *neu1b;
160 |   struct neuron *neucb;
161 |   struct neuron *neu2b;
162 | 
163 |   struct synapse *syn0b;
164 |   struct synapse *syn1b;
165 |   struct synapse *syncb;
166 |   direct_t *syn_db;
167 |     
168 |   //backup used in n-bset rescoring:
169 |   struct neuron *neu1b2;
170 |     
171 |     
172 |     //public:
173 | 
174 |     int alpha_set, train_file_set;
175 | 
176 |     CRnnLM()		//constructor initializes variables
177 |     {
178 | 	version=10;
179 | 	joint=true;
180 | 	filetype=TEXT;
181 | 	
182 | 	use_lmprob=0;
183 | 	lambda=0.75;
184 | 	gradient_cutoff=15;
185 | 	dynamic=0;
186 |     
187 | 	train_file[0]=0;
188 | 	valid_file[0]=0;
189 | 	test_file[0]=0;
190 | 	rnnlm_file[0]=0;
191 | 	
192 | 	alpha_set=0;
193 | 	train_file_set=0;
194 | 	
195 | 	alpha=0.1;
196 | 	beta=0.0000001;
197 | 	//beta=0.00000;
198 | 	alpha_divide=0;
199 | 	logp=0;
200 | 	llogp=-100000000;
201 | 	iter=0;
202 | 	
203 | 	min_improvement=1.003;
204 | 	
205 | 	train_words=0;
206 | 	train_cur_pos=0;
207 | 	vocab_max_size=100;
208 | 	vocab_size=0;
209 | 	vocab=(struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
210 | 	
211 | 	layer1_size=30;
212 | 	
213 | 	direct_size=0;
214 | 	direct_order=0;
215 | 	
216 | 	bptt=0;
217 | 	bptt_block=10;
218 | 	bptt_history=NULL;
219 | 	bptt_hidden=NULL;
220 | 	bptt_syn0=NULL;
221 | 	
222 | 	gen=0;
223 | 
224 | 	independent=0;
225 | 	
226 | 	neu0=NULL;
227 | 	neu1=NULL;
228 | 	neuc=NULL;
229 | 	neu2=NULL;
230 | 	
231 | 	syn0=NULL;
232 | 	syn1=NULL;
233 | 	sync=NULL;
234 | 	syn_d=NULL;
235 | 	syn_db=NULL;
236 | 	//backup
237 | 	neu0b=NULL;
238 | 	neu1b=NULL;
239 | 	neucb=NULL;
240 | 	neu2b=NULL;
241 | 	
242 | 	neu1b2=NULL;
243 | 	
244 | 	syn0b=NULL;
245 | 	syn1b=NULL;
246 | 	syncb=NULL;
247 | 	//
248 | 	
249 | 	rand_seed=1;
250 | 	
251 | 	class_size=100;
252 | 	old_classes=0;
253 | 	
254 | 	one_iter=0;
255 | 	
256 | 	debug_mode=1;
257 | 	srand(rand_seed);
258 | 	
259 | 	vocab_hash_size=100000000;
260 | 	vocab_hash=(int *)calloc(vocab_hash_size, sizeof(int));
261 |     }
262 |     
263 |     ~CRnnLM()		//destructor, deallocates memory
264 |     {
265 | 	int i;
266 | 	
267 | 	if (neu0!=NULL) {
268 | 	    free(neu0);
269 | 	    free(neu1);
270 | 	    if (neuc!=NULL) free(neuc);
271 | 	    free(neu2);
272 | 	    
273 | 	    free(syn0);
274 | 	    free(syn1);
275 | 	    if (sync!=NULL) free(sync);
276 | 	    
277 | 	    if (syn_d!=NULL) free(syn_d);
278 | 
279 | 	    if (syn_db!=NULL) free(syn_db);
280 | 
281 | 	    //
282 | 	    free(neu0b);
283 | 	    free(neu1b);
284 | 	    if (neucb!=NULL) free(neucb);
285 | 	    free(neu2b);
286 | 
287 | 	    free(neu1b2);
288 | 	    
289 | 	    free(syn0b);
290 | 	    free(syn1b);
291 | 	    if (syncb!=NULL) free(syncb);
292 | 	    //
293 | 	    
294 | 	    for (i=0; i<class_size; i++) free(class_words[i]);
295 | 	    free(class_max_cn);
296 | 	    free(class_cn);
297 | 	    free(class_words);
298 | 	
299 | 	    free(vocab);
300 | 	    free(vocab_hash);
301 | 
302 | 	    if (bptt_history!=NULL) free(bptt_history);
303 | 	    if (bptt_hidden!=NULL) free(bptt_hidden);
304 |             if (bptt_syn0!=NULL) free(bptt_syn0);
305 | 	    
306 | 	    //todo: free bptt variables too
307 | 	}
308 |     }
309 | 
310 |     //void MapJointToken (vocab_word* word);
311 |     //vector<int>& SearchJointVocab (string& word);
312 |     void SaveContext (std::string& id);
313 |     void RestoreContext (std::string& id);
314 | 
315 |     real random(real min, real max);
316 | 
317 |     void setTrainFile(char *str);
318 |     void setValidFile(char *str);
319 |     void setTestFile(char *str);
320 |     void setRnnLMFile(char *str);
321 |     void setLMProbFile(char *str) {strcpy(lmprob_file, str);}
322 |     
323 |     void setFileType(int newt) {filetype=newt;}
324 |     
325 |     void setClassSize(int newSize) {class_size=newSize;}
326 |     void setOldClasses(int newVal) {old_classes=newVal;}
327 |     void setLambda(real newLambda) {lambda=newLambda;}
328 |     void setGradientCutoff(real newGradient) {gradient_cutoff=newGradient;}
329 |     void setDynamic(real newD) {dynamic=newD;}
330 |     void setGen(real newGen) {gen=newGen;}
331 |     void setIndependent(int newVal) {independent=newVal;}
332 |     
333 |     void setLearningRate(real newAlpha) {alpha=newAlpha;}
334 |     void setRegularization(real newBeta) {beta=newBeta;}
335 |     void setMinImprovement(real newMinImprovement) {min_improvement=newMinImprovement;}
336 |     void setHiddenLayerSize(int newsize) {layer1_size=newsize;}
337 |     void setCompressionLayerSize(int newsize) {layerc_size=newsize;}
338 |     void setDirectSize(long long newsize) {direct_size=newsize;}
339 |     void setDirectOrder(int newsize) {direct_order=newsize;}
340 |     void setBPTT(int newval) {bptt=newval;}
341 |     void setBPTTBlock(int newval) {bptt_block=newval;}
342 |     void setRandSeed(int newSeed) {rand_seed=newSeed; srand(rand_seed);}
343 |     void setDebugMode(int newDebug) {debug_mode=newDebug;}
344 |     void setAntiKasparek(int newAnti) {anti_k=newAnti;}
345 |     void setOneIter(int newOneIter) {one_iter=newOneIter;}
346 |     
347 |     int getWordHash(char *word);
348 |     void readWord(char *word, FILE *fin);
349 |     int searchVocab(char *word);
350 |     int readWordIndex(FILE *fin);
351 |     int addWordToVocab(char *word);
352 |     void learnVocabFromTrainFile();		//train_file will be used to construct vocabulary
353 |     
354 |     void saveWeights();			//saves current weights and unit activations
355 |     void restoreWeights();		//restores current weights and unit activations from backup copy
356 |     //void saveWeights2();		//allows 2. copy to be stored, useful for dynamic rescoring of nbest lists
357 |     //void restoreWeights2();		
358 |     void saveContext();
359 |     void restoreContext();
360 |     void saveContext2();
361 |     void restoreContext2();
362 |     void initNet();
363 |     void saveNet();
364 |     void goToDelimiter(int delim, FILE *fi);
365 |     void restoreNet();
366 |     void netFlush();
367 |     void netReset();    //will erase just hidden layer state + bptt history + maxent history (called at end of sentences in the independent mode)
368 |     
369 |     void computeNet(int last_word, int word);
370 |     void learnNet(int last_word, int word);
371 |     void copyHiddenLayerToInput();
372 |     void trainNet();
373 |     void useLMProb(int use) {use_lmprob=use;}
374 |     void testNet();
375 |     void testNbest();
376 |     void testGen();
377 |     
378 |     void matrixXvector(struct neuron *dest, struct neuron *srcvec, struct synapse *srcmatrix, int matrix_width, int from, int to, int from2, int to2, int type);
379 | };
380 | 
381 | #endif
382 | 


--------------------------------------------------------------------------------
/src/3rdparty/utfcpp/utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Nemanja Trifunovic
 2 | 
 3 | /*
 4 | Permission is hereby granted, free of charge, to any person or organization
 5 | obtaining a copy of the software and accompanying documentation covered by
 6 | this license (the "Software") to use, reproduce, display, distribute,
 7 | execute, and transmit the Software, and to prepare derivative works of the
 8 | Software, and to permit third-parties to whom the Software is furnished to
 9 | do so, all subject to the following:
10 | 
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 | 
27 | 
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 | 
31 | #include "utf8/checked.h"
32 | #include "utf8/unchecked.h"
33 | 
34 | #endif // header guard
35 | 


--------------------------------------------------------------------------------
/src/3rdparty/utfcpp/utf8/checked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | #include <stdexcept>
 33 | 
 34 | namespace utf8
 35 | {
    // Base for the exceptions that may be thrown from the library.
    // It adds nothing to std::exception; it exists so that callers can catch
    // every error raised by this library with a single handler.
    class exception : public std::exception {
    };
 39 | 
 40 |     // Exceptions that may be thrown from the library functions.
 41 |     class invalid_code_point : public exception {
 42 |         uint32_t cp;
 43 |     public:
 44 |         invalid_code_point(uint32_t cp) : cp(cp) {}
 45 |         virtual const char* what() const throw() { return "Invalid code point"; }
 46 |         uint32_t code_point() const {return cp;}
 47 |     };
 48 | 
 49 |     class invalid_utf8 : public exception {
 50 |         uint8_t u8;
 51 |     public:
 52 |         invalid_utf8 (uint8_t u) : u8(u) {}
 53 |         virtual const char* what() const throw() { return "Invalid UTF-8"; }
 54 |         uint8_t utf8_octet() const {return u8;}
 55 |     };
 56 | 
 57 |     class invalid_utf16 : public exception {
 58 |         uint16_t u16;
 59 |     public:
 60 |         invalid_utf16 (uint16_t u) : u16(u) {}
 61 |         virtual const char* what() const throw() { return "Invalid UTF-16"; }
 62 |         uint16_t utf16_word() const {return u16;}
 63 |     };
 64 | 
 65 |     class not_enough_room : public exception {
 66 |     public:
 67 |         virtual const char* what() const throw() { return "Not enough space"; }
 68 |     };
 69 | 
 70 |     /// The library API - functions intended to be called by the users
 71 | 
 72 |     template <typename octet_iterator, typename output_iterator>
 73 |     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
 74 |     {
 75 |         while (start != end) {
 76 |             octet_iterator sequence_start = start;
 77 |             internal::utf_error err_code = internal::validate_next(start, end);
 78 |             switch (err_code) {
 79 |                 case internal::UTF8_OK :
 80 |                     for (octet_iterator it = sequence_start; it != start; ++it)
 81 |                         *out++ = *it;
 82 |                     break;
 83 |                 case internal::NOT_ENOUGH_ROOM:
 84 |                     throw not_enough_room();
 85 |                 case internal::INVALID_LEAD:
 86 |                     append (replacement, out);
 87 |                     ++start;
 88 |                     break;
 89 |                 case internal::INCOMPLETE_SEQUENCE:
 90 |                 case internal::OVERLONG_SEQUENCE:
 91 |                 case internal::INVALID_CODE_POINT:
 92 |                     append (replacement, out);
 93 |                     ++start;
 94 |                     // just one replacement mark for the sequence
 95 |                     while (internal::is_trail(*start) && start != end)
 96 |                         ++start;
 97 |                     break;
 98 |             }
 99 |         }
100 |         return out;
101 |     }
102 | 
103 |     template <typename octet_iterator, typename output_iterator>
104 |     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
105 |     {
106 |         static const uint32_t replacement_marker = internal::mask16(0xfffd);
107 |         return replace_invalid(start, end, out, replacement_marker);
108 |     }
109 | 
110 |     template <typename octet_iterator>
111 |     octet_iterator append(uint32_t cp, octet_iterator result)
112 |     {
113 |         if (!internal::is_code_point_valid(cp))
114 |             throw invalid_code_point(cp);
115 | 
116 |         if (cp < 0x80)                        // one octet
117 |             *(result++) = static_cast<uint8_t>(cp);
118 |         else if (cp < 0x800) {                // two octets
119 |             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
120 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
121 |         }
122 |         else if (cp < 0x10000) {              // three octets
123 |             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
124 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
125 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
126 |         }
127 |         else {      // four octets
128 |             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
129 |             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
130 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
131 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
132 |         }
133 |         return result;
134 |     }
135 | 
136 |     template <typename octet_iterator>
137 |     uint32_t next(octet_iterator& it, octet_iterator end)
138 |     {
139 |         uint32_t cp = 0;
140 |         internal::utf_error err_code = internal::validate_next(it, end, &cp);
141 |         switch (err_code) {
142 |             case internal::UTF8_OK :
143 |                 break;
144 |             case internal::NOT_ENOUGH_ROOM :
145 |                 throw not_enough_room();
146 |             case internal::INVALID_LEAD :
147 |             case internal::INCOMPLETE_SEQUENCE :
148 |             case internal::OVERLONG_SEQUENCE :
149 |                 throw invalid_utf8(*it);
150 |             case internal::INVALID_CODE_POINT :
151 |                 throw invalid_code_point(cp);
152 |         }
153 |         return cp;
154 |     }
155 | 
156 |     template <typename octet_iterator>
157 |     uint32_t peek_next(octet_iterator it, octet_iterator end)
158 |     {
159 |         return next(it, end);
160 |     }
161 | 
162 |     template <typename octet_iterator>
163 |     uint32_t prior(octet_iterator& it, octet_iterator start)
164 |     {
165 |         // can't do much if it == start
166 |         if (it == start)
167 |             throw not_enough_room();
168 | 
169 |         octet_iterator end = it;
170 |         // Go back until we hit either a lead octet or start
171 |         while (internal::is_trail(*(--it)))
172 |             if (it == start)
173 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
174 |         return peek_next(it, end);
175 |     }
176 | 
177 |     /// Deprecated in versions that include "prior"
178 |     template <typename octet_iterator>
179 |     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
180 |     {
181 |         octet_iterator end = it;
182 |         while (internal::is_trail(*(--it)))
183 |             if (it == pass_start)
184 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
185 |         octet_iterator temp = it;
186 |         return next(temp, end);
187 |     }
188 | 
189 |     template <typename octet_iterator, typename distance_type>
190 |     void advance (octet_iterator& it, distance_type n, octet_iterator end)
191 |     {
192 |         for (distance_type i = 0; i < n; ++i)
193 |             next(it, end);
194 |     }
195 | 
196 |     template <typename octet_iterator>
197 |     typename std::iterator_traits<octet_iterator>::difference_type
198 |     distance (octet_iterator first, octet_iterator last)
199 |     {
200 |         typename std::iterator_traits<octet_iterator>::difference_type dist;
201 |         for (dist = 0; first < last; ++dist)
202 |             next(first, last);
203 |         return dist;
204 |     }
205 | 
206 |     template <typename u16bit_iterator, typename octet_iterator>
207 |     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
208 |     {
209 |         while (start != end) {
210 |             uint32_t cp = internal::mask16(*start++);
211 |             // Take care of surrogate pairs first
212 |             if (internal::is_lead_surrogate(cp)) {
213 |                 if (start != end) {
214 |                     uint32_t trail_surrogate = internal::mask16(*start++);
215 |                     if (internal::is_trail_surrogate(trail_surrogate))
216 |                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
217 |                     else
218 |                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
219 |                 }
220 |                 else
221 |                     throw invalid_utf16(static_cast<uint16_t>(cp));
222 | 
223 |             }
224 |             // Lone trail surrogate
225 |             else if (internal::is_trail_surrogate(cp))
226 |                 throw invalid_utf16(static_cast<uint16_t>(cp));
227 | 
228 |             result = append(cp, result);
229 |         }
230 |         return result;
231 |     }
232 | 
233 |     template <typename u16bit_iterator, typename octet_iterator>
234 |     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
235 |     {
236 |         while (start != end) {
237 |             uint32_t cp = next(start, end);
238 |             if (cp > 0xffff) { //make a surrogate pair
239 |                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
240 |                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
241 |             }
242 |             else
243 |                 *result++ = static_cast<uint16_t>(cp);
244 |         }
245 |         return result;
246 |     }
247 | 
248 |     template <typename octet_iterator, typename u32bit_iterator>
249 |     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
250 |     {
251 |         while (start != end)
252 |             result = append(*(start++), result);
253 | 
254 |         return result;
255 |     }
256 | 
257 |     template <typename octet_iterator, typename u32bit_iterator>
258 |     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
259 |     {
260 |         while (start != end)
261 |             (*result++) = next(start, end);
262 | 
263 |         return result;
264 |     }
265 | 
266 |     // The iterator class
267 |     template <typename octet_iterator>
268 |     class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
269 |       octet_iterator it;
270 |       octet_iterator range_start;
271 |       octet_iterator range_end;
272 |       public:
273 |       iterator () {};
274 |       explicit iterator (const octet_iterator& octet_it,
275 |                          const octet_iterator& range_start,
276 |                          const octet_iterator& range_end) :
277 |                it(octet_it), range_start(range_start), range_end(range_end)
278 |       {
279 |           if (it < range_start || it > range_end)
280 |               throw std::out_of_range("Invalid utf-8 iterator position");
281 |       }
282 |       // the default "big three" are OK
283 |       octet_iterator base () const { return it; }
284 |       uint32_t operator * () const
285 |       {
286 |           octet_iterator temp = it;
287 |           return next(temp, range_end);
288 |       }
289 |       bool operator == (const iterator& rhs) const
290 |       {
291 |           if (range_start != rhs.range_start || range_end != rhs.range_end)
292 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
293 |           return (it == rhs.it);
294 |       }
295 |       bool operator != (const iterator& rhs) const
296 |       {
297 |           return !(operator == (rhs));
298 |       }
299 |       iterator& operator ++ ()
300 |       {
301 |           next(it, range_end);
302 |           return *this;
303 |       }
304 |       iterator operator ++ (int)
305 |       {
306 |           iterator temp = *this;
307 |           next(it, range_end);
308 |           return temp;
309 |       }
310 |       iterator& operator -- ()
311 |       {
312 |           prior(it, range_start);
313 |           return *this;
314 |       }
315 |       iterator operator -- (int)
316 |       {
317 |           iterator temp = *this;
318 |           prior(it, range_start);
319 |           return temp;
320 |       }
321 |     }; // class iterator
322 | 
323 | } // namespace utf8
324 | 
325 | #endif //header guard
326 | 
327 | 
328 | 


--------------------------------------------------------------------------------
/src/3rdparty/utfcpp/utf8/core.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include <iterator>
 32 | 
 33 | namespace utf8
 34 | {
    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers.
    // They are declared inside namespace utf8 and are plain aliases of
    // built-in types, so their widths are those of unsigned char, short and
    // int on the target platform.
    // You may need to change them to match your system.
    // These typedefs have the same names as ones from cstdint, or boost/cstdint
    typedef unsigned char   uint8_t;
    typedef unsigned short  uint16_t;
    typedef unsigned int    uint32_t;
 41 | 
 42 | // Helper code - not intended to be directly called by the library users. May be changed at any time
 43 | namespace internal
 44 | {
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Added to (cp >> 10) to obtain the lead surrogate of a code point above 0xffff.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    // Added to (lead << 10) + trail to recombine a surrogate pair into a code point.
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 57 | 
 58 |     template<typename octet_type>
 59 |     inline uint8_t mask8(octet_type oc)
 60 |     {
 61 |         return static_cast<uint8_t>(0xff & oc);
 62 |     }
 63 |     template<typename u16_type>
 64 |     inline uint16_t mask16(u16_type oc)
 65 |     {
 66 |         return static_cast<uint16_t>(0xffff & oc);
 67 |     }
 68 |     template<typename octet_type>
 69 |     inline bool is_trail(octet_type oc)
 70 |     {
 71 |         return ((mask8(oc) >> 6) == 0x2);
 72 |     }
 73 | 
 74 |     template <typename u16>
 75 |     inline bool is_lead_surrogate(u16 cp)
 76 |     {
 77 |         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
 78 |     }
 79 | 
 80 |     template <typename u16>
 81 |     inline bool is_trail_surrogate(u16 cp)
 82 |     {
 83 |         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 84 |     }
 85 | 
 86 |     template <typename u16>
 87 |     inline bool is_surrogate(u16 cp)
 88 |     {
 89 |         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 90 |     }
 91 | 
 92 |     template <typename u32>
 93 |     inline bool is_code_point_valid(u32 cp)
 94 |     {
 95 |         return (cp <= CODE_POINT_MAX && !is_surrogate(cp));
 96 |     }
 97 | 
 98 |     template <typename octet_iterator>
 99 |     inline typename std::iterator_traits<octet_iterator>::difference_type
100 |     sequence_length(octet_iterator lead_it)
101 |     {
102 |         uint8_t lead = mask8(*lead_it);
103 |         if (lead < 0x80)
104 |             return 1;
105 |         else if ((lead >> 5) == 0x6)
106 |             return 2;
107 |         else if ((lead >> 4) == 0xe)
108 |             return 3;
109 |         else if ((lead >> 3) == 0x1e)
110 |             return 4;
111 |         else
112 |             return 0;
113 |     }
114 | 
115 |     template <typename octet_difference_type>
116 |     inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
117 |     {
118 |         if (cp < 0x80) {
119 |             if (length != 1) 
120 |                 return true;
121 |         }
122 |         else if (cp < 0x800) {
123 |             if (length != 2) 
124 |                 return true;
125 |         }
126 |         else if (cp < 0x10000) {
127 |             if (length != 3) 
128 |                 return true;
129 |         }
130 | 
131 |         return false;
132 |     }
133 | 
    // Status codes returned by the get_sequence_x decoders below; UTF8_OK
    // means a sequence was decoded, every other value names the reason for
    // the failure.
    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
135 | 
136 |     /// get_sequence_x functions decode utf-8 sequences of the length x
137 | 
138 |     template <typename octet_iterator>
139 |     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t* code_point)
140 |     {
141 |         if (it != end) {
142 |             if (code_point)
143 |                 *code_point = mask8(*it);
144 |             return UTF8_OK;
145 |         }
146 |         return NOT_ENOUGH_ROOM;
147 |     }
148 | 
149 |     template <typename octet_iterator>
150 |     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t* code_point)
151 |     {
152 |         utf_error ret_code = NOT_ENOUGH_ROOM;
153 | 
154 |         if (it != end) {
155 |             uint32_t cp = mask8(*it);
156 |             if (++it != end) {
157 |                 if (is_trail(*it)) {
158 |                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
159 | 
160 |                     if (code_point)
161 |                         *code_point = cp;
162 |                     ret_code = UTF8_OK;
163 |                 }
164 |                 else
165 |                     ret_code = INCOMPLETE_SEQUENCE;
166 |             }
167 |             else
168 |                 ret_code = NOT_ENOUGH_ROOM;
169 |         }
170 | 
171 |         return ret_code;
172 |     }
173 | 
174 |     template <typename octet_iterator>
175 |     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t* code_point)
176 |     {
177 |         utf_error ret_code = NOT_ENOUGH_ROOM;
178 | 
179 |         if (it != end) {
180 |             uint32_t cp = mask8(*it);
181 |             if (++it != end) {
182 |                 if (is_trail(*it)) {
183 |                     cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
184 |                     if (++it != end) {
185 |                         if (is_trail(*it)) {
186 |                             cp += (*it) & 0x3f;
187 | 
188 |                             if (code_point)
189 |                                 *code_point = cp;
190 |                             ret_code = UTF8_OK;
191 |                         }
192 |                         else 
193 |                             ret_code = INCOMPLETE_SEQUENCE;
194 |                     }
195 |                     else
196 |                         ret_code = NOT_ENOUGH_ROOM;
197 |                 }
198 |                 else
199 |                     ret_code = INCOMPLETE_SEQUENCE;
200 |             }
201 |             else
202 |                 ret_code = NOT_ENOUGH_ROOM;
203 |         }
204 | 
205 |         return ret_code;
206 |     }
207 | 
208 |     template <typename octet_iterator>
209 |     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t* code_point)
210 |     {
211 |         utf_error ret_code = NOT_ENOUGH_ROOM;
212 | 
213 |         if (it != end) {
214 |             uint32_t cp = mask8(*it);
215 |             if (++it != end) {
216 |                 if (is_trail(*it)) {
217 |                     cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
218 |                     if (++it != end) {
219 |                         if (is_trail(*it)) {
220 |                             cp += (mask8(*it) << 6) & 0xfff;
221 |                             if (++it != end) {
222 |                                 if (is_trail(*it)) {
223 |                                     cp += (*it) & 0x3f;
224 | 
225 |                                     if (code_point)
226 |                                         *code_point = cp;
227 |                                     ret_code = UTF8_OK;
228 |                                 }
229 |                                 else
230 |                                     ret_code = INCOMPLETE_SEQUENCE;
231 |                             }
232 |                             else
233 |                                 ret_code = NOT_ENOUGH_ROOM;
234 |                         }
235 |                         else
236 |                             ret_code = INCOMPLETE_SEQUENCE;
237 |                     }
238 |                     else
239 |                         ret_code = NOT_ENOUGH_ROOM;
240 |                 }
241 |                 else 
242 |                     ret_code = INCOMPLETE_SEQUENCE;
243 |             }
244 |             else
245 |                 ret_code = NOT_ENOUGH_ROOM;
246 |         }
247 | 
248 |         return ret_code;
249 |     }
250 | 
251 |     template <typename octet_iterator>
252 |     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
253 |     {
254 |         // Save the original value of it so we can go back in case of failure
255 |         // Of course, it does not make much sense with i.e. stream iterators
256 |         octet_iterator original_it = it;
257 | 
258 |         uint32_t cp = 0;
259 |         // Determine the sequence length based on the lead octet
260 |         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
261 |         octet_difference_type length = sequence_length(it);
262 |         if (length == 0)
263 |             return INVALID_LEAD;
264 | 
265 |         // Now that we have a valid sequence length, get trail octets and calculate the code point
266 |         utf_error err = UTF8_OK;
267 |         switch (length) {
268 |             case 1:
269 |                 err = get_sequence_1(it, end, &cp);
270 |                 break;
271 |             case 2:
272 |                 err = get_sequence_2(it, end, &cp);
273 |             break;
274 |             case 3:
275 |                 err = get_sequence_3(it, end, &cp);
276 |             break;
277 |             case 4:
278 |                 err = get_sequence_4(it, end, &cp);
279 |             break;
280 |         }
281 | 
282 |         if (err == UTF8_OK) {
283 |             // Decoding succeeded. Now, security checks...
284 |             if (is_code_point_valid(cp)) {
285 |                 if (!is_overlong_sequence(cp, length)){
286 |                     // Passed! Return here.
287 |                     if (code_point)
288 |                         *code_point = cp;
289 |                     ++it;
290 |                     return UTF8_OK;
291 |                 }
292 |                 else
293 |                     err = OVERLONG_SEQUENCE;
294 |             }
295 |             else 
296 |                 err = INVALID_CODE_POINT;
297 |         }
298 | 
299 |         // Failure branch - restore the original value of the iterator
300 |         it = original_it;
301 |         return err;
302 |     }
303 | 
304 |     template <typename octet_iterator>
305 |     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
306 |         return validate_next(it, end, 0);
307 |     }
308 | 
309 | } // namespace internal
310 | 
311 |     /// The library API - functions intended to be called by the users
312 | 
    // Byte order mark: the three octets 0xEF 0xBB 0xBF that starts_with_bom()
    // and is_bom() compare the beginning of a sequence against.
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};
315 | 
316 |     template <typename octet_iterator>
317 |     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
318 |     {
319 |         octet_iterator result = start;
320 |         while (result != end) {
321 |             internal::utf_error err_code = internal::validate_next(result, end);
322 |             if (err_code != internal::UTF8_OK)
323 |                 return result;
324 |         }
325 |         return result;
326 |     }
327 | 
328 |     template <typename octet_iterator>
329 |     inline bool is_valid(octet_iterator start, octet_iterator end)
330 |     {
331 |         return (find_invalid(start, end) == end);
332 |     }
333 | 
334 |     template <typename octet_iterator>
335 |     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
336 |     {
337 |         return (
338 |             ((it != end) && (internal::mask8(*it++)) == bom[0]) &&
339 |             ((it != end) && (internal::mask8(*it++)) == bom[1]) &&
340 |             ((it != end) && (internal::mask8(*it))   == bom[2])
341 |            );
342 |     }
343 | 	
344 | 	//Deprecated in release 2.3 
345 |     template <typename octet_iterator>
346 |     inline bool is_bom (octet_iterator it)
347 |     {
348 |         return (
349 |             (internal::mask8(*it++)) == bom[0] &&
350 |             (internal::mask8(*it++)) == bom[1] &&
351 |             (internal::mask8(*it))   == bom[2]
352 |            );
353 |     }
354 | } // namespace utf8
355 | 
356 | #endif // header guard
357 | 
358 | 
359 | 


--------------------------------------------------------------------------------
/src/3rdparty/utfcpp/utf8/unchecked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | 
 33 | namespace utf8
 34 | {
 35 |     namespace unchecked 
 36 |     {
 37 |         template <typename octet_iterator>
 38 |         octet_iterator append(uint32_t cp, octet_iterator result)
 39 |         {
 40 |             if (cp < 0x80)                        // one octet
 41 |                 *(result++) = static_cast<uint8_t>(cp);  
 42 |             else if (cp < 0x800) {                // two octets
 43 |                 *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
 44 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 45 |             }
 46 |             else if (cp < 0x10000) {              // three octets
 47 |                 *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
 48 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 49 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 50 |             }
 51 |             else {                                // four octets
 52 |                 *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
 53 |                 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
 54 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 55 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 56 |             }
 57 |             return result;
 58 |         }
 59 | 
 60 |         template <typename octet_iterator>
 61 |         uint32_t next(octet_iterator& it)
 62 |         {
 63 |             uint32_t cp = internal::mask8(*it);
 64 |             typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
 65 |             switch (length) {
 66 |                 case 1:
 67 |                     break;
 68 |                 case 2:
 69 |                     it++;
 70 |                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
 71 |                     break;
 72 |                 case 3:
 73 |                     ++it; 
 74 |                     cp = ((cp << 12) & 0xffff) + ((internal::mask8(*it) << 6) & 0xfff);
 75 |                     ++it;
 76 |                     cp += (*it) & 0x3f;
 77 |                     break;
 78 |                 case 4:
 79 |                     ++it;
 80 |                     cp = ((cp << 18) & 0x1fffff) + ((internal::mask8(*it) << 12) & 0x3ffff);                
 81 |                     ++it;
 82 |                     cp += (internal::mask8(*it) << 6) & 0xfff;
 83 |                     ++it;
 84 |                     cp += (*it) & 0x3f; 
 85 |                     break;
 86 |             }
 87 |             ++it;
 88 |             return cp;        
 89 |         }
 90 | 
 91 |         template <typename octet_iterator>
 92 |         uint32_t peek_next(octet_iterator it)
 93 |         {
 94 |             return next(it);    
 95 |         }
 96 | 
 97 |         template <typename octet_iterator>
 98 |         uint32_t prior(octet_iterator& it)
 99 |         {
100 |             while (internal::is_trail(*(--it))) ;
101 |             octet_iterator temp = it;
102 |             return next(temp);
103 |         }
104 | 
105 |         // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
106 |         template <typename octet_iterator>
107 |         inline uint32_t previous(octet_iterator& it)
108 |         {
109 |             return prior(it);
110 |         }
111 | 
112 |         template <typename octet_iterator, typename distance_type>
113 |         void advance (octet_iterator& it, distance_type n)
114 |         {
115 |             for (distance_type i = 0; i < n; ++i)
116 |                 next(it);
117 |         }
118 | 
119 |         template <typename octet_iterator>
120 |         typename std::iterator_traits<octet_iterator>::difference_type
121 |         distance (octet_iterator first, octet_iterator last)
122 |         {
123 |             typename std::iterator_traits<octet_iterator>::difference_type dist;
124 |             for (dist = 0; first < last; ++dist) 
125 |                 next(first);
126 |             return dist;
127 |         }
128 | 
129 |         template <typename u16bit_iterator, typename octet_iterator>
130 |         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
131 |         {       
132 |             while (start != end) {
133 |                 uint32_t cp = internal::mask16(*start++);
134 |             // Take care of surrogate pairs first
135 |                 if (internal::is_lead_surrogate(cp)) {
136 |                     uint32_t trail_surrogate = internal::mask16(*start++);
137 |                     cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
138 |                 }
139 |                 result = append(cp, result);
140 |             }
141 |             return result;         
142 |         }
143 | 
144 |         template <typename u16bit_iterator, typename octet_iterator>
145 |         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
146 |         {
147 |             while (start < end) {
148 |                 uint32_t cp = next(start);
149 |                 if (cp > 0xffff) { //make a surrogate pair
150 |                     *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
151 |                     *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
152 |                 }
153 |                 else
154 |                     *result++ = static_cast<uint16_t>(cp);
155 |             }
156 |             return result;
157 |         }
158 | 
159 |         template <typename octet_iterator, typename u32bit_iterator>
160 |         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
161 |         {
162 |             while (start != end)
163 |                 result = append(*(start++), result);
164 | 
165 |             return result;
166 |         }
167 | 
168 |         template <typename octet_iterator, typename u32bit_iterator>
169 |         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
170 |         {
171 |             while (start < end)
172 |                 (*result++) = next(start);
173 | 
174 |             return result;
175 |         }
176 | 
177 |         // The iterator class
178 |         template <typename octet_iterator>
179 |           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 
180 |             octet_iterator it;
181 |             public:
182 |             iterator () {};
183 |             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
184 |             // the default "big three" are OK
185 |             octet_iterator base () const { return it; }
186 |             uint32_t operator * () const
187 |             {
188 |                 octet_iterator temp = it;
189 |                 return next(temp);
190 |             }
191 |             bool operator == (const iterator& rhs) const 
192 |             { 
193 |                 return (it == rhs.it);
194 |             }
195 |             bool operator != (const iterator& rhs) const
196 |             {
197 |                 return !(operator == (rhs));
198 |             }
199 |             iterator& operator ++ () 
200 |             {
201 |                 std::advance(it, internal::sequence_length(it));
202 |                 return *this;
203 |             }
204 |             iterator operator ++ (int)
205 |             {
206 |                 iterator temp = *this;
207 |                 std::advance(it, internal::sequence_length(it));
208 |                 return temp;
209 |             }  
210 |             iterator& operator -- ()
211 |             {
212 |                 prior(it);
213 |                 return *this;
214 |             }
215 |             iterator operator -- (int)
216 |             {
217 |                 iterator temp = *this;
218 |                 prior(it);
219 |                 return temp;
220 |             }
221 |           }; // class iterator
222 | 
223 |     } // namespace utf8::unchecked
224 | } // namespace utf8 
225 | 
226 | 
227 | #endif // header guard
228 | 
229 | 


--------------------------------------------------------------------------------
/src/bin/phonetisaurus-arpa2wfst.cc:
--------------------------------------------------------------------------------
 1 | /*
 2 |  phonetisaurus-arpa2wfst.cc
 3 | 
 4 |  Copyright (c) [2012-], Josef Robert Novak
 5 |  All rights reserved.
 6 | 
 7 |    Redistribution and use in source and binary forms, with or without
 8 |    modification, are permitted provided that the following conditions
 9 |    are met:
10 | 
11 |    * Redistributions of source code must retain the above copyright 
12 |      notice, this list of conditions and the following disclaimer.
13 |    * Redistributions in binary form must reproduce the above 
14 |      copyright notice, this list of conditions and the following 
15 |      disclaimer in the documentation and/or other materials provided 
16 |      with the distribution.
17 | 
18 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
19 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
20 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
21 |    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
22 |    COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
23 |    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
24 |    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
25 |    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
26 |    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
27 |    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
28 |    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 
29 |    OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | */
32 | using namespace std;
33 | #include <include/ARPA2WFST.h>
34 | #include <include/util.h>
35 | 
36 | using namespace fst;
37 | 
// Command-line flags. Each DEFINE_string creates the FLAGS_<name> variable
// that main() reads below; the third argument is the help text.
DEFINE_string (lm, "", "Input ARPA format LM.");
DEFINE_string (eps, "<eps>", "Epsilon symbol.");
DEFINE_string (sb, "<s>", "Sentence begin token.");
DEFINE_string (se, "</s>", "Sentence end token.");
DEFINE_string (split, "}", "Character separating grapheme/phoneme info.");
DEFINE_string (skip, "_", "Character indicating insertions/deletions.");
DEFINE_string (tie, "|", "Character separating multi-token subsequences.");
// Empty (the default) means the state symbol table is not written.
DEFINE_string (ssyms, "", "Output filename for state symbols tables (default: do not print).");
DEFINE_string (ofile, "", "Output file for writing. (STDOUT)");
47 | 
48 | int main (int argc, char* argv []) {
49 |   cerr << "GitRevision: " << GIT_REVISION << endl;
50 |   string usage = "arpa2wfsa - Transform an ARPA LM into an "
51 |     "equivalent WFSA.\n\n Usage: ";
52 |   set_new_handler (FailedNewHandler);
53 |   PhonetisaurusSetFlags (usage.c_str(), &argc, &argv, false);
54 | 
55 |   if (FLAGS_lm.compare ("") == 0) {
56 |     cerr << "You must supply an ARPA format lm "
57 |       "to --lm for conversion!" << endl;
58 |     return 0;
59 |   }
60 |     
61 |   cerr << "Initializing..." << endl;
62 |   ARPA2WFST* converter = new ARPA2WFST (FLAGS_lm, FLAGS_eps, FLAGS_sb, 
63 | 					FLAGS_se, FLAGS_split, FLAGS_skip, 
64 | 					FLAGS_tie);
65 |   cerr << "Converting..." << endl;
66 |   converter->arpa_to_wfst ();
67 |   
68 |   converter->arpafst.Write (FLAGS_ofile);
69 | 
70 |   if (FLAGS_ssyms.compare ("") != 0) {
71 |     converter->ssyms->WriteText (FLAGS_ssyms);
72 |   }
73 |   
74 |   delete converter;
75 | 
76 |   return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/src/bin/phonetisaurus-g2pfst.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  phonetisaurus-g2pfst.cc
  3 | 
  4 |  Copyright (c) [2012-], Josef Robert Novak
  5 |  All rights reserved.
  6 | 
  7 |    Redistribution and use in source and binary forms, with or without
  8 |    modification, are permitted provided that the following conditions
  9 |    are met:
 10 | 
 11 |    * Redistributions of source code must retain the above copyright 
 12 |      notice, this list of conditions and the following disclaimer.
 13 |    * Redistributions in binary form must reproduce the above 
 14 |      copyright notice, this list of conditions and the following 
 15 |      disclaimer in the documentation and/or other materials provided 
 16 |      with the distribution.
 17 | 
 18 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 19 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 20 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 21 |    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 22 |    COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
 23 |    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 24 |    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 25 |    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
 26 |    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
 27 |    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 28 |    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 
 29 |    OF THE POSSIBILITY OF SUCH DAMAGE.
 30 | *
 31 | */
 32 | #include <fst/fstlib.h>
 33 | using namespace std;
 34 | #include <include/PhonetisaurusScript.h>
 35 | #include <include/util.h>
 36 | #include <iomanip>
 37 | using namespace fst;
 38 | 
// Map from an int key to a list of PathData results.
// NOTE(review): no use of RMAP is visible in this file - confirm before
// relying on or removing it.
typedef unordered_map<int, vector<PathData> > RMAP;
 40 | 
 41 | void PrintPathData (const vector<PathData>& results, string FLAGS_word,
 42 | 		    const SymbolTable* osyms, bool print_scores = true,
 43 | 		    bool nlog_probs = true) {
 44 |   for (int i = 0; i < results.size (); i++) {
 45 |     cout << FLAGS_word << "\t";
 46 |     if (print_scores == true) {
 47 |       if (nlog_probs == true) 
 48 | 	cout << results [i].PathWeight << "\t";
 49 |       else
 50 | 	cout << std::setprecision (3) << exp (-results [i].PathWeight) << "\t";
 51 |     }
 52 |     
 53 |     for (int j = 0; j < results [i].Uniques.size (); j++) {
 54 |       cout << osyms->Find (results [i].Uniques [j]);
 55 |       if (j < results [i].Uniques.size () - 1)
 56 | 	cout << " ";
 57 |     }
 58 |     cout << endl;
 59 |   }    
 60 | }
 61 | 
 62 | void EvaluateWordlist (PhonetisaurusScript& decoder, vector<string> corpus,
 63 | 		       int FLAGS_beam, int FLAGS_nbest, bool FLAGS_reverse,
 64 | 		       string FLAGS_skip, double FLAGS_thresh, string FLAGS_gsep,
 65 | 		       bool FLAGS_write_fsts, bool FLAGS_print_scores,
 66 | 		       bool FLAGS_accumulate, double FLAGS_pmass,
 67 | 		       bool FLAGS_nlog_probs) {
 68 |   for (int i = 0; i < corpus.size (); i++) {
 69 |     vector<PathData> results = decoder.Phoneticize (corpus [i], FLAGS_nbest,
 70 | 						    FLAGS_beam, FLAGS_thresh,
 71 | 						    FLAGS_write_fsts,
 72 | 						    FLAGS_accumulate, FLAGS_pmass);
 73 |     PrintPathData (results, corpus [i],
 74 | 		   decoder.osyms_,
 75 | 		   FLAGS_print_scores,
 76 | 		   FLAGS_nlog_probs);
 77 |   }
 78 | }
 79 | 
 80 | 
// Command-line flags. Each DEFINE_* creates the FLAGS_<name> variable that
// main() reads below; the last argument is the help text.
DEFINE_string (model, "", "Input FST G2P model.");
DEFINE_string (word, "", "Input word to phoneticize.");
DEFINE_string (wordlist, "", "Input wordlist to phoneticize");
DEFINE_string (gsep, "", "Grapheme separator.");
DEFINE_string (skip, "_", "Phoneme skip marker.");
DEFINE_int32 (nbest, 1, "N-best hypotheses to output.");
DEFINE_int32 (beam, 10000, "Decoder beam.");
DEFINE_double (thresh, 99.0, "N-best comparison threshold.");
// main() accepts 0.0 <= pmass <= 1.0; 0.0 (the default) is replaced by 99.0
// and any other value by -log (pmass) before it is passed to the decoder.
DEFINE_double (pmass, 0.0, "Percent of probability mass (0.0 < p <= 1.0).");
DEFINE_bool (write_fsts, false, "Write the output FSTs for debugging.");
DEFINE_bool (reverse, false, "Reverse input word.");
DEFINE_bool (print_scores, true, "Print scores in output.");
DEFINE_bool (accumulate, false, "Accumulate weights for unique output prons.");
DEFINE_bool (nlog_probs, true, "Default scores vals are negative logs. "
	     "Otherwise exp (-val).");
 96 | int main (int argc, char* argv []) {
 97 |   cerr << "GitRevision: " << GIT_REVISION << endl;
 98 |   string usage = "phonetisaurus-g2pfst - joint N-gram decoder.\n\n Usage: ";
 99 |   set_new_handler (FailedNewHandler);
100 |   PhonetisaurusSetFlags (usage.c_str(), &argc, &argv, false);
101 | 
102 |   if (FLAGS_model.compare ("") == 0) {
103 |     cerr << "You must supply an FST model to --model" << endl;
104 |     exit (1);
105 |   } else {
106 |     std::ifstream model_ifp (FLAGS_model);
107 |     if (!model_ifp.good ()) {
108 |       cout << "Failed to open --model file '"
109 | 	   << FLAGS_model << "'" << endl;
110 |       exit (1);
111 |     }
112 |   }
113 | 
114 |   if (FLAGS_pmass < 0.0 || FLAGS_pmass > 1) {
115 |     cout << "--pmass must be a float value between 0.0 and 1.0." << endl;
116 |     exit (1);
117 |   }
118 |   if (FLAGS_pmass == 0.0)
119 |     FLAGS_pmass = 99.0;
120 |   else
121 |     FLAGS_pmass = -log (FLAGS_pmass);
122 |   
123 |   bool use_wordlist = false;
124 |   if (FLAGS_wordlist.compare ("") != 0) {
125 |     std::ifstream wordlist_ifp (FLAGS_wordlist);
126 |     if (!wordlist_ifp.good ()) {
127 |       cout << "Failed to open --wordlist file '"
128 | 	   << FLAGS_wordlist << "'" << endl;
129 |       exit (1);
130 |     } else {
131 |       use_wordlist = true;
132 |     }
133 |   }
134 | 
135 |   if (FLAGS_wordlist.compare ("") == 0 && FLAGS_word.compare ("") == 0) {
136 |     cout << "Either --wordlist or --word must be set!" << endl;
137 |     exit (1);
138 |   }
139 | 
140 |   if (use_wordlist == true) {
141 |     vector<string> corpus;
142 |     LoadWordList (FLAGS_wordlist, &corpus);
143 |     
144 |     PhonetisaurusScript decoder (FLAGS_model, FLAGS_gsep);
145 |     EvaluateWordlist (
146 | 	    decoder, corpus, FLAGS_beam, FLAGS_nbest, FLAGS_reverse,
147 | 	    FLAGS_skip, FLAGS_thresh, FLAGS_gsep, FLAGS_write_fsts,
148 | 	    FLAGS_print_scores, FLAGS_accumulate, FLAGS_pmass,
149 | 	    FLAGS_nlog_probs
150 | 	  );
151 |   } else {
152 |     PhonetisaurusScript decoder (FLAGS_model, FLAGS_gsep);
153 |     vector<PathData> results = decoder.Phoneticize (
154 | 		         FLAGS_word, FLAGS_nbest, FLAGS_beam, FLAGS_thresh,
155 | 			 FLAGS_write_fsts, FLAGS_accumulate, FLAGS_pmass
156 | 		       );
157 |     PrintPathData (results, FLAGS_word,
158 | 		   decoder.osyms_,
159 | 		   FLAGS_print_scores,
160 | 		   FLAGS_nlog_probs);
161 |   }
162 |   
163 |   return 0;
164 | }
165 | 


--------------------------------------------------------------------------------
/src/bin/phonetisaurus-g2prnn.cc:
--------------------------------------------------------------------------------
  1 | #include <fst/fstlib.h>
  2 | using namespace std;
  3 | #include <include/LegacyRnnLMHash.h>
  4 | #include <include/LegacyRnnLMDecodable.h>
  5 | #include <include/LegacyRnnLMReader.h>
  6 | #include <include/RnnLMDecoder.h>
  7 | #include <include/util.h>
  8 | #include "utf8.h"
  9 | #ifdef _OPENMP
 10 | #include <omp.h>
 11 | #endif
 12 | using namespace fst;
 13 | 
// Decodable: LegacyRnnLMDecodable instantiated with Token and
// LegacyRnnLMHash; the type handed to RnnLMDecoder in the functions below.
typedef LegacyRnnLMDecodable<Token, LegacyRnnLMHash> Decodable;
// RMAP: decoding results keyed by int; ThreadedEvaluateWordlist uses the
// word's position in the corpus as the key.
typedef unordered_map<int, SimpleResult> RMAP;
 16 | 
 17 | 
 18 | void ThreadedEvaluateWordlist (vector<string>& corpus, RMAP& rmap,
 19 | 			       LegacyRnnLMHash& h, Decodable& s, 
 20 | 			       int FLAGS_threads, int FLAGS_beam, 
 21 | 			       int FLAGS_kmax, int FLAGS_nbest, 
 22 | 			       bool FLAGS_reverse, string FLAGS_gpdelim,
 23 | 			       string FLAGS_gdelim, string FLAGS_skip,
 24 | 			       double FLAGS_thresh, string FLAGS_gsep) {
 25 |   int csize = corpus.size ();
 26 | 
 27 | #ifdef _OPENMP
 28 | #pragma omp parallel for
 29 | #endif
 30 |   for (int x = 0; x < FLAGS_threads; x++) {
 31 |     RnnLMDecoder<Decodable> decoder (s);
 32 | 
 33 |     int start = x * (csize / FLAGS_threads);
 34 |     int end   = (x == FLAGS_threads - 1) ? csize \
 35 |       : start + (csize / FLAGS_threads);
 36 |     for (int i = start; i < end; i++) {
 37 |       vector<string> graphemes = tokenize_utf8_string (&corpus [i],
 38 | 							 &FLAGS_gsep);
 39 |       if (FLAGS_reverse == true)
 40 | 	reverse (graphemes.begin (), graphemes.end ());
 41 | 
 42 |       graphemes.push_back ("</s>");
 43 |       SimpleResult result = \
 44 | 	decoder.Decode (graphemes, FLAGS_beam, FLAGS_kmax, 
 45 | 			FLAGS_nbest, FLAGS_thresh, FLAGS_gpdelim,
 46 | 			FLAGS_gdelim, FLAGS_skip);
 47 |       rmap [i] = result;
 48 |     }
 49 |   }
 50 | 
 51 |   for (int i = 0; i < csize; i++) {
 52 |     const SimpleResult& result = rmap [i];
 53 | 
 54 |     for (int k = 0; k < result.pronunciations.size (); k++)
 55 |       cout << result.word << "\t" << result.scores [k] << "\t" 
 56 | 	   << result.pronunciations [k] << "\n";
 57 |   }
 58 | }
 59 | 
 60 | void EvaluateWordlist (vector<string>& corpus,
 61 | 		       LegacyRnnLMHash& h, Decodable& s, int FLAGS_beam, 
 62 | 		       int FLAGS_kmax, int FLAGS_nbest, bool FLAGS_reverse, 
 63 | 		       string FLAGS_gpdelim, string FLAGS_gdelim, 
 64 | 		       string FLAGS_skip, double FLAGS_thresh,
 65 | 		       string FLAGS_gsep) {
 66 | 
 67 |   RnnLMDecoder<Decodable> decoder (s);
 68 |   for (int i = 0; i < corpus.size (); i++) {
 69 |     vector<string> graphemes = tokenize_utf8_string (&corpus [i],
 70 | 						     &FLAGS_gsep);
 71 |     if (FLAGS_reverse == true)
 72 | 	reverse (graphemes.begin (), graphemes.end ());
 73 | 
 74 |     graphemes.push_back ("</s>");
 75 |     
 76 |     SimpleResult result = \
 77 |       decoder.Decode (graphemes, FLAGS_beam, FLAGS_kmax, 
 78 | 		      FLAGS_nbest, FLAGS_thresh, FLAGS_gpdelim,
 79 | 		      FLAGS_gdelim, FLAGS_skip);
 80 |     
 81 |     for (int k = 0; k < result.pronunciations.size (); k++)
 82 |       cout << result.word << "\t" << result.scores [k] << "\t" 
 83 | 	   << result.pronunciations [k] << "\n";
 84 |   }
 85 | }
 86 | 
 87 | void EvaluateWord (string word, LegacyRnnLMHash& h, Decodable& s, 
 88 | 		   int FLAGS_beam, int FLAGS_kmax, int FLAGS_nbest, 
 89 | 		   bool FLAGS_reverse, string FLAGS_gpdelim, 
 90 | 		   string FLAGS_gdelim, string FLAGS_skip, 
 91 | 		   double FLAGS_thresh, string FLAGS_gsep) {
 92 | 
 93 |   vector<string> graphemes = tokenize_utf8_string (&word,
 94 | 						   &FLAGS_gsep);
 95 |   if (FLAGS_reverse == true)
 96 |     reverse (graphemes.begin (), graphemes.end ());
 97 |   graphemes.push_back ("</s>");
 98 |   
 99 |   RnnLMDecoder<Decodable> decoder (s);
100 |   SimpleResult result =	\
101 |       decoder.Decode (graphemes, FLAGS_beam, FLAGS_kmax, 
102 | 		      FLAGS_nbest, FLAGS_thresh, FLAGS_gpdelim,
103 | 		      FLAGS_gdelim, FLAGS_skip);
104 |     
105 |   for (int k = 0; k < result.pronunciations.size (); k++)
106 |     cout << result.word << "\t" << result.scores [k] << "\t" 
107 | 	 << result.pronunciations [k] << "\n";
108 | }
109 | 
// Command-line flags for phonetisaurus-g2prnn.  Each definition below is
// read in main () through the corresponding FLAGS_<name> variable.
//   Model / input selection: rnnlm, wordlist, word
//   Token delimiters:        gdelim, gpdelim, gsep, skip
//   Decoder controls:        nbest, kmax, beam, thresh, reverse
//   Parallelism:             threads (only applied when built with OpenMP)
DEFINE_string (rnnlm, "", "The input RnnLM model.");
DEFINE_string (wordlist, "", "Input word list to evaluate.");
DEFINE_string (word, "", "Single input word to evaluate.");
DEFINE_string (gdelim, "|", "The default multigram delimiter.");
DEFINE_string (gpdelim, "}", "The default grapheme / phoneme delimiter.");
DEFINE_string (gsep, "", "The default grapheme delimiter for testing.  Typically ''.");
DEFINE_string (skip, "_", "The default null/skip token.");
DEFINE_int32  (nbest, 1, "Maximum number of hypotheses to return.");
DEFINE_int32  (threads, 1, "Number of parallel threads (OpenMP).");
DEFINE_int32  (kmax, 20, "State-local maximum queue size.");
DEFINE_int32  (beam, 20, "The state-local beam width.");
DEFINE_double (thresh, 0.0, "The n-best pruning threshold. Relative to 1-best.");
DEFINE_bool   (reverse, false, "Reverse the input word before decoding.");
123 | 
124 | int main (int argc, char* argv []) {
125 |   cerr << "GitRevision: " << GIT_REVISION << endl;
126 |   string usage = "phonetisaurus-g2prnn --rnnlm=test.rnnlm " \
127 |     "--wordlist=test.words --nbest=5\n\n Usage: ";
128 |   set_new_handler (FailedNewHandler);
129 |   PhonetisaurusSetFlags (usage.c_str (), &argc, &argv, false);
130 | 
131 |   if (FLAGS_rnnlm.compare ("") == 0) {
132 |     cout << "--rnnlm model is required!" << endl;
133 |     exit (1);
134 |   } else {
135 |     std::ifstream rnnlm_ifp (FLAGS_rnnlm);
136 |     if (!rnnlm_ifp.good ()) {
137 |       cout << "Faile to open --rnnlm file '"
138 | 	   << FLAGS_rnnlm << "'" << endl;
139 |       exit (1);
140 |     }
141 |   }
142 | 
143 |   bool use_wordlist = false;
144 |   if (FLAGS_wordlist.compare ("") != 0) {
145 |     std::ifstream wordlist_ifp (FLAGS_wordlist);
146 |     if (!wordlist_ifp.good ()) {
147 |       cout << "Failed to open --wordlist file '" 
148 | 	   << FLAGS_wordlist << "'" << endl;
149 |       exit (1);
150 |     } else {
151 |       use_wordlist = true;
152 |     }
153 |   }
154 |       
155 |   if (FLAGS_wordlist.compare ("") == 0 && FLAGS_word.compare ("") == 0) {
156 |     cout << "Either --wordlist or --word must be set!" << endl;
157 |   }
158 |  
159 | #ifdef _OPENMP
160 |   omp_set_num_threads (FLAGS_threads);
161 | #endif
162 |   vector<string> corpus;
163 | 
164 |   LoadWordList (FLAGS_wordlist, &corpus);
165 | 
166 |   RMAP rmap;
167 | 
168 |   LegacyRnnLMReader<Decodable, LegacyRnnLMHash> reader (FLAGS_rnnlm);
169 |   LegacyRnnLMHash h = reader.CopyVocabHash (FLAGS_gdelim, FLAGS_gpdelim);
170 |   Decodable s = reader.CopyLegacyRnnLM (h);
171 | 
172 |   if (use_wordlist == true) {
173 |     if (FLAGS_threads > 1) {
174 |       ThreadedEvaluateWordlist (corpus, rmap, h, s, FLAGS_threads,
175 | 				FLAGS_beam, FLAGS_kmax, FLAGS_nbest,
176 | 				FLAGS_reverse, FLAGS_gpdelim,
177 | 				FLAGS_gdelim, FLAGS_skip,
178 | 				FLAGS_thresh, FLAGS_gsep);
179 |     } else {
180 |       EvaluateWordlist (corpus, h, s, FLAGS_beam, 
181 | 			FLAGS_kmax, FLAGS_nbest, FLAGS_reverse, 
182 | 			FLAGS_gpdelim, FLAGS_gdelim, FLAGS_skip, 
183 | 			FLAGS_thresh, FLAGS_gsep);
184 |     }
185 |   } else {
186 |     EvaluateWord (FLAGS_word, h, s, FLAGS_beam, FLAGS_kmax,
187 | 		  FLAGS_nbest, FLAGS_reverse, FLAGS_gpdelim,
188 | 		  FLAGS_gdelim, FLAGS_skip, FLAGS_thresh, FLAGS_gsep);
189 |   }
190 | 
191 |   return 0;
192 | }
193 | 


--------------------------------------------------------------------------------
/src/include/LatticePruner.h:
--------------------------------------------------------------------------------
 1 | #ifndef SRC_INCLUDE_LATTICEPRUNER_H_
 2 | #define SRC_INCLUDE_LATTICEPRUNER_H_
 3 | /*
 4 |  LatticePruner.hpp
 5 | 
 6 |  Copyright (c) [2012-], Josef Robert Novak
 7 |  All rights reserved.
 8 | 
 9 |  Redistribution and use in source and binary forms, with or without
10 |   modification, are permitted #provided that the following conditions
11 |   are met:
12 | 
13 |   * Redistributions of source code must retain the above copyright
14 |     notice, this list of conditions and the following disclaimer.
15 |   * Redistributions in binary form must reproduce the above
16 |     copyright notice, this list of #conditions and the following
17 |     disclaimer in the documentation and/or other materials provided
18 |     with the distribution.
19 | 
20 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23 |  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24 |  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
25 |  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 |  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 |  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
31 |  OF THE POSSIBILITY OF SUCH DAMAGE.
32 | */
33 | #include <fst/fstlib.h>
34 | #include <vector>
35 | #include "./util.h"
36 | using namespace std;
37 | 
namespace fst {
class LatticePruner {
  /*
    Generic pruning/re-weighting class for simple WFST lattices.
    Implements several simple pruning methods including the following:
       - Arc penalization
       - N-best extraction via ShortestPath()
       - Arc-based beam pruning via Prune()
       - Forward-Backward pruning
    These may be combined into a cascade as well.

    Only the interface is declared here; the method bodies live in a
    separate translation unit.
  */
 public:
  // Basics declarations
  // NOTE(review): presumably the forward / backward scores used by the
  //  forward-backward pruning step - confirm against the implementation.
  vector<LogWeight> alpha, beta;
  // Symbol-based penalty model (supplied by the M2MFstAligner constructor
  //  variant below).
  LabelData         penalties;
  // NOTE(review): the flags below presumably enable the corresponding
  //  stages listed in the class comment (arc penalization, n-best
  //  extraction, forward-backward, beam pruning) - confirm.
  bool              penalize;
  int               nbest;
  bool              fb;
  TropicalWeight    beam;

  // Constructors
  LatticePruner ();
  // Used with M2MFstAligner we should have a symbol-based penalty model to use
  LatticePruner (LabelData _penalties, TropicalWeight _beam, int _nbest,
                 bool _fb, bool _penalize);
  // Otherwise just use an arbitrary lattice/WFST so no penalizing
  LatticePruner (TropicalWeight _beam, int _nbest, bool _fb);

  // Public entry point: prunes the lattice pointed to by `fst`.
  void prune_fst (VectorFst<StdArc>* fst);

 private:
  // Internal pruning stages; each receives the lattice being pruned.
  VectorFst<StdArc> _nbest_prune (VectorFst<StdArc>* fst);
  void _penalize_arcs (VectorFst<StdArc>* fst);
  void _forward_backward (VectorFst<StdArc>* fst);
};
}  // namespace fst
74 | #endif  // SRC_INCLUDE_LATTICEPRUNER_H_
75 | 


--------------------------------------------------------------------------------
/src/include/LegacyRnnLMDecodable.h:
--------------------------------------------------------------------------------
  1 | #ifndef SRC_INCLUDE_LEGACYRNNLMDECODABLE_H_
  2 | #define SRC_INCLUDE_LEGACYRNNLMDECODABLE_H_
  3 | #include <vector>
  4 | using std::vector;
  5 | 
  6 | // Fast exponent implementation from RnnLM
  7 | /*
  8 | static union {
  9 |   double d;
 10 |   struct{
 11 |     int j,i;
 12 |   } n;
 13 | } d2i;
 14 | #define EXP_A (1048576/M_LN2)
 15 | #define EXP_C 60801
 16 | #define FAST_EXP(y)(d2i.n.i=EXP_A*(y)+(1072693248-EXP_C),d2i.d)
 17 | */
 18 | 
 19 | #ifdef __cplusplus
 20 | #define cast_uint32_t static_cast<uint32_t>
 21 | #else
 22 | #define cast_uint32_t (uint32_t)
 23 | #endif
 24 | static inline float fastpow2 (float p) {
 25 |   float offset = (p < 0) ? 1.0f : 0.0f;
 26 |   float clipp = (p < -126) ? -126.0f : p;
 27 |   int w = clipp;
 28 |   float z = clipp - w + offset;
 29 |   union {uint32_t i; float f;} v = {
 30 |     cast_uint32_t (
 31 |         (1 << 23) * (clipp + 121.2740575f + 27.7280233f /
 32 |                      (4.84252568f - z) - 1.49012907f * z)
 33 |     )
 34 |   };
 35 | 
 36 |   return v.f;
 37 | }
 38 | 
 39 | static inline float FAST_EXP (float p) {
 40 |   return fastpow2 (1.442695040f * p);
 41 | }
 42 | 
 43 | 
 44 | template<class T, class H>
 45 | class LegacyRnnLMDecodable {
 46 |  public:
 47 |   LegacyRnnLMDecodable (H& hash, int i, int h, int o, int d, int m)
 48 |     : h (hash), isize (i), hsize (h), osize (o), order (d), max_order (m) { }
 49 | 
 50 |   double ComputeNet (const T& p, T* t) {
 51 |     vector<double> olayer;
 52 |     olayer.resize (osize, 0.0);
 53 | 
 54 |     for (int j = 0; j < hsize; j++)
 55 |       for (int i = 0; i < hsize; i++)
 56 |         t->hlayer [j] += p.hlayer [i] * syn0 [i + h.vocab_.size () + j * isize];
 57 | 
 58 |     for (int i = 0; i < hsize; i++)
 59 |       if (p.word != -1)
 60 |         t->hlayer [i] += syn0 [p.word + i * (hsize + h.vocab_.size ())];
 61 | 
 62 |     for (int i = 0; i < hsize; i++) {
 63 |       if (t->hlayer [i] > 50)
 64 |         t->hlayer [i] = 50;
 65 |       if (t->hlayer [i] < -50)
 66 |         t->hlayer [i] = -50;
 67 |       t->hlayer [i] = 1 / (1 + FAST_EXP (-t->hlayer [i]));
 68 |     }
 69 | 
 70 |     for (int j = h.vocab_.size (); j < osize; j++)
 71 |       for (int i = 0; i < hsize; i++)
 72 |         olayer [j] += t->hlayer [i] * syn1 [i + j * hsize];
 73 | 
 74 |     // Begin class direct connection activations
 75 |     if (synd.size () > 0) {
 76 |       // Feature hash begin
 77 |       vector<uint64> hash;
 78 |       hash.resize (max_order, 0);
 79 | 
 80 |       for (int i = 0; i < order; i++) {
 81 |         if (i > 0)
 82 |           if (t->history [i - 1] == -1)
 83 |             break;
 84 |         hash [i] = h.primes_[0] * h.primes_[1];
 85 |         for (int j = 1; j <= i; j++)
 86 |           hash [i] +=
 87 |             h.primes_[(i * h.primes_[j] + j) % h.primes_.size ()]
 88 |             * static_cast<uint64>(t->history [j - 1] + 1);
 89 | 
 90 |         hash [i] = hash [i] % (synd.size () / 2);
 91 |       }
 92 |       // Feature hash end
 93 |       for (int i = h.vocab_.size (); i < osize; i++) {
 94 |         for (int j = 0; j < order; j++) {
 95 |           if (hash [j]) {
 96 |             olayer [i] += synd [hash [j]];
 97 |             hash [j]++;
 98 |           } else {
 99 |             break;
100 |           }
101 |         }
102 |       }
103 |     }
104 |     // End class direct connection activations
105 | 
106 |     double sum = 0;
107 |     // Softmax on classes
108 |     for (int i = h.vocab_.size (); i < osize; i++) {
109 |       if (olayer [i] > 50)
110 |         olayer [i] = 50;
111 |       if (olayer [i] < -50)
112 |         olayer [i] = -50;
113 |       double val = FAST_EXP (olayer [i]);
114 |       sum += val;
115 |       olayer [i] = val;
116 |     }
117 |     for (int i = h.vocab_.size (); i < osize; i++)
118 |       olayer [i] /= sum;
119 | 
120 |     // 1->2 word activations
121 |     if (t->word != -1) {
122 |       int begin = h.class_sizes_[h.vocab_[t->word].class_index].begin;
123 |       int end   = h.class_sizes_[h.vocab_[t->word].class_index].end;
124 |       for (int j = begin; j <= end; j++)
125 |         for (int i = 0; i < hsize; i++)
126 |           olayer [j] += t->hlayer [i] * syn1 [i + j * hsize];
127 | 
128 |       // Begin word direct connection activations
129 |       if (synd.size () > 0) {
130 |         // Begin feature hashing
131 |         uint64 hash [max_order];
132 |         for (int i = 0; i < order; i++)
133 |           hash [i] = 0;
134 | 
135 |         for (int i = 0; i < order; i++) {
136 |           if (i > 0)
137 |             if (t->history [i - 1] == -1)
138 |               break;
139 | 
140 |           hash [i] = h.primes_[0] * h.primes_[1]
141 |             * static_cast<uint64> (h.vocab_[t->word].class_index + 1);
142 | 
143 |           for (int j = 1; j <= i; j++)
144 |             hash [i] += h.primes_[(i * h.primes_[j] + j) % h.primes_.size ()]
145 |               * static_cast<uint64> (t->history [j - 1] + 1);
146 | 
147 |           hash [i] = (hash [i] % (synd.size () / 2)) + (synd.size () / 2);
148 |         }
149 |         // End feature hashing
150 | 
151 |         for (int i = begin; i <= end; i++) {
152 |           for (int j = 0; j < order; j++) {
153 |             if (hash [j]) {
154 |               olayer [i] += synd [hash [j]];
155 |               hash [j]++;
156 |               hash [j] = hash [j] % synd.size ();
157 |             } else {
158 |               break;
159 |             }
160 |           }
161 |         }
162 |       }
163 |       // End word direct connection activations
164 | 
165 |       sum = 0.0;
166 |       for (int i = begin; i <= end; i++) {
167 |         if (olayer [i] > 50)
168 |           olayer [i] = 50;
169 |         if (olayer [i] < -50)
170 |           olayer [i] = -50;
171 |         olayer [i] = FAST_EXP (olayer [i]);
172 |         sum += olayer [i];
173 |       }
174 |       for (int i = begin; i <= end; i++)
175 |         olayer [i] /= sum;
176 |     }
177 | 
178 |     return olayer [t->word]
179 |       * olayer [h.vocab_.size () + h.vocab_[t->word].class_index];
180 |   }
181 | 
182 |   // We need the synapses and the vocabulary hash
183 |   H& h;
184 |   int isize;
185 |   int hsize;
186 |   int osize;
187 |   int order;
188 |   int max_order;
189 |   vector<double> syn0;
190 |   vector<double> syn1;
191 |   vector<double> synd;
192 | };
193 | #endif  // SRC_INCLUDE_LEGACYRNNLMDECODABLE_H_
194 | 


--------------------------------------------------------------------------------
/src/include/LegacyRnnLMHash.h:
--------------------------------------------------------------------------------
  1 | #ifndef SRC_INCLUDE_LEGACYRNNLMHASH_H_
  2 | #define SRC_INCLUDE_LEGACYRNNLMHASH_H_
  3 | 
  4 | #include <math.h>
  5 | #include <fst/fstlib.h>
  6 | #include <vector>
  7 | #include <string>
  8 | #include <unordered_map>
  9 | #include <sstream>
 10 | 
 11 | 
 12 | typedef double real;
 13 | 
 14 | struct VocabWord {
 15 |  public:
 16 |   VocabWord () {}
 17 |   explicit VocabWord (std::string word_) : cn (1), word (word_) {}
 18 |   VocabWord (std::string word_, int cn_) : cn (cn_), word (word_) {}
 19 |   int    cn;   // Unigram count
 20 |   std::string word;
 21 |   real   prob;
 22 |   int    class_index;
 23 | };
 24 | 
 25 | struct ClassIndex {
 26 |  public:
 27 |   ClassIndex () : begin(0), end(0) {}
 28 |   int begin;
 29 |   int end;
 30 | };
 31 | 
 32 | class LegacyRnnLMHash {
 33 |  public:
 34 |   explicit LegacyRnnLMHash (int class_size)
 35 |     : class_size_ (class_size), g_delim_("|"), gp_delim_("}") {
 36 |     vocab_hash_.resize (100000000);
 37 |   }
 38 | 
 39 |   LegacyRnnLMHash (int class_size, const string g_delim, const string gp_delim)
 40 |     : class_size_ (class_size), g_delim_(g_delim.c_str ()),
 41 |       gp_delim_(gp_delim.c_str ()) {
 42 |     vocab_hash_.resize (100000000);
 43 |   }
 44 | 
 45 |   static const std::vector<unsigned int> primes_;
 46 | 
 47 |   void Split (const std::string& s, char delim,
 48 |               std::vector<std::string>& elems) {
 49 |     std::stringstream ss (s);
 50 |     std::string item;
 51 |     while (getline (ss, item, delim))
 52 |       elems.push_back (item);
 53 |   }
 54 | 
 55 |   template <typename I>
 56 |   int HashInput (I start, I end) {
 57 |     size_t hash = 0;
 58 |     for (I it = start; it != end; ++it)
 59 |       hash = hash * 237 + isyms.Find (*it);
 60 | 
 61 |     return hash;
 62 |   }
 63 | 
 64 |   void MapToken (string& token) {
 65 |     std::vector<std::string> gp;
 66 |     std::vector<std::string> graphs;
 67 |     // std::vector<std::string> phones;
 68 | 
 69 |     Split (token, *gp_delim_, gp);
 70 |     Split (gp [0], *g_delim_, graphs);
 71 |     // Split (gp [1], *g_delim, phones);
 72 | 
 73 |     size_t hash = 0;
 74 |     for (int i = 0; i < graphs.size (); i++)
 75 |       hash = hash * 237 + isyms.AddSymbol (graphs [i]);
 76 | 
 77 |     if (imap.find (hash) == imap.end ())
 78 |       imap [hash] = std::vector<int> {FindWord (token)};
 79 |     else
 80 |       imap [hash].push_back (FindWord (token));
 81 | 
 82 |     /*
 83 |     if (omap.find (FindWord (token)) == omap.end ())
 84 |       omap [FindWord (token)] = phones;
 85 |     */
 86 |   }
 87 | 
 88 |   int HashWord (std::string& word) const {
 89 |     size_t hash = 0;
 90 |     for (size_t i = 0; i < word.size (); i++)
 91 |       hash = hash * 237 + word[i];
 92 |     hash = hash % vocab_hash_.size ();
 93 |     return hash;
 94 |   }
 95 | 
 96 |   int FindWord (std::string& word) {
 97 |     size_t hash = HashWord (word);
 98 | 
 99 |     if (vocab_hash_[hash] == -1)
100 |       return -1;
101 | 
102 |     if (word.compare (vocab_[vocab_hash_[hash]].word) == 0)
103 |       return vocab_hash_[hash];
104 | 
105 |     for (size_t i = 0; i < vocab_.size (); i++) {
106 |       if (word.compare (vocab_[i].word) == 0) {
107 |         vocab_hash_[hash] = i;
108 |         return i;
109 |       }
110 |     }
111 |     return -1;
112 |   }
113 | 
114 |   int GetWordId (std::string& word) const {
115 |     size_t hash = HashWord (word);
116 |     if (vocab_hash_[hash] == -1)
117 |       return -1;
118 |     return vocab_hash_[hash];
119 |   }
120 | 
121 |   int AddWordToVocab (std::string& word, int cn = 1) {
122 |     vocab_.push_back (VocabWord (word, cn));
123 |     size_t hash = HashWord (word);
124 |     vocab_hash_[hash] = vocab_.size () - 1;
125 |     return vocab_.size () - 1;
126 |   }
127 | 
128 |   void SortVocab () {
129 |     // Just sorts based on Class
130 |     for (int i = 1; i < vocab_.size (); i++) {
131 |       int max = i;
132 |       for (int j = i + 1; j < vocab_.size (); j++)
133 |         if (vocab_[max].cn < vocab_[j].cn)
134 |           max = j;
135 |       VocabWord swap = vocab_[max];
136 |       vocab_[max] = vocab_[i];
137 |       vocab_[i]   = swap;
138 |     }
139 |   }
140 | 
141 |   void SetClasses () {
142 |     double df = 0;
143 |     double dd = 0;
144 |     int     a = 0;
145 |     int     b = 0;
146 | 
147 |     for (int i = 0; i < vocab_.size (); i++)
148 |       b += vocab_[i].cn;
149 |     for (int i = 0; i < vocab_.size (); i++)
150 |       dd += sqrt (vocab_[i].cn / static_cast<double> (b));
151 |     for (int i = 0; i < vocab_.size (); i++) {
152 |       df += sqrt (vocab_[i].cn / static_cast<double> (b)) / dd;
153 |       if (df > 1)
154 |         df = 1;
155 |       if (df > (a + 1) / static_cast<double> (class_size_)) {
156 |         vocab_[i].class_index = a;
157 |         if (a < class_size_ - 1)
158 |           a++;
159 |       } else {
160 |         vocab_[i].class_index = a;
161 |       }
162 |     }
163 | 
164 |     class_sizes_.resize (class_size_);
165 |     int c = 0;
166 |     for (int i = 0; i < vocab_.size (); i++) {
167 |       if (i == 0) {
168 |         class_sizes_[c].begin = i;
169 |       }
170 | 
171 |       if (i + 1 == vocab_.size ()) {
172 |         class_sizes_[c].end = i;
173 |       } else if (vocab_[i].class_index < vocab_[i + 1].class_index) {
174 |         class_sizes_[c].end = i;
175 |         c++;
176 |         class_sizes_[c].begin = i + 1;
177 |       }
178 |     }
179 |   }
180 | 
181 |   std::vector<size_t> vocab_hash_;
182 |   std::vector<VocabWord> vocab_;
183 |   std::vector<ClassIndex> class_sizes_;
184 |   std::unordered_map<int, std::vector<int> > imap;
185 |   // std::unordered_map<int, std::vector<int> > omap;
186 |   fst::SymbolTable isyms;
187 |   int class_size_;
188 |   const char* g_delim_;
189 |   const char* gp_delim_;
190 | };
191 | 
// Table of primes exposed as LegacyRnnLMHash::primes_.
// NOTE(review): this is a non-inline definition inside a header, so
//  including the header from more than one translation unit would produce
//  duplicate definitions - confirm it is only included once per binary.
const std::vector<unsigned int> LegacyRnnLMHash::primes_ = {
  108641969, 116049371, 125925907, 133333309,
  145678979, 175308587, 197530793, 234567803,
  251851741, 264197411, 330864029, 399999781,
  407407183, 459258997, 479012069, 545678687,
  560493491, 607407037, 629629243, 656789717,
  716048933, 718518067, 725925469, 733332871,
  753085943, 755555077, 782715551, 790122953,
  812345159, 814814293, 893826581, 923456189,
  940740127, 953085797, 985184539, 990122807
};
203 | 
204 | // const char* LegacyRnnLMHash::g_delim  = "|";
205 | // const char* LegacyRnnLMHash::gp_delim = "}";
206 | 
207 | #endif  // SRC_INCLUDE_LEGACYRNNLMHASH_H_
208 | 


--------------------------------------------------------------------------------
/src/include/LegacyRnnLMReader.h:
--------------------------------------------------------------------------------
 1 | #ifndef SRC_INCLUDE_LEGACYRNNLMREADER_H_
 2 | #define SRC_INCLUDE_LEGACYRNNLMREADER_H_
 3 | #include <string>
 4 | #include "./rnnlmlib.h"
 5 | using std::string;
 6 | 
 7 | template<class D, class H>
 8 | class LegacyRnnLMReader {
 9 |  public:
10 |   typedef D Decodable;
11 |   typedef H Hasher;
12 | 
13 |   explicit LegacyRnnLMReader (const string& rnnlm_file) {
14 |     srand (1);
15 |     // We don't actually need or use any of this
16 |     rnnlm_.setLambda (0.75);
17 |     rnnlm_.setRegularization (0.0000001);
18 |     rnnlm_.setDynamic (false);
19 |     rnnlm_.setRnnLMFile (const_cast<char*> (rnnlm_file.c_str ()));
20 |     rnnlm_.setRandSeed (1);
21 |     rnnlm_.useLMProb (false);
22 |     rnnlm_.setDebugMode (1);
23 |     // This will actually load the thing
24 |     rnnlm_.restoreNet ();
25 |   }
26 | 
27 |   Decodable CopyLegacyRnnLM (Hasher& h, int max_order = 5) {
28 |     // Copy static data that can be shared by all tokens
29 |     Decodable d (h, rnnlm_.layer0_size, rnnlm_.layer1_size,
30 |                  rnnlm_.layer2_size, rnnlm_.direct_order,
31 |                  max_order);
32 |     for (int i = 0; i < rnnlm_.layer0_size * rnnlm_.layer1_size; i++)
33 |       d.syn0.push_back (static_cast<double> (rnnlm_.syn0 [i].weight));
34 | 
35 |     for (int i = 0; i < rnnlm_.layer1_size * rnnlm_.layer2_size; i++)
36 |       d.syn1.push_back (static_cast<double> (rnnlm_.syn1 [i].weight));
37 | 
38 |     for (int i = 0; i < rnnlm_.direct_size; i++)
39 |       d.synd.push_back (static_cast<double> (rnnlm_.syn_d [i]));
40 | 
41 |     return d;
42 |   }
43 | 
44 |   Hasher CopyVocabHash (const string g_delim, const string gp_delim) {
45 |     Hasher h (rnnlm_.class_size, g_delim, gp_delim);
46 |     for (int i = 0; i < rnnlm_.vocab_size; i++) {
47 |       string word = rnnlm_.vocab [i].word;
48 |       h.AddWordToVocab (word, rnnlm_.vocab [i].cn);
49 |     }
50 |     h.SortVocab ();
51 |     h.SetClasses ();
52 |     for (int i = 0; i < h.vocab_.size (); i++)
53 |       h.MapToken (h.vocab_[i].word);
54 | 
55 |     return h;
56 |   }
57 | 
58 |   Hasher CopyVocabHash () {
59 |     Hasher h (rnnlm_.class_size);
60 |     for (int i = 0; i < rnnlm_.vocab_size; i++) {
61 |       string word = rnnlm_.vocab [i].word;
62 |       h.AddWordToVocab (word, rnnlm_.vocab [i].cn);
63 |     }
64 |     h.SortVocab ();
65 |     h.SetClasses ();
66 |     for (int i = 0; i < h.vocab_.size (); i++)
67 |       h.MapToken (h.vocab_[i].word);
68 | 
69 |     return h;
70 |   }
71 | 
72 |  private:
73 |   CRnnLM rnnlm_;  // 1The actual model
74 | };
75 | #endif  // SRC_INCLUDE_LEGACYRNNLMREADER_H_
76 | 


--------------------------------------------------------------------------------
/src/include/M2MFstAligner.h:
--------------------------------------------------------------------------------
  1 | #ifndef SRC_INCLUDE_M2MFSTALIGNER_H_
  2 | #define SRC_INCLUDE_M2MFSTALIGNER_H_
  3 | /*
  4 |  M2MFstAligner.hpp
  5 | 
  6 |  Copyright (c) [2012-], Josef Robert Novak
  7 |  All rights reserved.
  8 | 
  9 |  Redistribution and use in source and binary forms, with or without
 10 |   modification, are permitted #provided that the following conditions
 11 |   are met:
 12 | 
 13 |   * Redistributions of source code must retain the above copyright
 14 |     notice, this list of conditions and the following disclaimer.
 15 |   * Redistributions in binary form must reproduce the above
 16 |     copyright notice, this list of #conditions and the following
 17 |     disclaimer in the documentation and/or other materials provided
 18 |     with the distribution.
 19 | 
 20 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 23 |  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 24 |  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 25 |  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 26 |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 27 |  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 28 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 29 |  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 30 |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 31 |  OF THE POSSIBILITY OF SUCH DAMAGE.
 32 | */
 33 | #include <fst/fstlib.h>
 34 | #include <fst/extensions/far/far.h>
 35 | #include <map>
 36 | #include <string>
 37 | #include <vector>
 38 | #include <set>
 39 | #include "./util.h"
 40 | using namespace std;
 41 | 
 42 | 
namespace fst{
class M2MFstAligner {
  /*
    Read in pairs of sequences of the form SEQ1 and SEQ2 and
    transform them into an FST that encodes all possible
    alignments between the symbols in the two sequences.
    Note that this may include a combination of multi-symbol
    subsequences depending on user specifications.

    This is achieved by simply generating the entire alignment
    graph during a single nested loop through the two input
    sequences that are to be aligned.

    The user may optionally specify whether to allow deletions
    for SEQ1 or SEQ2, as well as a maximum subsequence length
    for each sequence.

    This class does not implement any lattice pruning or printing
    methods.  A combination of the LatticePruner and FstPathFinder
    classes may be used to achieve this a-la phonetisaurus-align.cpp.

    Only the interface is declared here; the method bodies live in a
    separate translation unit.
  */
 public:
  // Basics declarations
  // Whether deletions are allowed for SEQ1 / SEQ2
  bool seq1_del;
  bool seq2_del;
  // Maximum subsequence length for SEQ1 / SEQ2
  unsigned int seq1_max;
  unsigned int seq2_max;
  // NOTE(review): presumably the separators used inside SEQ1 / SEQ2
  //  subsequences and between the SEQ1 and SEQ2 halves of an alignment
  //  unit - confirm against the implementation.
  string seq1_sep;
  string seq2_sep;
  string s1s2_sep;
  // NOTE(review): presumably the epsilon and skip symbol strings - confirm.
  string eps;
  string skip;
  bool penalize;
  bool penalize_em;
  bool restrict;
  bool grow;

  // vector<LogWeight> alpha, beta;
  // This will be used during decoding to clean the paths
  set<int> skipSeqs;
  // OpenFst stuff
  // These will be overwritten after each FST construction
  vector<VectorFst<LogArc> > fsas;

  // This will be maintained for the life of object
  // These symbol tables will be maintained entire life of
  //  the object.  This will ensure that any resulting 'corpus'
  //  shares the same symbol tables.
  SymbolTable *isyms;
  // Per-label weights of the current and the previous alignment model
  map<LogArc::Label, LogWeight> alignment_model;
  map<LogArc::Label, LogWeight> prev_alignment_model;
  LabelData penalties;
  LogWeight total;
  LogWeight prevTotal;

  // Constructors
  M2MFstAligner ();
  // Train from scratch using a dictionary
  M2MFstAligner (bool seq1_del, bool seq2_del, unsigned int seq1_max,
                 unsigned int seq2_max,
                 string seq1_sep, string seq2_sep, string s1s2_sep,
                 string eps, string _skip, bool _penalize,
                 bool penalize_em, bool restrict, bool grow);
  // We've already got a model to go on
  M2MFstAligner (string model_file, bool penalize, bool penalize_em,
                 bool restrict);

  // Write an aligner model to disk.  Critical info is stored in
  //  the symbol table so that it can be restored when the model is loaded.
  void write_model (string model_name);

  // Transform a sequence pair into an equivalent multiple-to-multiple FST,
  //  encoding all possible alignments between the two sequences
  void Sequences2FST (VectorFst<LogArc>* fst, vector<string>* seq1,
                            vector<string>* seq2);
  // Same, with explicit s1m / s2m values (presumably per-call maximum
  //  subsequence lengths - confirm).
  void Sequences2FST (VectorFst<LogArc>* fst, int s1m, int s2m,
                      vector<string>* seq1, vector<string>* seq2);
  void Sequences2FSTNoInit (VectorFst<LogArc>* fst, vector<string>* seq1,
                            vector<string>* seq2);

  // Initialize all of the training data
  void entry2alignfst (vector<string> seq1, vector<string> seq2);
  void entry2alignfstnoinit (vector<string> seq1, vector<string> seq2,
                             int nbest, string lattice = "");
  void _conditional_max (bool x_given_y);
  // The expectation routines
  void expectation ();

  // The maximization routine.  Returns the change since the last iteration
  float maximization (bool lastiter);

  // Precompute the label and subsequence lengths for all possible alignment
  //  units; this helps speed up the penalization and decoding routines.
  void _compute_penalties (LogArc::Label label, int lhs, int rhs,
                           bool lhsE, bool rhsE);
};
}  // namespace fst
140 | #endif  // SRC_INCLUDE_M2MFSTALIGNER_H_
141 | 


--------------------------------------------------------------------------------
/src/include/PhonetisaurusScript.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  PhonetisaurusPy.h
  3 | 
  4 |  Copyright (c) [2012-], Josef Robert Novak
  5 |  All rights reserved.
  6 | 
  7 |  Redistribution and use in source and binary forms, with or without
  8 |   modification, are permitted #provided that the following conditions
  9 |   are met:
 10 | 
 11 |   * Redistributions of source code must retain the above copyright
 12 |     notice, this list of conditions and the following disclaimer.
 13 |     * Redistributions in binary form must reproduce the above
 14 |     copyright notice, this list of #conditions and the following
 15 |     disclaimer in the documentation and/or other materials provided
 16 |     with the distribution.
 17 | 
 18 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 21 |  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 22 |  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 23 |  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 24 |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 25 |  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 26 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 27 |  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 28 |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 29 |  OF THE POSSIBILITY OF SUCH DAMAGE.
 30 | */
 31 | // \file
 32 | // This implements the scripting interface for the FST-based
 33 | // decoder.  The associated classes are suitable for
 34 | // construction of command-line utilities and bindings for
 35 | // scripting languages such as Python.
 36 | //
 37 | #ifndef SRC_INCLUDE_PHONETISAURUSSCRIPT_H_
 38 | #define SRC_INCLUDE_PHONETISAURUSSCRIPT_H_
 39 | #include "PhonetisaurusRex.h"
 40 | #include <sys/types.h>
 41 | #include <sys/stat.h>
 42 | #include <string>
 43 | #include <vector>
 44 | /*! \struct PathData
 45 |     \brief Response data.
 46 | 
 47 |     The PathData structure is used to encapsulate
 48 |     a single FST G2P result.
 49 | */
 50 | struct PathData {
 51 |   PathData () {}
 52 |   PathData (float PathWeight_, const vector<float>& PathWeights_,
 53 |        const vector<int>& ILabels_, const vector<int>& OLabels_,
 54 |        const vector<int>& Uniques_)
 55 |     : PathWeight (PathWeight_), PathWeights (PathWeights_),
 56 |       ILabels (ILabels_), OLabels (OLabels_), Uniques(Uniques_) {}
 57 | 
 58 |   float PathWeight;
 59 |   vector<float> PathWeights;
 60 |   vector<int>   ILabels;
 61 |   vector<int>   OLabels;
 62 |   // Contains only 'interesting' phone labels
 63 |   vector<int>   Uniques;
 64 | };
 65 | 
 66 | /*! \class PhonetisaurusScript
 67 |     \brief A wrapper class encapsulating the FST G2P decoder.
 68 | 
 69 |     A wrapper class for the FST G2P decoder.  Suitable for
 70 |     incorporation into commandline binaries and bindings
 71 |     for various scripting languages.
 72 | */
 73 | class PhonetisaurusScript {
 74 |  private:
 75 |   void normalizeModel() {
 76 |     ArcSort (&model_, ILabelCompare<StdArc> ());
 77 |     isyms_ = model_.InputSymbols ();
 78 |     osyms_ = model_.OutputSymbols ();
 79 |     imax_  = LoadClusters (isyms_, &imap_, &invimap_);
 80 |     omax_  = LoadClusters (osyms_, &omap_, &invomap_);
 81 |     veto_set_.insert (0);
 82 |     veto_set_.insert (1);
 83 |     veto_set_.insert (2);
 84 |   }
 85 |  public:
 86 |   explicit PhonetisaurusScript (const VectorFst<StdArc> model, string delim="") : delim_(delim) {
 87 |     model_ = model;
 88 |     normalizeModel();
 89 |   }
 90 | 
 91 |   explicit PhonetisaurusScript(string model, string delim="") : delim_(delim) {
 92 |     struct stat buffer;
 93 |     if (!(stat (model.c_str(), &buffer) == 0))
 94 |       throw std::exception();
 95 | 
 96 |     // this is solving the memory leak problem
 97 |     VectorFst<StdArc>* model_temp{nullptr};
 98 |     model_temp = (VectorFst<StdArc>::Read(model));
 99 |     if(!model_temp) { throw std::exception(); }
100 |     model_ = *model_temp;
101 |     delete model_temp;
102 | 
103 |     normalizeModel();
104 |   }
105 | 
106 |   // The actual phoneticizer routine
107 |   vector<PathData> Phoneticize (const string& word, int nbest = 1,
108 |                       int beam = 10000, float threshold = 99,
109 |                       bool write_fsts = false,
110 |                       bool accumulate = false,
111 |                       double pmass = 99.0) {
112 |     VectorFst<StdArc>* fst = new VectorFst<StdArc> ();
113 |     vector<int> entry = tokenize2ints (
114 |                           const_cast<string*> (&word),
115 |                           &delim_, isyms_
116 |                         );
117 |     Entry2FSA (entry, fst, imax_, invimap_);
118 | 
119 |     fst->SetInputSymbols (isyms_);
120 |     fst->SetOutputSymbols (isyms_);
121 | 
122 |     // Useful for debugging; print the input word machine
123 |     if (write_fsts)
124 |       fst->Write (word + ".fst");
125 | 
126 |     VectorFst<StdArc> ofst;
127 | 
128 |     StdArc::Weight weight_threshold = threshold;
129 |     StdArc::StateId state_threshold = kNoStateId;
130 |     AnyArcFilter<StdArc> arc_filter;
131 |     vector<StdArc::Weight> distance;
132 | 
133 |     VectorFst<StdArc>* ifst = new VectorFst<StdArc>();
134 |     Compose(*fst, model_, ifst);
135 | 
136 |     // Useful for debugging; print the g2p lattice
137 |     if (write_fsts)
138 |       ifst->Write (word+".lat.fst");
139 | 
140 |     AutoQueue<StdArc::StateId> state_queue (*ifst, &distance, arc_filter);
141 | 
142 |     M2MPathFilter<StdArc> path_filter (omap_, veto_set_);
143 | 
144 |     ShortestPathOptions<StdArc, AutoQueue<StdArc::StateId>,
145 |                       AnyArcFilter<StdArc> >
146 |       opts (&state_queue, arc_filter, nbest, false, false,
147 |             kDelta, false, weight_threshold,
148 |             state_threshold);
149 | 
150 |     ShortestPathSpecialized (*ifst, &ofst, &distance,
151 |                              &path_filter, beam, opts, accumulate);
152 | 
153 |     vector<PathData> paths;
154 |     float total = 99.0;
155 |     if (pmass < 99.0) {
156 |       for (size_t i = 0; i < path_filter.ordered_paths.size(); i++) {
157 |         const vector<int>& u = path_filter.ordered_paths [i];
158 |         const Path& orig = path_filter.path_map [u];
159 |         total = Plus (LogWeight (total), LogWeight (orig.PathWeight)).Value ();
160 |       }
161 |     }
162 | 
163 |     LogWeight nbest_pmass = 99.0;
164 |     for (size_t i = 0; i < path_filter.ordered_paths.size(); i++) {
165 |       const vector<int>& u = path_filter.ordered_paths [i];
166 |       const Path& orig = path_filter.path_map [u];
167 |       float pweight = orig.PathWeight;
168 |       if (pmass < 99.0) {
169 |         pweight = pweight - total;
170 |         nbest_pmass = Plus (
171 |                         LogWeight (nbest_pmass),
172 |                         LogWeight (pweight)
173 |                       ).Value ();
174 |       }
175 | 
176 |       PathData path = PathData (
177 |                   pweight, orig.PathWeights,
178 |                   orig.ILabels, orig.OLabels, orig.unique_olabels
179 |                 );
180 |       paths.push_back (path);
181 | 
182 |       // We are greedy with this, in order to ensure that if pmass =~ -log (.8),
183 |       // and we have h1 = -log (.5), and h2 = -log (.4) that we get both.
184 |       if (pmass < 99.0 && nbest_pmass.Value () < pmass)
185 |         break;
186 |     }
187 | 
188 |     // Make sure that we clean up
189 |     delete fst;
190 |     delete ifst;
191 |     return paths;
192 |   }
193 | 
194 |   // Helper functions for the bindings
195 |   string FindIsym (int symbol_id) {
196 |     return isyms_->Find (symbol_id);
197 |   }
198 | 
199 |   int FindIsym (const string& symbol) {
200 |     return isyms_->Find (symbol);
201 |   }
202 | 
203 |   string FindOsym (int symbol_id) {
204 |     return osyms_->Find (symbol_id);
205 |   }
206 | 
207 |   int FindOsym (const string& symbol) {
208 |     return osyms_->Find (symbol);
209 |   }
210 | 
211 |   const SymbolTable* isyms_;
212 |   const SymbolTable* osyms_;
213 | 
214 |  private:
215 |   VectorFst<StdArc> model_;
216 |   SymbolMap12M imap_, omap_;
217 |   SymbolMapM21 invimap_, invomap_;
218 |   int imax_;
219 |   int omax_;
220 |   VetoSet veto_set_;
221 |   string delim_;
222 | };
223 | #endif  // SRC_INCLUDE_PHONETISAURUSSCRIPT_H_
224 | 


--------------------------------------------------------------------------------
/src/include/RnnLMDecoder.h:
--------------------------------------------------------------------------------
  1 | #ifndef SRC_INCLUDE_RNNLMDECODER_H_
  2 | #define SRC_INCLUDE_RNNLMDECODER_H_
  3 | 
  4 | #include <fst/fstlib.h>
  5 | #include <include/LegacyRnnLMDecodable.h>
  6 | #include <include/LegacyRnnLMHash.h>
  7 | #include <include/util.h>
  8 | #include <string>
  9 | #include <vector>
 10 | #include <unordered_set>
 11 | 
 12 | using fst::VectorFst;
 13 | using fst::ArcIterator;
 14 | using fst::StateIterator;
 15 | using fst::StdArc;
 16 | using fst::Heap;
 17 | using std::vector;
 18 | using std::unordered_set;
 19 | 
 20 | 
 21 | class Token {
 22 |  public:
 23 |   Token (int hsize, int max_order)
 24 |     : word (0), weight (0.0), total (0.0),
 25 |       g (0.0), prev (NULL), state (0), key (-1) {
 26 |     hlayer.resize (hsize, 1.0);
 27 |     history.resize (max_order, 0);
 28 | 
 29 |     HashHistory ();
 30 |   }
 31 | 
 32 |   Token (Token* tok, int w, int s)
 33 |     : word (w), weight (0.0), total (0.0),
 34 |       g (0.0), prev (tok), state (s), key (-1) {
 35 |     // Copy an existing token and update the
 36 |     //  various layers as needed
 37 |     hlayer.resize (tok->hlayer.size(), 0.0);
 38 |     history.resize (tok->history.size (), 0);
 39 | 
 40 |     // Would it be more efficient to perform the hash
 41 |     //  by iterating back throug the parent tokens?
 42 |     for (int i = tok->history.size () - 1; i > 0; i--)
 43 |       history [i] = tok->history [i - 1];
 44 |     history [0] = tok->word;
 45 | 
 46 |     HashHistory ();
 47 |   }
 48 | 
 49 |   void HashHistory () {
 50 |     hhash = state * 7853;
 51 |     for (int i = 0; i < history.size (); i++)
 52 |       hhash = hhash * 7877 + history [i];
 53 |   }
 54 | 
 55 |   int word;
 56 |   mutable double weight;
 57 |   mutable double total;
 58 |   mutable double g;
 59 |   mutable Token* prev;
 60 |   int state;
 61 |   mutable int key;
 62 |   mutable vector<double> hlayer;
 63 |   mutable vector<int> history;
 64 |   size_t hhash;
 65 | };
 66 | 
 67 | class TokenCompare {
 68 |  public:
 69 |   bool operator () (const Token& t1, const Token& t2) const {
 70 |     return (t1.state == t2.state &&
 71 |             t1.word == t2.word &&
 72 |             t1.hhash == t2.hhash);
 73 |     /*
 74 |      return (t1.state == t2.state &&
 75 |             t1.word == t2.word);
 76 |     */
 77 |   }
 78 | };
 79 | 
 80 | class TokenHash {
 81 |  public:
 82 |   size_t operator () (const Token& t) const {
 83 |     return t.state * kPrime0 + t.word * kPrime1 + t.hhash * kPrime2;
 84 |     // return t.state * kPrime0 + t.word * kPrime1;
 85 |   }
 86 |  private:
 87 |   static const size_t kPrime0;
 88 |   static const size_t kPrime1;
 89 |   static const size_t kPrime2;
 90 | };
 91 | const size_t TokenHash::kPrime0 = 7853;
 92 | const size_t TokenHash::kPrime1 = 7867;
 93 | const size_t TokenHash::kPrime2 = 7873;
 94 | 
 95 | 
 96 | class TokenPointerCompare {
 97 |  public:
 98 |   bool operator () (const Token* t1, const Token* t2) const {
 99 |     return (t1->g < t2->g);
100 |   }
101 | };
102 | 
103 | class Chunk {
104 |  public:
105 |   Chunk (int word, double cost, double total)
106 |     : w (word), c (cost), t (total) { }
107 |   int w;
108 |   double c;
109 |   double t;
110 |   template<class H>
111 |   vector<string> Tokenize (char gpdelim, char gdelim, H& h,
112 |                            bool graphemes = false) const {
113 |     vector<string> gp_elems;
114 |     Split (h.vocab_[w].word, gpdelim, gp_elems);
115 |     vector<string> elems;
116 |     if (graphemes == true)
117 |       Split (gp_elems [0], gdelim, elems);
118 |     else if (gp_elems.size () == 2)
119 |       Split (gp_elems [1], gdelim, elems);
120 |     return elems;
121 |   }
122 | };
123 | 
// Flattened decoder output for one word: the word itself plus parallel
// lists of hypothesis scores and pronunciation strings.
class SimpleResult {
 public:
  // Empty result; filled in field by field by the decoder.
  SimpleResult () { }

  // Fully specified result; all three arguments are copied.
  SimpleResult (string word_, vector<double> scores_,
                vector<string> pronunciations_)
    : word (word_),
      scores (scores_),
      pronunciations (pronunciations_) { }

  string word;
  vector<double> scores;
  vector<string> pronunciations;
};
136 | 
137 | /* Standalone function for convenience */
138 | template<class H>
139 | VectorFst<StdArc> WordToRnnLMFst (const vector<string>& word, H& h) {
140 |   VectorFst<StdArc> fst;
141 |   fst.AddState ();
142 |   fst.SetStart (0);
143 |   for (int i = 0; i < word.size (); i++) {
144 |     int hash = h.HashInput (word.begin () + i,
145 |                               word.begin () + i + 1);
146 |     fst.AddState ();
147 |     fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One(), i + 1));
148 |   }
149 | 
150 |   for (int i = 0; i < word.size (); i++) {
151 |     for (int j = 2; j <= 3; j++) {
152 |       if (i + j <= word.size ()) {
153 |         int hash = h.HashInput (word.begin () + i, word.begin () + i + j);
154 |         if (h.imap.find (hash) != h.imap.end ())
155 |           fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One (), i + j));
156 |       }
157 |     }
158 |   }
159 |   fst.SetFinal (word.size (), StdArc::Weight::One ());
160 | 
161 |   return fst;
162 | }
163 | 
164 | template <class D>
165 | class RnnLMDecoder {
166 |  public:
167 |   typedef D Decodable;
168 |   typedef vector<vector<Chunk> > RawResults;
169 |   typedef Heap<Token*, TokenPointerCompare> Queue;
170 |   typedef unordered_set<Token, TokenHash, TokenCompare> TokenSet;
171 | 
172 |   explicit RnnLMDecoder (Decodable& decodable)
173 |     : d (decodable) { }
174 | 
175 |   double Heuristic (int nstate, int nstates, double hcost) {
176 |     int factor = nstates - nstate - 1;
177 |     if (factor > 0)
178 |       return factor * hcost;
179 |     return 0.0;
180 |   }
181 | 
182 |   VectorFst<StdArc> WordToRnnLMFst (const vector<string>& word) {
183 |     VectorFst<StdArc> fst;
184 |     fst.AddState ();
185 |     fst.SetStart (0);
186 |     for (int i = 0; i < word.size (); i++) {
187 |       int hash = d.h.HashInput (word.begin () + i,
188 |                               word.begin () + i + 1);
189 |       fst.AddState ();
190 |       fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One(), i + 1));
191 |     }
192 | 
193 |     for (int i = 0; i < word.size (); i++) {
194 |       for (int j = 2; j <= 3; j++) {
195 |         if (i + j <= word.size ()) {
196 |           int hash = d.h.HashInput (word.begin () + i, word.begin () + i + j);
197 |           if (d.h.imap.find (hash) != d.h.imap.end ())
198 |             fst.AddArc (i, StdArc (hash, hash, StdArc::Weight::One (), i + j));
199 |         }
200 |       }
201 |     }
202 |     fst.SetFinal (word.size (), StdArc::Weight::One ());
203 | 
204 |     return fst;
205 |   }
206 | 
207 |   SimpleResult Decode (const vector<string>& word, int beam, int kMax,
208 |                      int nbest, double thresh, const string& gpdelim,
209 |                      const string& gdelim, const string& skip) {
210 |     RawResults raw_results = DecodeRaw (word, beam, kMax, nbest, thresh);
211 |     SimpleResult simple_result;
212 |     stringstream word_ss;
213 |     for (int i = 0; i < word.size (); i++)
214 |       if (i != word.size () - 1)
215 |         word_ss << word [i];
216 |     simple_result.word = word_ss.str ();
217 | 
218 |     for (int i = 0; i < raw_results.size (); i++) {
219 |       const vector<Chunk>& result = raw_results [i];
220 |       stringstream pronunciation_ss;
221 |       for (vector<Chunk>::const_iterator it = result.begin ();
222 |            it != result.end (); ++it) {
223 |         vector<string> chunk_vec = \
224 |           it->Tokenize<LegacyRnnLMHash> (static_cast<char>(*gpdelim.c_str ()),
225 |                                          static_cast<char>(*gdelim.c_str ()),
226 |                                          d.h);
227 |         for (int j = 0; j < chunk_vec.size (); j++) {
228 |           if (chunk_vec [j].compare (skip) != 0)
229 |             pronunciation_ss << chunk_vec [j];
230 |           else
231 |             continue;
232 | 
233 |           if (!(it == result.end () && j != chunk_vec.size () - 1))
234 |             pronunciation_ss << " ";
235 |         }
236 |         if (it+1 == result.end ())
237 |           simple_result.scores.push_back (it->t);
238 |       }
239 |       simple_result.pronunciations.push_back (pronunciation_ss.str ());
240 |     }
241 | 
242 |     return simple_result;
243 |   }
244 | 
245 |   RawResults DecodeRaw (const vector<string>& word, int beam, int kMax,
246 |                         int nbest, double thresh = 0.0) {
247 |     VectorFst<StdArc> fst = WordToRnnLMFst (word);
248 |     for (int i = 0; i < sQueue.size (); i++)
249 |       sQueue [i].Clear ();
250 |     sQueue.resize (fst.NumStates () + 1);
251 | 
252 |     Initialize ();
253 |     int n = 0;
254 |     for (StateIterator<VectorFst<StdArc> > siter (fst);
255 |          !siter.Done(); siter.Next ()) {
256 |       int s = siter.Value ();
257 |       int k = 0;
258 |       while (!sQueue [s].Empty () && k < kMax && n < nbest) {
259 |         Token* top = sQueue [s].Pop ();
260 |         if (fst.Final (top->state) != StdArc::Weight::Zero ()) {
261 |           // Token* a = (Token*)&(*top);
262 |           Token* a = reinterpret_cast<Token*>(top);
263 |           if (n > 0 && thresh > 0.0)
264 |             if (a->total - results [0][results [0].size () - 1].t > thresh)
265 |               break;
266 | 
267 |           vector<Chunk> result;
268 |           while (a->prev != NULL) {
269 |             result.push_back (Chunk (a->word, a->weight, a->total));
270 |             a = reinterpret_cast<Token*> (a->prev);
271 |           }
272 |           reverse (result.begin (), result.end ());
273 |           results.push_back (result);
274 |           n++;
275 |           continue;
276 |         }
277 | 
278 |         for (ArcIterator<VectorFst<StdArc> > aiter (fst, top->state);
279 |              !aiter.Done (); aiter.Next ()) {
280 |           const StdArc& arc = aiter.Value ();
281 |           const vector<int>& map = d.h.imap [arc.ilabel];
282 | 
283 |           for (int i = 0; i < map.size (); i++) {
284 |             Token ntoken (reinterpret_cast<Token*>(top), map [i],
285 |                           arc.nextstate);
286 |             ntoken.weight = -log (d.ComputeNet ((*top), &ntoken));
287 |             if (ntoken.weight > beam)
288 |               continue;
289 | 
290 |             ntoken.total += top->total + ntoken.weight;
291 |             // Heuristic here if we use one (we don't)
292 |             ntoken.g = ntoken.total;
293 | 
294 |             TokenSet::iterator niterator = pool.find (ntoken);
295 | 
296 |             if (niterator == pool.end ()) {
297 |               pool.insert (ntoken);
298 |               Token* npointer = (Token*)&(*pool.find (ntoken));
299 |               sQueue [arc.nextstate].Insert (npointer);
300 |             } else {
301 |               if (ntoken.g < niterator->g) {
302 |                 niterator->weight  = ntoken.weight;
303 |                 niterator->total   = ntoken.total;
304 |                 niterator->prev    = ntoken.prev;
305 |                 niterator->history = ntoken.history;
306 |                 niterator->g       = ntoken.g;
307 |                 niterator->hlayer  = ntoken.hlayer;
308 |                 sQueue [arc.nextstate].Insert ((Token*)&(*niterator));
309 |               }
310 |             }
311 |           }
312 |         }
313 |         k++;
314 |       }
315 |     }
316 |     return results;
317 |   }
318 | 
319 |   RawResults  results;
320 | 
321 | 
322 |  private:
323 |   void Initialize () {
324 |     pool.clear ();
325 |     results.clear ();
326 | 
327 |     Token start (d.hsize, d.max_order);
328 |     pool.insert (start);
329 |     TokenSet::iterator prev = pool.find (start);
330 |     prev->key = sQueue [0].Insert (reinterpret_cast<Token*>(&prev));
331 |     return;
332 |   }
333 | 
334 |   Decodable& d;
335 |   vector<Queue> sQueue;
336 |   TokenSet pool;
337 | };
338 | #endif  // SRC_INCLUDE_RNNLMDECODER_H_
339 | 


--------------------------------------------------------------------------------
/src/include/RnnLMPy.h:
--------------------------------------------------------------------------------
// RnnLMPy.h
  2 | //
  3 | // Copyright (c) [2013-], Yandex, LLC
  4 | // Author: jorono@yandex-team.ru (Josef Robert Novak)
  5 | // All rights reserved.
  6 | /*
  7 |    Redistribution and use in source and binary forms, with or without
  8 |    modification, are permitted #provided that the following conditions
  9 |    are met:
 10 | 
 11 |    * Redistributions of source code must retain the above copyright
 12 |    notice, this list of conditions and the following disclaimer.
 13 |    * Redistributions in binary form must reproduce the above
 14 |    copyright notice, this list of #conditions and the following
 15 |    disclaimer in the documentation and/or other materials provided
 16 |    with the distribution.
 17 | 
 18 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 |    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 |    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 21 |    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 22 |    COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 23 |    INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 24 |    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 25 |    SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 26 |    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 27 |    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 28 |    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 29 |    OF THE POSSIBILITY OF SUCH DAMAGE.
 30 | *
 31 | */
 32 | /// \file
 33 | /// Python bindings for RnnLM.  These only correspond
 34 | /// to basic evaluation functions, not training. By default
 35 | /// the evaluations utilizes the -independent convention from
 36 | /// the original rnnlm tool.  This is all we are interested in
 37 | /// for G2P evaluations.
 38 | #ifndef SRC_INCLUDE_RNNLMPY_H_
 39 | #define SRC_INCLUDE_RNNLMPY_H_
 40 | 
 41 | #include <fst/fstlib.h>
 42 | #include <string>
 43 | #include <vector>
 44 | #include "./rnnlmlib.h"
 45 | 
 46 | using namespace fst;
 47 | 
 48 | typedef struct UttResult {
 49 |   UttResult () : sent_prob(0.0) {}
 50 |   double sent_prob;
 51 |   vector<double> word_probs;
 52 |   vector<string> words;
 53 | } UttResult;
 54 | 
 55 | class RnnLMPy {
 56 |  public:
 57 |   explicit RnnLMPy (string rnnlm_file) {
 58 |     srand (1);
 59 |     rnnlm_.setLambda (0.75);
 60 |     rnnlm_.setRegularization (0.0000001);
 61 |     rnnlm_.setDynamic (false);
 62 |     rnnlm_.setRnnLMFile (const_cast<char*> (rnnlm_file.c_str()));
 63 |     rnnlm_.setRandSeed (1);
 64 |     rnnlm_.useLMProb (false);
 65 |     rnnlm_.setDebugMode (1);
 66 |     rnnlm_.restoreNet ();
 67 |   }
 68 | 
 69 |   vector<int> GetJointVocab (string& token) {
 70 |     return rnnlm_.SearchJointVocab (token);
 71 |   }
 72 | 
 73 |   string GetString (int id) {
 74 |     return rnnlm_.token_map[id];
 75 |   }
 76 | 
 77 |   UttResult EvaluateSentence (vector<string> words) {
 78 |     /*
 79 |       Note that the user is responsible for explicitly
 80 |       providing the sentence-end token in the words vector!
 81 |     */
 82 |     int a, word, last_word;
 83 |     UttResult result;
 84 |     string delim = "}";
 85 | 
 86 |     last_word = 0;
 87 |     rnnlm_.copyHiddenLayerToInput ();
 88 |     if (rnnlm_.bptt > 0) {
 89 |       for (a = 0; a < rnnlm_.bptt + rnnlm_.bptt_block; a++)
 90 |         rnnlm_.bptt_history[a] = 0;
 91 |     }
 92 |     for (a = 0; a < MAX_NGRAM_ORDER; a++)
 93 |       rnnlm_.history[a] = 0;
 94 |     rnnlm_.netReset();
 95 | 
 96 |     // Check the G2P tokens
 97 |     for (size_t i = 0; i < words.size(); i++) {
 98 |       word = rnnlm_.searchVocab (const_cast<char*> (words[i].c_str()));
 99 |       /*
100 |       vector<string> toks = tokenize_utf8_string (&words[i], &delim);
101 |       cout << toks[0] << endl;
102 |       vector<int>& tokens = rnnlm_.SearchJointVocab (toks[0]);
103 |       float tscore = -999;
104 |       for (int j = 0; j < tokens.size(); j++) {
105 |         cout << "  " << tokens[j] << "\t"
106 |              << rnnlm_.token_map[tokens[j]] << "\t";
107 |         rnnlm_.computeNet (last_word, tokens[j]);
108 |         float tval =  log10 (rnnlm_.neu2[rnnlm_.vocab[tokens[j]].class_index
109 |                              + rnnlm_.vocab_size].ac
110 |                              * rnnlm_.neu2[tokens[j]].ac);
111 |         if (tval > tscore) {
112 |           tscore = tval;
113 |           word = tokens[j];
114 |         }
115 |         cout << tval << endl;
116 |       }
117 |       /////////////////////
118 |       */
119 |       result.words.push_back (rnnlm_.token_map[word]);
120 |       rnnlm_.computeNet (last_word, word);
121 | 
122 | 
123 |       if (word != -1) {
124 |         result.word_probs.push_back (
125 |             log10 (rnnlm_.neu2[rnnlm_.vocab[word].class_index
126 |                    + rnnlm_.vocab_size].ac
127 |                    * rnnlm_.neu2[word].ac));
128 |         result.sent_prob += result.word_probs.back ();
129 |       } else {
130 |         // cout << "-1\t0\tOOV" << endl;
131 |         result.word_probs.push_back (0.0);
132 |       }
133 | 
134 |       rnnlm_.copyHiddenLayerToInput ();
135 |       if (last_word != -1)
136 |         rnnlm_.neu0[last_word].ac = 0;
137 | 
138 |       last_word = word;
139 |       for (a = MAX_NGRAM_ORDER - 1; a > 0; a--)
140 |         rnnlm_.history[a] = rnnlm_.history[a-1];
141 |       rnnlm_.history[0] = last_word;
142 |     }
143 | 
144 |     return result;
145 |   }
146 | 
147 |  private:
148 |   CRnnLM rnnlm_;  // The actual rnnlm
149 | };
150 | 
151 | #endif  // SRC_INCLUDE_RNNLMPY_H_
152 | 


--------------------------------------------------------------------------------
/src/include/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef SRC_INCLUDE_UTIL_H_
 2 | #define SRC_INCLUDE_UTIL_H_
 3 | /*
 4 |  Copyright (c) [2012-], Josef Robert Novak
 5 |  All rights reserved.
 6 | 
 7 |  Redistribution and use in source and binary forms, with or without
 8 |   modification, are permitted #provided that the following conditions
 9 |   are met:
10 | 
11 |   * Redistributions of source code must retain the above copyright
12 |     notice, this list of conditions and the following disclaimer.
13 |   * Redistributions in binary form must reproduce the above
14 |     copyright notice, this list of #conditions and the following
15 |     disclaimer in the documentation and/or other materials provided
16 |     with the distribution.
17 | 
18 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21 |  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22 |  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
23 |  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 |  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
27 |  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
29 |  OF THE POSSIBILITY OF SUCH DAMAGE.
30 | *
31 | */
32 | #include <fst/fstlib.h>
33 | #include <utf8.h>
34 | #include <unordered_map>
35 | #include <string>
36 | #include <vector>
37 | #ifdef __MACH__
38 | #include <mach/clock.h>
39 | #include <mach/mach.h>
40 | #endif
41 | using namespace fst;
42 | 
43 | typedef struct LabelDatum {int max, tot, lhs, rhs; bool lhsE, rhsE;} LabelDatum;
44 | typedef unordered_map<LogArc::Label, LabelDatum> LabelData;
45 | 
46 | string vec2str (vector<string> vec, string sep);
47 | 
48 | string itoas (int i);
49 | 
50 | vector<string> tokenize_utf8_string (string* utf8_string, string* delimiter);
51 | 
52 | vector<string> tokenize_entry (string* testword, string* sep,
53 |                                SymbolTable* syms);
54 | 
55 | vector<int> tokenize2ints (string* word, string* sep, const SymbolTable* syms);
56 | 
57 | timespec get_time( );
58 | 
59 | timespec diff (timespec start, timespec end);
60 | 
61 | void PhonetisaurusSetFlags (const char* usage, int* argc, char*** argv,
62 |                             bool remove_flags);
63 | 
64 | void LoadWordList (const std::string& filename,
65 |                    std::vector<std::string>* corpus);
66 | 
67 | void Split (const std::string& s, char delim, std::vector<std::string>& elems);
68 | 
69 | #endif  // SRC_INCLUDE_UTIL_H_
70 | 


--------------------------------------------------------------------------------
/src/lib/LatticePruner.cc:
--------------------------------------------------------------------------------
  1 | /*
 LatticePruner.cc
  3 | 
  4 |  Copyright (c) [2012-], Josef Robert Novak
  5 |  All rights reserved.
  6 | 
  7 |  Redistribution and use in source and binary forms, with or without
  8 |   modification, are permitted #provided that the following conditions
  9 |   are met:
 10 | 
 11 |   * Redistributions of source code must retain the above copyright 
 12 |     notice, this list of conditions and the following disclaimer.
 13 |     * Redistributions in binary form must reproduce the above 
 14 |     copyright notice, this list of #conditions and the following 
 15 |     disclaimer in the documentation and/or other materials provided 
 16 |     with the distribution.
 17 | 
 18 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 19 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 20 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 21 |  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 22 |  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
 23 |  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 24 |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 25 |  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
 26 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
 27 |  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 28 |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 
 29 |  OF THE POSSIBILITY OF SUCH DAMAGE.
 30 |  *
 31 |  */
 32 | using namespace std;
 33 | #include "include/LatticePruner.h"
 34 | 
 35 | 
 36 | LatticePruner::LatticePruner( ){
 37 |   //Default constructor
 38 | }
 39 | 
 40 | LatticePruner::LatticePruner( LabelData _penalties, TropicalWeight _beam, int _nbest, bool _fb, bool _penalize ) {
 41 |   penalties = _penalties;
 42 |   penalize  = _penalize;
 43 |   beam      = _beam;
 44 |   nbest     = _nbest;
 45 |   fb        = _fb;
 46 | }
 47 | 
 48 | LatticePruner::LatticePruner( TropicalWeight _beam, int _nbest, bool _fb ) {
 49 |   //TODO
 50 |   beam     = _beam;
 51 |   nbest    = _nbest;
 52 |   fb       = _fb;
 53 |   penalize = false;
 54 | }
 55 | 
 56 | void LatticePruner::prune_fst( VectorFst<StdArc>* fst ){
 57 |   /*
 58 |     Apply several optional pruning heuristics to the lattice.
 59 |   */
 60 |   if( penalize==true )
 61 |     _penalize_arcs( fst );
 62 | 
 63 |   if( fb==true )
 64 |     _forward_backward( fst );
 65 | 
 66 |   if( nbest==1 ){
 67 |     //If N=1 then all the remaining stuff is a waste of time.
 68 |     //This is because the pruning heuristics are all computed
 69 |     // *relative* to the 1-best hypothesis.  
 70 |     //This is in contrast LMBR and arc penalization.
 71 |     *fst = _nbest_prune( fst );
 72 |     return;
 73 |   }
 74 | 
 75 | 
 76 |   if( beam.Value() != LogWeight::Zero() )
 77 |     Prune( fst, beam );
 78 | 
 79 |   if( nbest > 1 )
 80 |     *fst = _nbest_prune( fst );
 81 | 
 82 |   return;
 83 | }
 84 | 
 85 | VectorFst<StdArc> LatticePruner::_nbest_prune( VectorFst<StdArc>* fst ){
 86 |   /* 
 87 |      This is just a destructive wrapper for the OpenFst ShortestPath 
 88 |      implementation.  I wish they'd implement desctructive versions of
 89 |      all the algos in the library...
 90 |   */
 91 |   VectorFst<StdArc> sfst;
 92 | 
 93 |   ShortestPath( *fst, &sfst, nbest );  
 94 | 
 95 |   return sfst;
 96 | }
 97 | 
 98 | void LatticePruner::_forward_backward( VectorFst<StdArc>* fst ){
 99 |   /*
100 |     OpenFst-based implementation of forward-backward lattice pruning based on,
101 |        Sixtus and Ortmanns, "HIGH QUALITY WORD GRAPHS USING FORWARD-BACKWARD PRUNING", 1999
102 | 
103 |     Note-to-self: It seems to give consistent WER and PER improvements so I guess I 
104 |      got the implementation right, but it seems like maybe it was too easy.
105 |   */
106 |   //Setup
107 |   VectorFst<LogArc>* pfst = new VectorFst<LogArc>();
108 |   VectorFst<LogArc>* lfst = new VectorFst<LogArc>();
109 |   vector<LogWeight>  alpha, beta;
110 | 
111 |   Map(*fst, lfst, StdToLogMapper());
112 | 
113 |   //Normalize so that subsequent operations don't go crazy
114 |   Push<LogArc, REWEIGHT_TO_FINAL>(*lfst, pfst, kPushWeights);
115 |   for( StateIterator<VectorFst<LogArc> > siter(*pfst); !siter.Done(); siter.Next() ){
116 |     size_t i = siter.Value();
117 |     if( pfst->Final(i)!=LogArc::Weight::Zero() ){
118 |       pfst->SetFinal(i,LogArc::Weight::One());
119 |     }
120 |   }
121 | 
122 |   //Compute Forward and Backward probabilities
123 |   ShortestDistance( *pfst, &alpha );
124 |   ShortestDistance( *pfst, &beta, true );
125 | 
126 |   //Compute arc posteriors.  This is the same as the Expectation step.
127 |   for( StateIterator<VectorFst<LogArc> > siter(*pfst); !siter.Done(); siter.Next() ){
128 |     LogArc::StateId q = siter.Value();
129 |     for( MutableArcIterator<VectorFst<LogArc> > aiter(pfst,q); !aiter.Done(); aiter.Next() ){
130 |       LogArc    arc   = aiter.Value();
131 |       LogWeight gamma = Divide(Times(Times(alpha[q], arc.weight), beta[arc.nextstate]), beta[0]);
132 | 
133 |       if( gamma.Value()==gamma.Value() ){
134 |         arc.weight = gamma;
135 |         aiter.SetValue(arc);
136 |       }
137 |     }
138 |   }
139 | 
140 |   Map(*pfst, fst, LogToStdMapper()); 
141 | 
142 |   delete lfst;
143 |   delete pfst;
144 |   return;
145 | }
146 | 
147 | void LatticePruner::_penalize_arcs( VectorFst<StdArc>* fst ){
148 | 
149 |   for( StateIterator<VectorFst<StdArc> > siter(*fst); !siter.Done(); siter.Next() ){
150 |     StdArc::StateId q = siter.Value();
151 |     for( MutableArcIterator<VectorFst<StdArc> > aiter(fst,q); !aiter.Done(); aiter.Next() ){
152 |       StdArc     arc = aiter.Value();
153 |       LabelDatum* ld = &penalties[arc.ilabel];
154 | 
155 |       if( ld->lhs>1 && ld->rhs>1 ){
156 |         arc.weight = 999; 
157 |       }else{
158 |         arc.weight = arc.weight.Value() * ld->max;
159 |       }
160 |       if( arc.weight == LogWeight::Zero() )
161 |         arc.weight = 999;
162 |       if( arc.weight != arc.weight )
163 |         arc.weight = 999;
164 |       aiter.SetValue(arc);
165 |     }
166 |   }
167 | 
168 |   return;
169 | }
170 | 
171 | 


--------------------------------------------------------------------------------
/src/lib/feature-reader.cc:
--------------------------------------------------------------------------------
 1 | #include <fst/fstlib.h>
 2 | #include "LegacyRnnLMHash.h"
 3 | #include "RnnLMDecoder.h"
 4 | #include "LegacyRnnLMDecodable.h"
 5 | #include "LegacyRnnLMReader.h"
 6 | using namespace fst;
 7 | 
 8 | //typedef std::unordered_map<std::string, std::vector<int> > FMAP;
 9 | typedef std::unordered_map<int, std::vector<int> > FMAP;
10 | 
11 | template<class H>
12 | void LoadFeatureConf (const H&h, FMAP* fmap, std::string& featurefilename) {
13 |   std::ifstream ifp (featurefilename.c_str ());
14 |   std::string prefix = "#";
15 |   std::string line;
16 | 
17 |   if (ifp.is_open ()) {
18 |     while (ifp.good ()) {
19 |       getline (ifp, line);
20 |       if (line.empty ())
21 | 	continue;
22 | 
23 |       std::vector<int> ids;
24 |       int id;
25 |       std::string word;
26 |       if (!line.compare (0, prefix.size (), prefix))
27 | 	continue;
28 |       
29 |       std::stringstream ss (line);
30 |       ss >> word;
31 |       while (ss >> id)
32 | 	ids.push_back (id);
33 |       cout << "Item: " << word << " " << h.GetWordId (word) << endl;
34 |       (*fmap) [h.GetWordId (word)] = ids;
35 |     }
36 |     ifp.close ();
37 |   }
38 | }
39 | 
40 | typedef LegacyRnnLMDecodable<Token, LegacyRnnLMHash> Decodable;
41 | DEFINE_string (rnnlm,   "", "The input RnnLM model.");
42 | DEFINE_string (feats,   "", "Auxiliary features conf file.");
43 | 
44 | int main (int argc, char* argv []) {
45 |   string usage = "feature-reader --rnnlm=test.rnnlm --feats=features.conf\n\n Usage: ";
46 |   set_new_handler (FailedNewHandler);
47 |   SetFlags (usage.c_str (), &argc, &argv, false);
48 | 
49 |   LegacyRnnLMReader<Decodable, LegacyRnnLMHash> reader (FLAGS_rnnlm);
50 |   LegacyRnnLMHash h = reader.CopyVocabHash ();
51 | 
52 |   FMAP fmap;
53 | 
54 |   LoadFeatureConf (h, &fmap, FLAGS_feats);
55 | 
56 |   for (FMAP::iterator it = fmap.begin (); it != fmap.end (); ++it) {
57 |     std::cout << it->first << "\t";
58 |     const std::vector<int>& feats = (*it).second;
59 |     for (int i = 0; i < feats.size (); i++)
60 |       cout << feats [i] << ((i == feats.size ()) ? "" : " ");
61 |     cout << endl;
62 |   }
63 | 			    
64 |   return 0;
65 | }
66 | 


--------------------------------------------------------------------------------
/src/lib/util.cc:
--------------------------------------------------------------------------------
  1 | /*
  2 |  Copyright (c) [2012-], Josef Robert Novak
  3 |  All rights reserved.
  4 | 
  5 |  Redistribution and use in source and binary forms, with or without
  6 |   modification, are permitted provided that the following conditions
  7 |   are met:
  8 | 
  9 |   * Redistributions of source code must retain the above copyright 
 10 |     notice, this list of conditions and the following disclaimer.
 11 |     * Redistributions in binary form must reproduce the above 
 12 |     copyright notice, this list of conditions and the following 
 13 |     disclaimer in the documentation and/or other materials provided 
 14 |     with the distribution.
 15 | 
 16 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 17 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 18 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 19 |  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 20 |  COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 
 21 |  INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 22 |  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 23 |  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
 24 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 
 25 |  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 26 |  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 
 27 |  OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | *
 29 | */
 30 | using namespace std;
 31 | #include <include/util.h>
 32 | using namespace fst;
 33 | 
 34 | 
 35 | string vec2str( vector<string> vec, string sep ){
 36 |   string ss;
 37 |   for(size_t i = 0; i < vec.size(); ++i){
 38 |     if(i != 0)
 39 |       ss += sep;
 40 |     ss += vec[i];
 41 |   }
 42 |   return ss;
 43 | }
 44 | 
 45 | string itoas( int i ){
 46 |   std::stringstream ostring;
 47 |   ostring << i;
 48 |   return ostring.str();
 49 | }
 50 | 
 51 | vector<string> tokenize_utf8_string (string* utf8_string, string* delimiter) {
 52 |   /*
 53 |      Support for tokenizing a utf-8 string. Adapted to also 
 54 |      support a delimiter. Note that leading, trailing or multiple 
 55 |      consecutive delimiters will result in empty vector elements.  
 56 |      Normally should not be a problem but just in case. Also note 
 57 |      that any tokens that cannot be found in the model symbol table will be
 58 |      deleted from the input word prior to grapheme-to-phoneme conversion.
 59 | 
 60 |      http://stackoverflow.com/questions/2852895/c-iterate-or-split-\
 61 |       utf-8-string-into-array-of-symbols#2856241
 62 |   */
 63 |   char* str   = (char*) utf8_string->c_str (); // utf-8 string
 64 |   char* str_i = str;                           // string iterator
 65 |   char* str_j = str;
 66 |   char* end   = str + strlen (str) + 1;        // end iterator
 67 |   vector<string> string_vec;
 68 |   if (delimiter->compare ("") != 0)
 69 |     string_vec.push_back ("");
 70 | 
 71 |   do {
 72 |     str_j = str_i;
 73 |     utf8::uint32_t code = utf8::next (str_i, end); // get 32 bit code
 74 |     if (code == 0)
 75 |       continue;
 76 |     int start = strlen (str) - strlen (str_j);
 77 |     int end   = strlen (str) - strlen (str_i);
 78 |     int len   = end - start;
 79 |       
 80 |     if (delimiter->compare ("") == 0) {
 81 |       string_vec.push_back (utf8_string->substr (start,len));
 82 |     } else {
 83 |       if (delimiter->compare (utf8_string->substr (start, len)) == 0)
 84 |         string_vec.push_back ("");
 85 |       else
 86 |         string_vec [string_vec.size () - 1] += utf8_string->substr (start, len);
 87 |     }
 88 |   } while (str_i < end);
 89 |   
 90 |   return string_vec;
 91 | }
 92 | 
 93 | 
 94 | vector<string> tokenize_entry (string* testword, string* sep, 
 95 | 			       SymbolTable* syms) {
 96 |   vector<string> tokens = tokenize_utf8_string (testword, sep);
 97 |   vector<string> entry;
 98 |   for (unsigned int i=0; i<tokens.size (); i++) {
 99 |     if (syms->Find (tokens.at (i)) != -1) {
100 |       entry.push_back (tokens.at (i));
101 |     }else{
102 |       cerr << "Symbol: '" << tokens.at (i)
103 |            << "' not found in input symbols table." << endl
104 |            << "Mapping to null..." << endl;
105 |     }
106 |   }
107 | 
108 |   return entry;
109 | }
110 | 
111 | vector<int> tokenize2ints (string* testword, string* sep, 
112 | 			   const SymbolTable* syms) {
113 |   vector<string> tokens = tokenize_utf8_string (testword, sep);
114 |   vector<int> entry;
115 |   for (unsigned int i=0; i<tokens.size(); i++) {
116 |     int label = syms->Find (tokens[i]);
117 |     if (label == -1)
118 |       cerr << "Symbol: '" << tokens[i]
119 |            << "' not found in input symbols table." << endl
120 |            << "Mapping to null..." << endl;
121 |     else
122 |       entry.push_back (label);
123 |   }
124 | 
125 |   return entry;
126 | }
127 | 
// Return the current time as a timespec.
#ifdef __MACH__
// Mach variant: the time is queried from the host clock service.
timespec get_time( ){
  clock_serv_t cclock;
  mach_timespec_t mts;
  host_get_clock_service(mach_host_self(), REALTIME_CLOCK, &cclock);
  clock_get_time(cclock, &mts);
  // Release the send right obtained above; without this every call
  // leaks one port in the calling task.
  mach_port_deallocate(mach_task_self(), cclock);

  timespec ts = {mts.tv_sec, mts.tv_nsec};
  return ts;
}
#else
// POSIX variant: read CLOCK_REALTIME directly.
timespec get_time( ){
  timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);
  return ts;
}
#endif
145 | 
// Return the elapsed time `end - start` as a timespec, borrowing one second
// when the nanosecond difference would otherwise be negative.
timespec diff(timespec start, timespec end){
  timespec elapsed;
  elapsed.tv_sec  = end.tv_sec - start.tv_sec;
  elapsed.tv_nsec = end.tv_nsec - start.tv_nsec;
  if (elapsed.tv_nsec < 0) {
    elapsed.tv_sec  -= 1;
    elapsed.tv_nsec += 1000000000;
  }
  return elapsed;
}
157 | 
// --help flag consulted at the end of PhonetisaurusSetFlags (non-Apple branch).
DEFINE_bool   (help, false, "show usage information");

// Parse leading command line options of the form -name[=value] /
// --name[=value] into the registered flags.
//
//   usage        : text printed first when --help is given
//   argc, argv   : pointers to the program's argument count and vector
//   remove_flags : forwarded to SetFlags on Apple platforms.
//                  NOTE(review): the non-Apple branch below never reads it,
//                  so processed flags are not removed from argv there -
//                  confirm that callers do not rely on removal.
//
// On Apple platforms the call is delegated unchanged to SetFlags.
// Elsewhere, parsing stops at the first argument that does not start with
// '-' (or is exactly "-"); an option that no flag register accepts is
// reported through LOG(FATAL).  When --help was set, the usage text and the
// flags whose source file is not fst.cc, symbol-table.cc or util.cc are
// printed and the process exits with status 0.
void PhonetisaurusSetFlags (const char* usage, int* argc, char*** argv,
			    bool remove_flags) {
  //Workaround for Apple's. It just skips all the options processing. 
#if defined(__APPLE__) && defined(__MACH__)
  SetFlags (usage, argc, argv, remove_flags);
#else
  int index = 1;
  for (; index < *argc; ++index) {
    string argval = (*argv)[index];

    // The first non-option argument (or a lone "-") ends flag processing
    if (argval[0] != '-' || argval == "-")
      break;
    while (argval[0] == '-')
      argval = argval.substr(1);  // remove initial '-'s

    string arg = argval;
    string val = "";
    
    // split argval (arg=val) into arg and val
    size_t pos = argval.find("=");
    if (pos != string::npos) {
      arg = argval.substr(0, pos);
      val = argval.substr(pos + 1);
    }


    // Offer the option to each typed register in turn; the first register
    // that accepts it wins and processing moves on to the next argument.
    FlagRegister<bool> *bool_register =
      FlagRegister<bool>::GetRegister();
    if (bool_register->SetFlag(arg, val)) 
      continue;
    FlagRegister<string> *string_register =
      FlagRegister<string>::GetRegister();
    if (string_register->SetFlag(arg, val))
      continue;
    FlagRegister<int32> *int32_register =
      FlagRegister<int32>::GetRegister();
    if (int32_register->SetFlag(arg, val))
      continue;
    FlagRegister<int64> *int64_register =
      FlagRegister<int64>::GetRegister();
    if (int64_register->SetFlag(arg, val))
      continue;
    FlagRegister<double> *double_register =
      FlagRegister<double>::GetRegister();
    if (double_register->SetFlag(arg, val))
      continue;
    
    // No register recognised the option
    LOG(FATAL) << "SetFlags: Bad option: " << (*argv)[index];
  }
  
  if (FLAGS_help) {
    //Just show program flags - NOT general OpenFst flags
    // There are too many and they are just confusing.
    // Collected as (first, second) pairs; the loop below reads them as
    // (source file, usage text).
    std::set< pair<string, string> > usage_set;

    cout << usage << "\n";

    FlagRegister<bool> *bool_register = FlagRegister<bool>::GetRegister();
    bool_register->GetUsage(&usage_set);
    FlagRegister<string> *string_register = FlagRegister<string>::GetRegister();
    string_register->GetUsage(&usage_set);
    FlagRegister<int32> *int32_register = FlagRegister<int32>::GetRegister();
    int32_register->GetUsage(&usage_set);
    FlagRegister<int64> *int64_register = FlagRegister<int64>::GetRegister();
    int64_register->GetUsage(&usage_set);
    FlagRegister<double> *double_register = FlagRegister<double>::GetRegister();
    double_register->GetUsage(&usage_set);

    for (std::set< pair<string, string> >::const_iterator it =
           usage_set.begin();
         it != usage_set.end();
         ++it) {
      const string &file = it->first;
      // Note: this local deliberately shadows the `usage` parameter
      const string &usage = it->second;
      
      // Skip flags defined in these library files (this one included,
      // which is why --help itself is printed by hand below)
      //if (file.compare ("flags.cc") == 0 || file.compare ("fst.cc") == 0 
      if (file.compare ("fst.cc") == 0 \
          || file.compare ("symbol-table.cc") == 0 || \
          file.compare ("util.cc") == 0)
        continue;
      
      //Else print out the args - they are from the actual program
      cout << usage << endl;
    }
    //Fake this
    cout << "  --help: type = bool, default = false" << endl;
    cout << "  show usage information" << endl;
    exit (0);
  }
#endif
}
250 | 
// Append every non-empty line of the file `filename` to *corpus, in file
// order.  A file that cannot be opened leaves *corpus unchanged.
void LoadWordList (const std::string& filename,
                  std::vector<std::string>* corpus) {
  std::ifstream words (filename.c_str ());
  if (!words.is_open ())
    return;

  for (std::string entry; getline (words, entry); ) {
    if (!entry.empty ())
      corpus->push_back (entry);
  }
  words.close ();
}
267 | 
268 | 
// Split `s` on every occurrence of `delim` and append the pieces to `elems`.
// Empty pieces between consecutive delimiters are kept; a delimiter at the
// very end of `s` does not produce a trailing empty piece, and an empty `s`
// appends nothing.
void Split (const std::string& s, char delim, std::vector<std::string>& elems) {
  std::string::size_type begin = 0;
  while (begin < s.size ()) {
    const std::string::size_type stop = s.find (delim, begin);
    if (stop == std::string::npos) {
      elems.push_back (s.substr (begin));
      break;
    }
    elems.push_back (s.substr (begin, stop - begin));
    begin = stop + 1;
  }
}
275 | 


--------------------------------------------------------------------------------
/test/check-nbest-wer.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import re, sys, os
  3 | from collections import defaultdict
  4 | 
  5 | def RunRegressionPrep () :
  6 |     print "Standard alignment"
  7 |     command = """phonetisaurus-align --input=g014b2b/g014b2b.train \
  8 |     --ofile=g014b2b/g014b2b.corpus \
  9 |     --seq1_del=false \
 10 |     --grow=false
 11 |     """
 12 |     os.system (command)
 13 | 
 14 |     print "Alignment with support for growing"
 15 |     command = """phonetisaurus-align --input=g014b2b/g014b2b.train \
 16 |     --ofile=g014b2b/g014b2b.grow.corpus \
 17 |     --seq1_del=false \
 18 |     --grow=true
 19 |     """
 20 |     os.system (command)
 21 | 
 22 |     print "\nTraining standard ARPA"
 23 |     command = """estimate-ngram -o 8 -t g014b2b/g014b2b.corpus \
 24 |     -wl g014b2b/g014b2b.o8.arpa
 25 |     """
 26 |     os.system (command)
 27 | 
 28 |     print "Training grow-supported ARPA"
 29 |     command = """estimate-ngram -o 8 -t g014b2b/g014b2b.grow.corpus \
 30 |     -wl g014b2b/g014b2b.grow.o8.arpa
 31 |     """
 32 |     os.system (command)
 33 | 
 34 |     print "\nConverting stanard model to Fst"
 35 |     command = """phonetisaurus-arpa2wfst --lm=g014b2b/g014b2b.o8.arpa \
 36 |     --ofile=g014b2b/g014b2b.o8.fst
 37 |     """
 38 |     os.system (command)
 39 | 
 40 |     print "Converting grow-reported stanard model to Fst"
 41 |     command = """phonetisaurus-arpa2wfst --lm=g014b2b/g014b2b.grow.o8.arpa \
 42 |     --ofile=g014b2b/g014b2b.grow.o8.fst
 43 |     """
 44 |     os.system (command)
 45 | 
 46 |     print "\nTesting 5-best standard"
 47 |     command = """phonetisaurus-g2pfst --model=g014b2b/g014b2b.o8.fst \
 48 |     --wordlist=g014b2b/g014b2b.words \
 49 |     --nbest=5 | perl -e'while(<>){s/\|/ /g; print $_;}' \
 50 |     > g014b2b/g014b2b-n5.hyp
 51 |     """
 52 |     os.system (command)
 53 | 
 54 |     print "Testing 5-best grow-supported standard"
 55 |     command = """phonetisaurus-g2pfst --model=g014b2b/g014b2b.grow.o8.fst \
 56 |     --wordlist=g014b2b/g014b2b.words \
 57 |     --nbest=5 | perl -e'while(<>){s/\|/ /g; print $_;}' \
 58 |     > g014b2b/g014b2b-grow-n5.hyp
 59 |     """
 60 |     os.system (command)
 61 | 
 62 |     return
 63 | 
 64 | def LoadRefs (refs_file) :
 65 |     refs = {}
 66 | 
 67 |     with open (refs_file, "r") as ifp :
 68 |         for line in ifp :
 69 |             parts = re.split (ur"\t", line.decode ("utf8").strip ())
 70 |             word = parts.pop (0)
 71 |             refs [word] = parts
 72 | 
 73 |     return refs
 74 | 
 75 | def LoadNbestHyps (hyps_file) :
 76 |     hyps = defaultdict (list)
 77 | 
 78 |     with open (hyps_file, "r") as ifp :
 79 |         for line in ifp :
 80 |             parts = re.split (ur"\t", line.decode ("utf8").strip ())
 81 |             if parts [-1] == "" :
 82 |                 continue
 83 | 
 84 |             hyps [parts [0]].append (parts [-1])
 85 | 
 86 |     return hyps
 87 | 
 88 | def ComputeEval (hyps) :
 89 |     refs = LoadRefs ("g014b2b/g014b2b.ref")
 90 |     hyps = LoadNbestHyps (hyps)
 91 | 
 92 |     total = 0.
 93 |     corr = 0.
 94 |     for ref_word, ref_prons in refs.iteritems () :
 95 |         hyp_prons = hyps [ref_word]
 96 |         ref_set = set (ref_prons)
 97 |         hyp_set = set (hyp_prons)
 98 |         intersection = ref_set.intersection (hyp_set)
 99 | 
100 |         total += 1.0
101 |         if len (intersection) > 0 :
102 |             corr += 1.0
103 | 
104 |     print "Corr: {0}, Err: {1}, WACC: {2:0.2f}%, WER: {3:0.2f}%".format (
105 |         corr,
106 |         total - corr,
107 |         corr / total * 100,
108 |         (1.0 - (corr / total)) * 100
109 |     )
110 | 
111 | 
if __name__ == "__main__" :
    # Script entry point: rebuild the regression artefacts, then score the
    # standard and the grow-enabled 5-best hypothesis files for the prefix.
    import argparse

    parser = argparse.ArgumentParser (
        description="{0} --prefix g014b2b".format (sys.argv [0])
    )
    parser.add_argument ("--prefix", "-p", default="g014b2b",
                         help="Prefix.")
    prefix = parser.parse_args ().prefix

    RunRegressionPrep ()
    for hyp_template in ("{0}/{0}-n5.hyp", "{0}/{0}-grow-n5.hyp") :
        ComputeEval (hyp_template.format (prefix))
124 | 


--------------------------------------------------------------------------------