├── AUTHORS ├── INSTALL ├── LICENSE ├── Makefile.am ├── Makefile.in ├── NEWS ├── README ├── aclocal.m4 ├── ar-lib ├── compile ├── config.guess ├── config.sub ├── configure ├── configure.ac ├── data └── festival-2.4_sparrowhawk.patch ├── depcomp ├── documentation ├── README.md └── grammars │ ├── en_toy │ ├── README │ ├── byte.far │ ├── byte.grm │ ├── classify │ │ ├── cardinal.grm │ │ ├── date.grm │ │ ├── measure.grm │ │ ├── measure.tsv │ │ ├── money.grm │ │ ├── money.tsv │ │ ├── months.tsv │ │ ├── punctuation.grm │ │ ├── time.grm │ │ ├── tokenize_and_classify.grm │ │ └── word.grm │ ├── util.far │ ├── util.grm │ ├── verbalize │ │ ├── CARDINAL_NUMBER_NAME │ │ ├── ORDINAL_NUMBER_NAME │ │ ├── date.grm │ │ ├── measure.grm │ │ ├── money.grm │ │ ├── money.tsv │ │ ├── numbers.grm │ │ ├── time.grm │ │ ├── verbalize.grm │ │ └── verbatim.grm │ └── verbalize_serialization │ │ ├── CARDINAL_NUMBER_NAME │ │ ├── ORDINAL_NUMBER_NAME │ │ ├── date.grm │ │ ├── measure.grm │ │ ├── money.grm │ │ ├── money.tsv │ │ ├── numbers.grm │ │ ├── time.grm │ │ ├── verbalize.grm │ │ └── verbatim.grm │ ├── sentence_boundary_exceptions.txt │ ├── sparrowhawk_configuration.ascii_proto │ ├── sparrowhawk_configuration_serialization.ascii_proto │ ├── test.txt │ ├── tokenizer.ascii_proto │ ├── verbalizer.ascii_proto │ ├── verbalizer_serialization.ascii_proto │ └── verbalizer_serialization_spec.ascii_proto ├── install-sh ├── ltmain.sh ├── m4 ├── libtool.m4 ├── ltoptions.m4 ├── ltsugar.m4 ├── ltversion.m4 └── lt~obsolete.m4 ├── missing └── src ├── Makefile.am ├── Makefile.in ├── bin ├── Makefile.am ├── Makefile.in └── normalizer_main.cc ├── include ├── Makefile.am ├── Makefile.in └── sparrowhawk │ ├── field_path.h │ ├── io_utils.h │ ├── items.pb.h │ ├── links.pb.h │ ├── logger.h │ ├── normalizer.h │ ├── numbers.h │ ├── protobuf_parser.h │ ├── protobuf_serializer.h │ ├── record_serializer.h │ ├── regexp.h │ ├── rule_order.pb.h │ ├── rule_system.h │ ├── semiotic_classes.pb.h │ ├── sentence_boundary.h │ ├── 
sparrowhawk_configuration.pb.h │ ├── spec_serializer.h │ ├── string_utils.h │ └── style_serializer.h ├── lib ├── Makefile.am ├── Makefile.in ├── field_path.cc ├── io_utils.cc ├── items.pb.cc ├── links.pb.cc ├── normalizer.cc ├── normalizer_utils.cc ├── numbers.cc ├── protobuf_parser.cc ├── protobuf_serializer.cc ├── record_serializer.cc ├── regexp.cc ├── rule_order.pb.cc ├── rule_system.cc ├── semiotic_classes.pb.cc ├── sentence_boundary.cc ├── serialization_spec.pb.cc ├── sparrowhawk_configuration.pb.cc ├── spec_serializer.cc ├── string_utils.cc └── style_serializer.cc └── proto ├── Makefile.am ├── Makefile.in ├── items.proto ├── links.proto ├── rule_order.proto ├── semiotic_classes.proto ├── serialization_spec.proto └── sparrowhawk_configuration.proto /AUTHORS: -------------------------------------------------------------------------------- 1 | Google Inc. 2 | 3 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installation Instructions 2 | ************************* 3 | 4 | Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, 5 | 2006, 2007 Free Software Foundation, Inc. 6 | 7 | This file is free documentation; the Free Software Foundation gives 8 | unlimited permission to copy, distribute and modify it. 9 | 10 | Basic Installation 11 | ================== 12 | 13 | Briefly, the shell commands `./configure; make; make install' should 14 | configure, build, and install this package. The following 15 | more-detailed instructions are generic; see the `README' file for 16 | instructions specific to this package. 17 | 18 | The `configure' shell script attempts to guess correct values for 19 | various system-dependent variables used during compilation. It uses 20 | those values to create a `Makefile' in each directory of the package. 21 | It may also create one or more `.h' files containing system-dependent 22 | definitions. 
Finally, it creates a shell script `config.status' that 23 | you can run in the future to recreate the current configuration, and a 24 | file `config.log' containing compiler output (useful mainly for 25 | debugging `configure'). 26 | 27 | It can also use an optional file (typically called `config.cache' 28 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 29 | the results of its tests to speed up reconfiguring. Caching is 30 | disabled by default to prevent problems with accidental use of stale 31 | cache files. 32 | 33 | If you need to do unusual things to compile the package, please try 34 | to figure out how `configure' could check whether to do them, and mail 35 | diffs or instructions to the address given in the `README' so they can 36 | be considered for the next release. If you are using the cache, and at 37 | some point `config.cache' contains results you don't want to keep, you 38 | may remove or edit it. 39 | 40 | The file `configure.ac' (or `configure.in') is used to create 41 | `configure' by a program called `autoconf'. You need `configure.ac' if 42 | you want to change it or regenerate `configure' using a newer version 43 | of `autoconf'. 44 | 45 | The simplest way to compile this package is: 46 | 47 | 1. `cd' to the directory containing the package's source code and type 48 | `./configure' to configure the package for your system. 49 | 50 | Running `configure' might take a while. While running, it prints 51 | some messages telling which features it is checking for. 52 | 53 | 2. Type `make' to compile the package. 54 | 55 | 3. Optionally, type `make check' to run any self-tests that come with 56 | the package. 57 | 58 | 4. Type `make install' to install the programs and any data files and 59 | documentation. 60 | 61 | 5. You can remove the program binaries and object files from the 62 | source code directory by typing `make clean'. 
To also remove the 63 | files that `configure' created (so you can compile the package for 64 | a different kind of computer), type `make distclean'. There is 65 | also a `make maintainer-clean' target, but that is intended mainly 66 | for the package's developers. If you use it, you may have to get 67 | all sorts of other programs in order to regenerate files that came 68 | with the distribution. 69 | 70 | 6. Often, you can also type `make uninstall' to remove the installed 71 | files again. 72 | 73 | Compilers and Options 74 | ===================== 75 | 76 | Some systems require unusual options for compilation or linking that the 77 | `configure' script does not know about. Run `./configure --help' for 78 | details on some of the pertinent environment variables. 79 | 80 | You can give `configure' initial values for configuration parameters 81 | by setting variables in the command line or in the environment. Here 82 | is an example: 83 | 84 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 85 | 86 | *Note Defining Variables::, for more details. 87 | 88 | Compiling For Multiple Architectures 89 | ==================================== 90 | 91 | You can compile the package for more than one kind of computer at the 92 | same time, by placing the object files for each architecture in their 93 | own directory. To do this, you can use GNU `make'. `cd' to the 94 | directory where you want the object files and executables to go and run 95 | the `configure' script. `configure' automatically checks for the 96 | source code in the directory that `configure' is in and in `..'. 97 | 98 | With a non-GNU `make', it is safer to compile the package for one 99 | architecture at a time in the source code directory. After you have 100 | installed the package for one architecture, use `make distclean' before 101 | reconfiguring for another architecture. 
102 | 103 | Installation Names 104 | ================== 105 | 106 | By default, `make install' installs the package's commands under 107 | `/usr/local/bin', include files under `/usr/local/include', etc. You 108 | can specify an installation prefix other than `/usr/local' by giving 109 | `configure' the option `--prefix=PREFIX'. 110 | 111 | You can specify separate installation prefixes for 112 | architecture-specific files and architecture-independent files. If you 113 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 114 | PREFIX as the prefix for installing programs and libraries. 115 | Documentation and other data files still use the regular prefix. 116 | 117 | In addition, if you use an unusual directory layout you can give 118 | options like `--bindir=DIR' to specify different values for particular 119 | kinds of files. Run `configure --help' for a list of the directories 120 | you can set and what kinds of files go in them. 121 | 122 | If the package supports it, you can cause programs to be installed 123 | with an extra prefix or suffix on their names by giving `configure' the 124 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 125 | 126 | Optional Features 127 | ================= 128 | 129 | Some packages pay attention to `--enable-FEATURE' options to 130 | `configure', where FEATURE indicates an optional part of the package. 131 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 132 | is something like `gnu-as' or `x' (for the X Window System). The 133 | `README' should mention any `--enable-' and `--with-' options that the 134 | package recognizes. 135 | 136 | For packages that use the X Window System, `configure' can usually 137 | find the X include and library files automatically, but if it doesn't, 138 | you can use the `configure' options `--x-includes=DIR' and 139 | `--x-libraries=DIR' to specify their locations. 
140 | 141 | Specifying the System Type 142 | ========================== 143 | 144 | There may be some features `configure' cannot figure out automatically, 145 | but needs to determine by the type of machine the package will run on. 146 | Usually, assuming the package is built to be run on the _same_ 147 | architectures, `configure' can figure that out, but if it prints a 148 | message saying it cannot guess the machine type, give it the 149 | `--build=TYPE' option. TYPE can either be a short name for the system 150 | type, such as `sun4', or a canonical name which has the form: 151 | 152 | CPU-COMPANY-SYSTEM 153 | 154 | where SYSTEM can have one of these forms: 155 | 156 | OS KERNEL-OS 157 | 158 | See the file `config.sub' for the possible values of each field. If 159 | `config.sub' isn't included in this package, then this package doesn't 160 | need to know the machine type. 161 | 162 | If you are _building_ compiler tools for cross-compiling, you should 163 | use the option `--target=TYPE' to select the type of system they will 164 | produce code for. 165 | 166 | If you want to _use_ a cross compiler, that generates code for a 167 | platform different from the build platform, you should specify the 168 | "host" platform (i.e., that on which the generated programs will 169 | eventually be run) with `--host=TYPE'. 170 | 171 | Sharing Defaults 172 | ================ 173 | 174 | If you want to set default values for `configure' scripts to share, you 175 | can create a site shell script called `config.site' that gives default 176 | values for variables like `CC', `cache_file', and `prefix'. 177 | `configure' looks for `PREFIX/share/config.site' if it exists, then 178 | `PREFIX/etc/config.site' if it exists. Or, you can set the 179 | `CONFIG_SITE' environment variable to the location of the site script. 180 | A warning: not all `configure' scripts look for a site script. 
181 | 182 | Defining Variables 183 | ================== 184 | 185 | Variables not defined in a site shell script can be set in the 186 | environment passed to `configure'. However, some packages may run 187 | configure again during the build, and the customized values of these 188 | variables may be lost. In order to avoid this problem, you should set 189 | them in the `configure' command line, using `VAR=value'. For example: 190 | 191 | ./configure CC=/usr/local2/bin/gcc 192 | 193 | causes the specified `gcc' to be used as the C compiler (unless it is 194 | overridden in the site shell script). 195 | 196 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 197 | an Autoconf bug. Until the bug is fixed you can use this workaround: 198 | 199 | CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash 200 | 201 | `configure' Invocation 202 | ====================== 203 | 204 | `configure' recognizes the following options to control how it operates. 205 | 206 | `--help' 207 | `-h' 208 | Print a summary of the options to `configure', and exit. 209 | 210 | `--version' 211 | `-V' 212 | Print the version of Autoconf used to generate the `configure' 213 | script, and exit. 214 | 215 | `--cache-file=FILE' 216 | Enable the cache: use and save the results of the tests in FILE, 217 | traditionally `config.cache'. FILE defaults to `/dev/null' to 218 | disable caching. 219 | 220 | `--config-cache' 221 | `-C' 222 | Alias for `--cache-file=config.cache'. 223 | 224 | `--quiet' 225 | `--silent' 226 | `-q' 227 | Do not print messages saying which checks are being made. To 228 | suppress all normal output, redirect it to `/dev/null' (any error 229 | messages will still be shown). 230 | 231 | `--srcdir=DIR' 232 | Look for the package's source code in directory DIR. Usually 233 | `configure' can determine that directory automatically. 234 | 235 | `configure' also accepts some other, not widely useful, options. Run 236 | `configure --help' for more details. 
237 | 238 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | ACLOCAL_AMFLAGS = -I m4 3 | EXTRA_DIST = LICENSE data documentation 4 | 5 | 6 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Sparrowhawk - Release 0.1 2 | 3 | This is the alpha version. 4 | 5 | Sparrowhawk - Release 1.0 6 | 7 | * Added new verbalizer serialization, with accompanying grammars. 8 | 9 | 10 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Sparrowhawk - Release 1.0 2 | 3 | Sparrowhawk is an open-source implementation of Google's Kestrel text-to-speech 4 | text normalization system. It follows the discussion of the Kestrel system as 5 | described in: 6 | 7 | Ebden, Peter and Sproat, Richard. 2015. The Kestrel TTS text normalization 8 | system. Natural Language Engineering, Issue 03, pp 333-353. 9 | 10 | After sentence segmentation (sentence_boundary.h), the individual sentences are 11 | first tokenized with each token being classified, and then passed to the 12 | normalizer. The system can output as an unannotated string of words, and richer 13 | annotation with links between input tokens, their input string positions, and 14 | the output words is also available. 15 | 16 | REQUIREMENTS: 17 | 18 | This version is known to work under Linux using g++ (>= 4.6) and 19 | MacOS X using XCode 5. Expected to work wherever adequate POSIX 20 | (dlopen, ssize_t, basename), c99 (snprintf, strtoll, ), 21 | and C++11 (, , ) support 22 | are available. 
23 | 24 | You must have installed the following packages: 25 | 26 | - OpenFst 1.5.4 or higher (www.openfst.org) 27 | - Thrax 1.2.2 or higher (http://www.openfst.org/twiki/bin/view/GRM/Thrax) 28 | - re2 (https://github.com/google/re2) 29 | - protobuf (http://protobuf.googlecode.com/files/protobuf-2.5.0.tar.gz --- 30 | see e.g. http://jugnu-life.blogspot.com/2013/09/install-protobuf-25-on-ubuntu.html) 31 | 32 | INSTALLATION: 33 | Follow the generic GNU build system instructions in ./INSTALL. We 34 | recommend configuring with --enable-static=no for faster 35 | compiles. 36 | 37 | NOTE: In some versions of Mac OS-X we have noticed a problem with configure 38 | whereby it fails to find fst.h. If this occurs, try configuring as follows: 39 | 40 | CPPFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./configure 41 | 42 | USAGE: 43 | Assuming you've installed under the default /usr/local, the library will be 44 | in /usr/local/lib, and the headers in /usr/local/include/sparrowhawk. 45 | 46 | To use in your own program, include and compile 47 | with '-I /usr/local/include'. The compiler must support C++11 (for g++ add the 48 | flag "-std=c++11"). Link against /usr/local/lib/libsparrowhawk.so and 49 | -ldl. Set your LD_LIBRARY_PATH (or equivalent) to contain /usr/local/lib. The 50 | linking is, by default, dynamic so that the Fst and Arc type DSO extensions 51 | can be used correctly if desired. 52 | 53 | DOCUMENTATION: 54 | See ./NEWS for updates since the last release. 55 | -------------------------------------------------------------------------------- /ar-lib: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Wrapper for Microsoft lib.exe 3 | 4 | me=ar-lib 5 | scriptversion=2012-03-01.08; # UTC 6 | 7 | # Copyright (C) 2010-2013 Free Software Foundation, Inc. 8 | # Written by Peter Rosin . 
9 | # 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program. If not, see . 22 | 23 | # As a special exception to the GNU General Public License, if you 24 | # distribute this file as part of a program that contains a 25 | # configuration script generated by Autoconf, you may include it under 26 | # the same distribution terms that you use for the rest of that program. 27 | 28 | # This file is maintained in Automake, please report 29 | # bugs to or send patches to 30 | # . 31 | 32 | 33 | # func_error message 34 | func_error () 35 | { 36 | echo "$me: $1" 1>&2 37 | exit 1 38 | } 39 | 40 | file_conv= 41 | 42 | # func_file_conv build_file 43 | # Convert a $build file to $host form and store it in $file 44 | # Currently only supports Windows hosts. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv in 65 | mingw) 66 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 67 | ;; 68 | cygwin) 69 | file=`cygpath -m "$file" || echo "$file"` 70 | ;; 71 | wine) 72 | file=`winepath -w "$file" || echo "$file"` 73 | ;; 74 | esac 75 | ;; 76 | esac 77 | } 78 | 79 | # func_at_file at_file operation archive 80 | # Iterate over all members in AT_FILE performing OPERATION on ARCHIVE 81 | # for each of them. 82 | # When interpreting the content of the @FILE, do NOT use func_file_conv, 83 | # since the user would need to supply preconverted file names to 84 | # binutils ar, at least for MinGW. 85 | func_at_file () 86 | { 87 | operation=$2 88 | archive=$3 89 | at_file_contents=`cat "$1"` 90 | eval set x "$at_file_contents" 91 | shift 92 | 93 | for member 94 | do 95 | $AR -NOLOGO $operation:"$member" "$archive" || exit $? 96 | done 97 | } 98 | 99 | case $1 in 100 | '') 101 | func_error "no command. Try '$0 --help' for more information." 102 | ;; 103 | -h | --h*) 104 | cat <. 8 | # 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 
18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | # This file is maintained in Automake, please report 28 | # bugs to or send patches to 29 | # . 30 | 31 | nl=' 32 | ' 33 | 34 | # We need space, tab and new line, in precisely that order. Quoting is 35 | # there to prevent tools from complaining about whitespace usage. 36 | IFS=" "" $nl" 37 | 38 | file_conv= 39 | 40 | # func_file_conv build_file lazy 41 | # Convert a $build file to $host form and store it in $file 42 | # Currently only supports Windows hosts. If the determined conversion 43 | # type is listed in (the comma separated) LAZY, no conversion will 44 | # take place. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv/,$2, in 65 | *,$file_conv,*) 66 | ;; 67 | mingw/*) 68 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 69 | ;; 70 | cygwin/*) 71 | file=`cygpath -m "$file" || echo "$file"` 72 | ;; 73 | wine/*) 74 | file=`winepath -w "$file" || echo "$file"` 75 | ;; 76 | esac 77 | ;; 78 | esac 79 | } 80 | 81 | # func_cl_dashL linkdir 82 | # Make cl look for libraries in LINKDIR 83 | func_cl_dashL () 84 | { 85 | func_file_conv "$1" 86 | if test -z "$lib_path"; then 87 | lib_path=$file 88 | else 89 | lib_path="$lib_path;$file" 90 | fi 91 | linker_opts="$linker_opts -LIBPATH:$file" 92 | } 93 | 94 | # func_cl_dashl library 95 | # Do a library search-path lookup for cl 96 | func_cl_dashl () 97 | { 98 | lib=$1 99 | found=no 100 | save_IFS=$IFS 101 | IFS=';' 102 | for dir in $lib_path $LIB 103 | do 104 | IFS=$save_IFS 105 | if $shared && test -f "$dir/$lib.dll.lib"; then 106 | found=yes 107 | lib=$dir/$lib.dll.lib 108 | break 109 | fi 110 | if test -f "$dir/$lib.lib"; then 111 | found=yes 112 | lib=$dir/$lib.lib 113 | break 114 | fi 115 | if test -f "$dir/lib$lib.a"; then 116 | found=yes 117 | lib=$dir/lib$lib.a 118 | break 119 | fi 120 | done 121 | IFS=$save_IFS 122 | 123 | if test "$found" != yes; then 124 | lib=$lib.lib 125 | fi 126 | } 127 | 128 | # func_cl_wrapper cl arg... 
129 | # Adjust compile command to suit cl 130 | func_cl_wrapper () 131 | { 132 | # Assume a capable shell 133 | lib_path= 134 | shared=: 135 | linker_opts= 136 | for arg 137 | do 138 | if test -n "$eat"; then 139 | eat= 140 | else 141 | case $1 in 142 | -o) 143 | # configure might choose to run compile as 'compile cc -o foo foo.c'. 144 | eat=1 145 | case $2 in 146 | *.o | *.[oO][bB][jJ]) 147 | func_file_conv "$2" 148 | set x "$@" -Fo"$file" 149 | shift 150 | ;; 151 | *) 152 | func_file_conv "$2" 153 | set x "$@" -Fe"$file" 154 | shift 155 | ;; 156 | esac 157 | ;; 158 | -I) 159 | eat=1 160 | func_file_conv "$2" mingw 161 | set x "$@" -I"$file" 162 | shift 163 | ;; 164 | -I*) 165 | func_file_conv "${1#-I}" mingw 166 | set x "$@" -I"$file" 167 | shift 168 | ;; 169 | -l) 170 | eat=1 171 | func_cl_dashl "$2" 172 | set x "$@" "$lib" 173 | shift 174 | ;; 175 | -l*) 176 | func_cl_dashl "${1#-l}" 177 | set x "$@" "$lib" 178 | shift 179 | ;; 180 | -L) 181 | eat=1 182 | func_cl_dashL "$2" 183 | ;; 184 | -L*) 185 | func_cl_dashL "${1#-L}" 186 | ;; 187 | -static) 188 | shared=false 189 | ;; 190 | -Wl,*) 191 | arg=${1#-Wl,} 192 | save_ifs="$IFS"; IFS=',' 193 | for flag in $arg; do 194 | IFS="$save_ifs" 195 | linker_opts="$linker_opts $flag" 196 | done 197 | IFS="$save_ifs" 198 | ;; 199 | -Xlinker) 200 | eat=1 201 | linker_opts="$linker_opts $2" 202 | ;; 203 | -*) 204 | set x "$@" "$1" 205 | shift 206 | ;; 207 | *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) 208 | func_file_conv "$1" 209 | set x "$@" -Tp"$file" 210 | shift 211 | ;; 212 | *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) 213 | func_file_conv "$1" mingw 214 | set x "$@" "$file" 215 | shift 216 | ;; 217 | *) 218 | set x "$@" "$1" 219 | shift 220 | ;; 221 | esac 222 | fi 223 | shift 224 | done 225 | if test -n "$linker_opts"; then 226 | linker_opts="-link$linker_opts" 227 | fi 228 | exec "$@" $linker_opts 229 | exit 1 230 | } 231 | 232 | eat= 233 | 234 | case $1 in 235 | '') 236 | echo "$0: No command. 
Try '$0 --help' for more information." 1>&2 237 | exit 1; 238 | ;; 239 | -h | --h*) 240 | cat <<\EOF 241 | Usage: compile [--help] [--version] PROGRAM [ARGS] 242 | 243 | Wrapper for compilers which do not understand '-c -o'. 244 | Remove '-o dest.o' from ARGS, run PROGRAM with the remaining 245 | arguments, and rename the output as expected. 246 | 247 | If you are trying to build a whole package this is not the 248 | right script to run: please start by reading the file 'INSTALL'. 249 | 250 | Report bugs to . 251 | EOF 252 | exit $? 253 | ;; 254 | -v | --v*) 255 | echo "compile $scriptversion" 256 | exit $? 257 | ;; 258 | cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) 259 | func_cl_wrapper "$@" # Doesn't return... 260 | ;; 261 | esac 262 | 263 | ofile= 264 | cfile= 265 | 266 | for arg 267 | do 268 | if test -n "$eat"; then 269 | eat= 270 | else 271 | case $1 in 272 | -o) 273 | # configure might choose to run compile as 'compile cc -o foo foo.c'. 274 | # So we strip '-o arg' only if arg is an object. 275 | eat=1 276 | case $2 in 277 | *.o | *.obj) 278 | ofile=$2 279 | ;; 280 | *) 281 | set x "$@" -o "$2" 282 | shift 283 | ;; 284 | esac 285 | ;; 286 | *.c) 287 | cfile=$1 288 | set x "$@" "$1" 289 | shift 290 | ;; 291 | *) 292 | set x "$@" "$1" 293 | shift 294 | ;; 295 | esac 296 | fi 297 | shift 298 | done 299 | 300 | if test -z "$ofile" || test -z "$cfile"; then 301 | # If no '-o' option was seen then we might have been invoked from a 302 | # pattern rule where we don't need one. That is ok -- this is a 303 | # normal compilation that the losing compiler can handle. If no 304 | # '.c' file was seen then we are probably linking. That is also 305 | # ok. 306 | exec "$@" 307 | fi 308 | 309 | # Name of file we expect compiler to create. 310 | cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` 311 | 312 | # Create the lock directory. 313 | # Note: use '[/\\:.-]' here to ensure that we don't use the same name 314 | # that we are using for the .o file. 
Also, base the name on the expected 315 | # object file name, since that is what matters with a parallel build. 316 | lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d 317 | while true; do 318 | if mkdir "$lockdir" >/dev/null 2>&1; then 319 | break 320 | fi 321 | sleep 1 322 | done 323 | # FIXME: race condition here if user kills between mkdir and trap. 324 | trap "rmdir '$lockdir'; exit 1" 1 2 15 325 | 326 | # Run the compile. 327 | "$@" 328 | ret=$? 329 | 330 | if test -f "$cofile"; then 331 | test "$cofile" = "$ofile" || mv "$cofile" "$ofile" 332 | elif test -f "${cofile}bj"; then 333 | test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" 334 | fi 335 | 336 | rmdir "$lockdir" 337 | exit $ret 338 | 339 | # Local Variables: 340 | # mode: shell-script 341 | # sh-indentation: 2 342 | # eval: (add-hook 'write-file-hooks 'time-stamp) 343 | # time-stamp-start: "scriptversion=" 344 | # time-stamp-format: "%:y-%02m-%02d.%02H" 345 | # time-stamp-time-zone: "UTC" 346 | # time-stamp-end: "; # UTC" 347 | # End: 348 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([Sparrowhawk], [1.0.0], [rws@google.com]) 2 | AM_INIT_AUTOMAKE([foreign nostdinc -Wall -Werror]) 3 | 4 | AM_PROG_AR 5 | 6 | CPPFLAGS="$CPPFLAGS -funsigned-char" 7 | CXXFLAGS="$CXXFLAGS -std=c++11" 8 | 9 | AC_PROG_CXX 10 | AC_DISABLE_STATIC 11 | AC_PROG_LIBTOOL 12 | 13 | AC_CONFIG_SRCDIR([src/lib/normalizer.cc]) 14 | AC_CONFIG_FILES([ 15 | Makefile 16 | src/Makefile 17 | src/include/Makefile 18 | src/lib/Makefile 19 | src/proto/Makefile 20 | src/bin/Makefile 21 | ]) 22 | 23 | AC_CONFIG_MACRO_DIR([m4]) 24 | AC_LANG([C++]) 25 | 26 | AC_CHECK_HEADER([fst/fst.h], [], 27 | [AC_MSG_ERROR([fst/fst.h header not found])] 28 | ) 29 | 30 | AC_CHECK_HEADER([fst/extensions/far/far.h], [], 31 | [AC_MSG_ERROR([fst/extensions/far/far.h header not found])] 32 | ) 33 | 34 | 
AC_CHECK_HEADER([fst/extensions/pdt/pdt.h], [], 35 | [AC_MSG_ERROR([fst/extensions/pdt/pdt.h header not found])] 36 | ) 37 | 38 | dnl TODO(rws): add mpdt at some point 39 | dnl AC_CHECK_HEADER([fst/extensions/mpdt/mpdt.h], [], 40 | dnl [AC_MSG_ERROR([fst/extensions/mpdt/mpdt.h header not found])] 41 | dnl ) 42 | 43 | AC_CHECK_HEADER([thrax/thrax.h], [], 44 | [AC_MSG_ERROR([thrax/thrax.h header not found])] 45 | ) 46 | 47 | AC_CHECK_HEADER([re2/re2.h], [], 48 | [AC_MSG_ERROR([re2/re2.h header not found])] 49 | ) 50 | 51 | AC_CHECK_HEADER([google/protobuf/message.h], [], 52 | [AC_MSG_ERROR([google/protobuf/message.h header not found])] 53 | ) 54 | 55 | AC_ARG_ENABLE( 56 | [bin], 57 | [AS_HELP_STRING( 58 | [--enable-bin], 59 | [enable command-line binaries])], 60 | [], 61 | [enable_bin=yes]) 62 | AM_CONDITIONAL([HAVE_BIN], [test "x$enable_bin" != xno]) 63 | 64 | AC_CHECK_PROG([PROTOC], [protoc], [protoc]) 65 | AS_IF([test "x${PROTOC}" == "x"], 66 | [AC_MSG_ERROR([ProtoBuf compiler "protoc" not found.])]) 67 | 68 | AC_OUTPUT 69 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/README: -------------------------------------------------------------------------------- 1 | This is a toy classifier for English whose sole purpose is to accompany the 2 | documentation on GitHub, illustrating how to write grammars for Sparrowhawk. 3 | 4 | It is assumed you have OpenGrm Thrax version 1.2.1 or higher installed. If you 5 | do not then the first thing to do is go to: 6 | 7 | http://openfst.cs.nyu.edu/twiki/bin/view/GRM/Thrax 8 | 9 | and follow the instructions there. 
10 | 11 | Paul Dixon's Windows port can be found at: 12 | 13 | http://openfst.cs.nyu.edu/twiki/bin/view/Contrib/OpenGrmThraxWin 14 | 15 | Once Thrax is installed you should go into the classify directory and do: 16 | 17 | thraxmakedep tokenize_and_classify.grm 18 | make 19 | 20 | Similarly in the verbalize directory: 21 | 22 | thraxmakedep verbalize.grm 23 | make 24 | 25 | And finally in the verbalize_serialization directory, do the same as above. 26 | 27 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/byte.far: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/byte.far -------------------------------------------------------------------------------- /documentation/grammars/en_toy/byte.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2005-2011 Google, Inc. 14 | 15 | # Standard constants for ASCII (byte) based strings. This mirrors the 16 | # functions provided by C/C++'s ctype.h library. 17 | 18 | # Note that [0] is missing. Matching the string-termination character is kinda weird. 
19 | export kBytes = Optimize[ 20 | "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" | 21 | "[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" | 22 | "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" | 23 | "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" | 24 | "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" | 25 | "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" | 26 | "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" | 27 | "[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" | 28 | "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" | 29 | "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" | 30 | "[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" | 31 | "[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" | 32 | "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" | 33 | "[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" | 34 | "[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" | 35 | "[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" | 36 | "[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" | 37 | "[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" | 38 | "[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" | 39 | "[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | 
"[198]" | "[199]" | "[200]" | 40 | "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" | 41 | "[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" | 42 | "[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" | 43 | "[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" | 44 | "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" | 45 | "[251]" | "[252]" | "[253]" | "[254]" | "[255]" 46 | ]; 47 | 48 | export kDigit = Optimize[ 49 | "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" 50 | ]; 51 | 52 | export kLower = Optimize[ 53 | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | 54 | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" 55 | ]; 56 | export kUpper = Optimize[ 57 | "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | 58 | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" 59 | ]; 60 | export kAlpha = Optimize[kLower | kUpper]; 61 | 62 | export kAlnum = Optimize[kDigit | kAlpha]; 63 | 64 | export kSpace = Optimize[ 65 | " " | "\t" | "\n" | "\r" 66 | ]; 67 | export kNotSpace = Optimize[kBytes - kSpace]; 68 | 69 | export kPunct = Optimize[ 70 | "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," | 71 | "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" 
| "@" | "\[" | "\\" | 72 | "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" 73 | ]; 74 | 75 | export kGraph = Optimize[kAlnum | kPunct]; 76 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/cardinal.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | 7 | # 300 -> cardinal { integer: "300"} 8 | 9 | cardinal = 10 | u.I["cardinal { "] 11 | u.I["integer: " q] 12 | d+ 13 | u.I[q] 14 | u.I[" }"] 15 | ; 16 | 17 | export CARDINAL = Optimize[cardinal]; 18 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/date.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | # The weight is to override the analysis of "Jan." etc. as a separate word (see 7 | # word.grm). 8 | month_file = StringFile['months.tsv'] <-20>; 9 | 10 | # Allows both abbreviations and full names: 11 | month = month_file | Project[month_file, 'output']; 12 | 13 | # Any number from 1-31: 14 | 15 | day = (d - "0") | "1" d | "2" d | "30" | "31"; 16 | 17 | # Any four digit number beginning with 1 or 2 18 | 19 | year = ("1" | "2") d{3}; 20 | 21 | # Maps input of the form 22 | # 23 | # Jan. 3,? 1980 24 | # 25 | # or 26 | # 27 | # 3 Jan.,? 1980 28 | # 29 | # into 30 | # 31 | # date { month: "January" day: "3" year: "1980" } 32 | # 33 | # Etc. 34 | 35 | mdy = 36 | u.I["date { "] 37 | u.I["month: " q] 38 | month 39 | u.D[" "+] 40 | u.I[q " day: " q] 41 | day 42 | u.D[","]? 43 | u.D[" "+] 44 | u.I[q " year: " q] 45 | year 46 | u.I[q] 47 | u.I[" preserve_order: true"] 48 | u.I[" }"]; 49 | 50 | dmy = 51 | u.I["date { "] 52 | u.I["day: " q] 53 | day 54 | u.D[" "+] 55 | u.I[q " month: " q] 56 | month 57 | u.D[","]? 
58 | u.D[" "+] 59 | u.I[q " year: " q] 60 | year 61 | u.I[q] 62 | u.I[" preserve_order: true"] 63 | u.I[" }"]; 64 | 65 | export DATE = Optimize[mdy | dmy]; 66 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/measure.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | measures = StringFile['measure.tsv']; 7 | 8 | # Maps input of the form 9 | # 10 | # 2.5kg 11 | # 12 | # into 13 | # 14 | # measure { decimal { integer_part: "2" fractional_part: "5" } units: "kilogram" } 15 | # 16 | # The fractional part is optional. 17 | 18 | measure = 19 | u.I["measure { "] 20 | u.I[" decimal { "] 21 | u.I["integer_part: " q] 22 | d+ 23 | u.I[q] 24 | (u.D["."] 25 | u.I[" fractional_part: " q] 26 | d+ 27 | u.I[q])? 28 | u.I["}"] 29 | u.I[" units: " q] 30 | u.D[" "*] # We allow spaces between the number and the measure. 31 | measures 32 | u.I[q] 33 | u.I["}"] 34 | ; 35 | 36 | export MEASURE = Optimize[measure]; 37 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/measure.tsv: -------------------------------------------------------------------------------- 1 | kg kilogram 2 | cm centimeter 3 | ° degree 4 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/money.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | currencies = StringFile['money.tsv']; 7 | 8 | # Maps input of the form 9 | # 10 | # $2.50 11 | # 12 | # into 13 | # 14 | # money { currency: "usd" amount { integer_part: "2" fractional_part: "50"} } } 15 | # 16 | # The fractional part is optional. 
17 | 18 | money = 19 | u.I["money { "] 20 | u.I["currency: " q] 21 | currencies 22 | u.I[q] 23 | u.I[" amount { "] 24 | u.I["integer_part: " q] 25 | d+ 26 | u.I[q] 27 | (u.D["."] 28 | u.I[" fractional_part: " q] 29 | d{2} 30 | u.I[q])? 31 | u.I["} }"] 32 | ; 33 | 34 | export MONEY = Optimize[money]; 35 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/money.tsv: -------------------------------------------------------------------------------- 1 | $ usd 2 | £ gbp 3 | € eur 4 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/months.tsv: -------------------------------------------------------------------------------- 1 | Jan. January 2 | Feb. February 3 | Mar. March 4 | Apr. April 5 | May 6 | Jun. June 7 | Jul. July 8 | Aug. August 9 | Sep. September 10 | Oct. October 11 | Nov. November 12 | Dec. December 13 | Jan January 14 | Feb February 15 | Mar March 16 | Apr April 17 | Jun June 18 | Jul July 19 | Aug August 20 | Sep September 21 | Oct October 22 | Nov November 23 | Dec December 24 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/punctuation.grm: -------------------------------------------------------------------------------- 1 | import '../util.grm' as u; 2 | 3 | # The trick of inserting this material and then replacing the "[PUNCT]" with the 4 | # relevant punctuation symbols works because of the way that Replace replaces 5 | # the arcs in the replacement fst. 6 | 7 | # phrase_break: true sets it so that a silence (sil) will be inserted. 
8 | 9 | medium = 10 | u.I["tokens { name: \"[PUNCT]\" pause_length: " 11 | "PAUSE_MEDIUM phrase_break: true type: PUNCT }"] 12 | ; 13 | 14 | long = 15 | u.I["tokens { name: \"[PUNCT]\" pause_length: " 16 | "PAUSE_LONG phrase_break: true type: PUNCT }"] 17 | ; 18 | 19 | medium_punct = "," | ";" | "(" | ")"; 20 | long_punct = "." | "!" | "?" | ":"; 21 | 22 | export PUNCT = Optimize[ 23 | Replace["[ROOT][PUNCT]", medium, medium_punct] 24 | | Replace["[ROOT][PUNCT]", long, long_punct]] 25 | ; 26 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/time.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | 7 | # Maps input of the form 8 | # 9 | # 3:30 10 | # 11 | # into 12 | # 13 | # time { hour: 3 minute: 30 } 14 | # 15 | # Hours and minutes fields are defined as ints, so here we don't use quotes. 16 | 17 | 18 | hour = 19 | "0"? 
d 20 | | "1" d 21 | | "2" ("0" | "1" | "2" | "3") 22 | ; 23 | 24 | minute = ("0" | "1" | "2" | "3" | "4" | "5") d; 25 | 26 | time = 27 | u.I["time { "] 28 | u.I["hours: "] 29 | hour 30 | u.D[":"] 31 | u.I[" minutes: "] 32 | minute 33 | u.I["}"] 34 | ; 35 | 36 | export TIME = Optimize[time]; 37 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/tokenize_and_classify.grm: -------------------------------------------------------------------------------- 1 | import 'cardinal.grm' as c; 2 | import 'date.grm' as d; 3 | import 'measure.grm' as M; 4 | import 'money.grm' as m; 5 | import 'punctuation.grm' as p; 6 | import 'time.grm' as t; 7 | import '../util.grm' as u; 8 | import 'word.grm' as w; 9 | 10 | types = c.CARDINAL | d.DATE | M.MEASURE | m.MONEY | w.WORD | t.TIME; 11 | 12 | token = u.I["tokens { "] types u.I[" }"]; 13 | 14 | token_plus_punct = (p.PUNCT u.I[" "])* token (u.I[" "] p.PUNCT)*; 15 | 16 | # Collection of all possible semiotic classes, including ordinary words. 17 | 18 | export TOKENIZE_AND_CLASSIFY = 19 | Optimize[token_plus_punct (" " token_plus_punct)*] 20 | ; 21 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/word.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # Make sure we don't treat periods at the end of abbreviations in 5 | # sentence_boundary_exceptions.txt as punctuation: 6 | exceptions = StringFile['../../sentence_boundary_exceptions.txt'] <-10>; 7 | 8 | # Markup for ordinary tokens that don't match anything else. 9 | # output is a "name:" token. Cost is to make this analysis more expensive. 
10 | 11 | word = u.I["name: " u.q] ((b.kNotSpace <1>)+ | exceptions) u.I[u.q]; 12 | 13 | export WORD = Optimize[word]; 14 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/util.far: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/util.far -------------------------------------------------------------------------------- /documentation/grammars/en_toy/util.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | 3 | func I[expr] { 4 | return "" : expr; 5 | } 6 | 7 | func D[expr] { 8 | return expr : ""; 9 | } 10 | 11 | export q = "\""; 12 | 13 | # Allows for arbitrary numbers of spaces on the verbalization side between 14 | # elements of the semiotic class. 15 | export s = " "*; 16 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/CARDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize/CARDINAL_NUMBER_NAME -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/ORDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize/ORDINAL_NUMBER_NAME -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/date.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import 
'../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # quotation mark 6 | q = u.q; 7 | 8 | # Used to allow for different numbers of spaces coming out of the serializer. 9 | s = u.s; 10 | 11 | month = b.kAlpha+; 12 | 13 | day = n.ORDINAL; 14 | 15 | d = b.kDigit; 16 | D = b.kDigit - "0"; 17 | 18 | two_digit = 19 | ((D d) @ n.CARDINAL) 20 | | ("0" : "oh ") (D @ n.CARDINAL) 21 | | ("00" : "hundred") 22 | ; 23 | 24 | # Years are not read as cardinals, generally: 25 | year = 26 | (("19" @ n.CARDINAL) u.I[" "] two_digit) 27 | | (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit)) 28 | | (("200" d) @ n.CARDINAL) 29 | ; 30 | 31 | # Remove these if they occur 32 | 33 | field = (b.kAlpha | "_")+; 34 | preserve_order = "preserve_order:" s "true" s; 35 | field_order = "field_order:" s q field q s; 36 | field_order_specs = (preserve_order | field_order)*; 37 | 38 | # Verbalization for MDY 39 | mdy = 40 | u.D["date" s "{" s] 41 | u.D["month:" s q] 42 | month 43 | u.I[" the "] 44 | u.D[q s] 45 | u.D["day:" s q] 46 | day 47 | u.I[" "] 48 | u.D[q s "year:" s q] 49 | year 50 | u.D[q s] 51 | u.D[field_order_specs]? 52 | u.D["}"] 53 | ; 54 | 55 | # Verbalization for DMY 56 | dmy = 57 | u.D["date" s "{" s] 58 | u.I["the "] 59 | u.D["day:" s q] 60 | day 61 | u.D[q s] 62 | u.I[" of "] 63 | u.D["month:" s q] 64 | month 65 | u.I[" "] 66 | u.D[q s "year:" s q] 67 | year 68 | u.D[q s] 69 | u.D[field_order_specs]? 70 | u.D["}"] 71 | ; 72 | 73 | export DATE = Optimize[mdy | dmy]; 74 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/measure.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # Except with exactly 1, the plural form is used, so we map to that form, and 6 | # then singularize below. 
7 | measures = 8 | ("centimeter" : "centimeters") 9 | | ("kilogram" : "kilograms") 10 | | ("degree" : "degrees") 11 | ; 12 | 13 | # quotation mark 14 | q = u.q; 15 | 16 | # Used to allow for different numbers of spaces coming out of the serializer. 17 | s = u.s; 18 | 19 | # Removes the markup (allowing for various spacing possibilities in the 20 | # serialization) and verbalizes the remainder. 21 | measure = 22 | u.D["measure" s "{" s] 23 | u.D[s "decimal" s "{" s] 24 | u.D["integer_part:" s q] 25 | n.CARDINAL 26 | u.D[q] 27 | (u.D[s "fractional_part:" s q] 28 | u.I[" point "] 29 | n.DIGITS 30 | u.D[q])? 31 | u.D[s "}" s] 32 | u.D[s "units:" s q] 33 | u.I[" "] 34 | measures 35 | u.D[q] 36 | u.D[s "}" s] 37 | ; 38 | 39 | sigstar = b.kBytes*; 40 | 41 | # Uses the singular form after exactly "one". 42 | singularize = CDRewrite[Invert[measures], "[BOS]one ", "", sigstar]; 43 | 44 | export MEASURE = Optimize[measure @ singularize]; 45 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/money.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | currencies = StringFile['money.tsv']; 13 | 14 | # Simple currency amounts such as: 15 | # money { amount { integer_part: "3"} currency: "usd" } 16 | 17 | sigstar = b.kBytes*; 18 | 19 | # Rules to insert "_maj" and "_min" at the end of the currency terms. 20 | ins_maj = CDRewrite[u.I["_maj"], "", "[EOS]", sigstar]; 21 | ins_min = CDRewrite[u.I["_min"], "", "[EOS]", sigstar]; 22 | 23 | # Removes the markup (allowing for various spacing possibilities in the 24 | # serialization) and verbalizes the remainder. 
25 | money_whole = 26 | u.D["money" s "{" s ""] 27 | u.D["amount" s "{" s ""] 28 | u.D["integer_part:" s q] 29 | n.CARDINAL 30 | u.D[q] 31 | u.D[s "}" s] 32 | u.D["currency: " q] 33 | u.I[" "] 34 | (ins_maj @ currencies) 35 | u.D[q] 36 | u.D[s "}"] 37 | ; 38 | 39 | del_zero = CDRewrite[u.D["0"], "[BOS]", "", sigstar]; 40 | 41 | # money { amount { integer_part: "3" fractional_part: "50"} currency: "usd" } 42 | # Here wa assume that the input has been reduplicated (see REDUP), and then on 43 | # the lefthand side we delete the minor currency and on the righthand side the 44 | # major currency. The reduplication is done IN CODE (see 45 | # RuleSystem::ApplyRules() in rule_system.cc). 46 | # 47 | # Removes the markup (allowing for various spacing possibilities in the 48 | # serialization) and verbalizes the remainder. 49 | 50 | money_all = 51 | u.D["money" s "{" s] 52 | u.D["amount" s "{" s] 53 | u.D["integer_part:" s q] 54 | n.CARDINAL 55 | u.D[q] 56 | u.D[" fractional_part:" s q] 57 | u.D[d+] 58 | u.D[q] 59 | u.D[s "}" s] 60 | u.D["currency:" s q] 61 | u.I[" "] 62 | (ins_maj @ currencies) 63 | u.D[q] 64 | u.D[s "}" s] 65 | u.I[" and "] 66 | u.D["money" s "{" s] 67 | u.D["amount" s "{" s] 68 | u.D["integer_part:" s q] 69 | u.D[d+] 70 | u.D[q] 71 | u.D[s "fractional_part:" s q] 72 | (del_zero @ n.CARDINAL) 73 | u.D[q] 74 | u.D[s "}" s] 75 | u.D["currency:" s q] 76 | u.I[" "] 77 | (ins_min @ currencies) 78 | u.D[q] 79 | u.D[s "}"] 80 | ; 81 | 82 | # Singularize after "one" (as in measures). 83 | 84 | singulars = 85 | ("dollars" : "dollar") 86 | | ("cents" : "cent") 87 | | ("pounds" : "pound") 88 | | ("pence" : "penny") 89 | | ("euros" : "euro") 90 | ; 91 | 92 | singularize = 93 | CDRewrite[singulars, "[BOS]one " | "and one ", "", sigstar] 94 | ; 95 | 96 | export MONEY = 97 | Optimize[ 98 | (money_whole | money_all) 99 | @ singularize] 100 | ; 101 | 102 | # keep one space 103 | ks = s : " "; 104 | 105 | # Pattern that matches the currency expression to be reduplicated. 
106 | redup = 107 | "money" ks "{" ks 108 | "amount" ks "{" ks 109 | "integer_part:" ks q 110 | d+ 111 | q 112 | ks "fractional_part:" ks q 113 | d+ 114 | q 115 | ks "}" ks 116 | "currency:" ks q 117 | # Match to ins_maj then project back to the 3-letter code. 118 | Project[ins_maj @ currencies, 'input'] 119 | q 120 | ks "}" 121 | ; 122 | 123 | export REDUP = redup; 124 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/money.tsv: -------------------------------------------------------------------------------- 1 | usd_maj dollars 2 | usd_min cents 3 | gbp_maj pounds 4 | gbp_min pence 5 | eur_maj euros 6 | eur_min cents 7 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/numbers.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # English cardinal and ordinal number names are FSTs that are trained using the 5 | # algorithm reported in: 6 | # 7 | # Kyle Gorman and Richard Sproat. "Minimally supervised models for number 8 | # normalization." Transactions of the Association for Computational Linguistics. 2016. 9 | cardinal = LoadFst['CARDINAL_NUMBER_NAME']; 10 | 11 | ordinal = LoadFst['ORDINAL_NUMBER_NAME']; 12 | 13 | d = b.kDigit; 14 | 15 | digit = d @ cardinal; 16 | 17 | export CARDINAL = cardinal; 18 | 19 | export ORDINAL = ordinal; 20 | 21 | export DIGITS = Optimize[digit (u.I[" "] digit)*]; 22 | 23 | q = u.q; 24 | 25 | # Used to allow for different numbers of spaces coming out of the serializer. 26 | s = u.s; 27 | 28 | # Removes the markup (allowing for various spacing possibilities in the 29 | # serialization) and verbalizes the remainder. 
30 | cardinal_markup = 31 | u.D["cardinal" s "{" s] 32 | u.D["integer:" s q] 33 | cardinal 34 | u.D[q] 35 | u.D[s "}"] 36 | ; 37 | 38 | export CARDINAL_MARKUP = Optimize[cardinal_markup]; 39 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/time.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | hour = (u.D["0"]? d | (d - "0") d) @ n.CARDINAL; 13 | 14 | sigstar = b.kBytes*; 15 | 16 | # Various renditions of minutes: 17 | # 18 | # 03 -> oh three 19 | # 13 -> thirteen 20 | # 00 -> o'clock 21 | # 22 | # Note that trailing 0 is removed so that 3:03 comes in as 23 | # 24 | # hours: 3 minutes: 3 25 | minute = 26 | ( (("" : "oh ") (d @ n.CARDINAL)) 27 | | (d d) @ n.CARDINAL) 28 | @ CDRewrite["oh zero" : "o'clock", "", "", sigstar]; 29 | 30 | # Removes the markup (allowing for various spacing possibilities in the 31 | # serialization) and verbalizes the remainder. 32 | time = 33 | u.D["time" s "{" s] 34 | u.D["hours:" s] 35 | hour 36 | u.D[s "minutes:" s] 37 | u.I[" "] 38 | minute 39 | u.D[s "}"] 40 | ; 41 | 42 | export TIME = Optimize[time]; 43 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/verbalize.grm: -------------------------------------------------------------------------------- 1 | import 'date.grm' as d; 2 | import 'measure.grm' as M; 3 | import 'money.grm' as m; 4 | import 'numbers.grm' as n; 5 | import 'time.grm' as t; 6 | import 'verbatim.grm' as v; 7 | 8 | # Combines all of the semiotic classes together. 
9 | 10 | export ALL = Optimize[ 11 | d.DATE | M.MEASURE | m.MONEY | n.CARDINAL_MARKUP | t.TIME | v.VERBATIM]; 12 | 13 | # Exports the REDUP from money. 14 | export REDUP = m.REDUP; 15 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/verbatim.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # A verbatim grammar is needed as a backoff since if for some reason 5 | # verbalization fails, it backs off to reading the string as the literal 6 | # sequence of characters. 7 | 8 | q = u.q; 9 | # Used to allow for different numbers of spaces coming out of the serializer. 10 | s = u.s; 11 | 12 | char = b.kNotSpace u.I["_character"]; 13 | 14 | chars = char (u.I[" "] char)*; 15 | 16 | # Removes the markup (allowing for various spacing possibilities in the 17 | # serialization) and verbalizes the remainder. 18 | export VERBATIM = Optimize[u.D["verbatim:" s q] chars u.D[q]]; 19 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME -------------------------------------------------------------------------------- 
/documentation/grammars/en_toy/verbalize_serialization/date.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # quotation mark 6 | q = u.q; 7 | 8 | # Used to allow for different numbers of spaces coming out of the serializer. 9 | s = u.s; 10 | 11 | month = b.kAlpha+; 12 | 13 | day = n.ORDINAL; 14 | 15 | d = b.kDigit; 16 | D = b.kDigit - "0"; 17 | 18 | two_digit = 19 | ((D d) @ n.CARDINAL) 20 | | ("0" : "oh ") (D @ n.CARDINAL) 21 | | ("00" : "hundred") 22 | ; 23 | 24 | # Years are not read as cardinals, generally: 25 | year = 26 | (("19" @ n.CARDINAL) u.I[" "] two_digit) 27 | | (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit)) 28 | | (("200" d) @ n.CARDINAL) 29 | ; 30 | 31 | # Remove these if they occur 32 | 33 | field = (b.kAlpha | "_")+; 34 | preserve_order = "preserve_order:true"; 35 | field_order = "field_order:" field; 36 | field_order_specs = (preserve_order | field_order)*; 37 | 38 | # Verbalization for MDY 39 | mdy = 40 | u.D["date"] 41 | u.D["|month:"] 42 | month 43 | u.I[" the "] 44 | u.D["|day:"] 45 | day 46 | u.I[" "] 47 | u.D["|year:"] 48 | year 49 | u.D[field_order_specs]? 50 | u.D["|"] 51 | ; 52 | 53 | # Verbalization for DMY 54 | dmy = 55 | u.D["date"] 56 | u.I["the "] 57 | u.D["|day:"] 58 | day 59 | u.I[" of "] 60 | u.D["|month:"] 61 | month 62 | u.D["|year:"] 63 | u.I[" "] 64 | year 65 | u.D[field_order_specs]? 66 | u.D["|"] 67 | ; 68 | 69 | export DATE = Optimize[mdy | dmy]; 70 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/measure.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # Except with exactly 1, the plural form is used, so we map to that form, and 6 | # then singularize below. 
7 | measures = 8 | ("centimeter" : "centimeters") 9 | | ("kilogram" : "kilograms") 10 | | ("degree" : "degrees") 11 | ; 12 | 13 | # quotation mark 14 | q = u.q; 15 | 16 | # Used to allow for different numbers of spaces coming out of the serializer. 17 | s = u.s; 18 | 19 | # Removes the markup (allowing for various spacing possibilities in the 20 | # serialization) and verbalizes the remainder. 21 | measure = 22 | u.D["measure"] 23 | u.D["|integer_part:"] 24 | n.CARDINAL 25 | (u.D["|fractional_part:"] 26 | u.I[" point "] 27 | n.DIGITS)? 28 | u.I[" "] 29 | u.D["|units:"] 30 | measures 31 | u.D["|"] 32 | ; 33 | 34 | sigstar = b.kBytes*; 35 | 36 | # Uses the singular form after exactly "one". 37 | singularize = CDRewrite[Invert[measures], "[BOS]one ", "", sigstar]; 38 | 39 | export MEASURE = Optimize[measure @ singularize]; 40 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/money.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | currencies = StringFile['money.tsv']; 13 | 14 | # Simple currency amounts such as: 15 | # money|integer_part:3|currency:usd| 16 | 17 | sigstar = b.kBytes*; 18 | 19 | # Rules to insert "_maj" and "_min" at the end of the currency terms. 20 | ins_maj = CDRewrite[u.I["_maj"], "", "[EOS]", sigstar]; 21 | ins_min = CDRewrite[u.I["_min"], "", "[EOS]", sigstar]; 22 | 23 | del_zero = CDRewrite[u.D["0"], "[BOS]", "", sigstar]; 24 | 25 | # money|integer_part:3|currency:usd|fractional_part:50|currency:usd| 26 | # Here wa assume that the input has been reduplicated (see REDUP), and then on 27 | # the lefthand side we delete the minor currency and on the righthand side the 28 | # major currency. 
The reduplication is done IN CODE (see 29 | # RuleSystem::ApplyRules() in rule_system.cc). 30 | # 31 | # Removes the markup (allowing for various spacing possibilities in the 32 | # serialization) and verbalizes the remainder. 33 | 34 | money = 35 | u.D["money"] 36 | u.D["|integer_part:"] 37 | n.CARDINAL 38 | u.D["|currency:"] 39 | u.I[" "] 40 | (ins_maj @ currencies) 41 | (u.I[" and "] 42 | u.D["|fractional_part:"] 43 | (del_zero @ n.CARDINAL) 44 | u.D["|currency:"] 45 | u.I[" "] 46 | (ins_min @ currencies))? 47 | u.D[s "|"] 48 | ; 49 | 50 | # Singularize after "one" (as in measures). 51 | 52 | singulars = 53 | ("dollars" : "dollar") 54 | | ("cents" : "cent") 55 | | ("pounds" : "pound") 56 | | ("pence" : "penny") 57 | | ("euros" : "euro") 58 | ; 59 | 60 | singularize = 61 | CDRewrite[singulars, "[BOS]one " | "and one ", "", sigstar] 62 | ; 63 | 64 | export MONEY = Optimize[money @ singularize]; 65 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/money.tsv: -------------------------------------------------------------------------------- 1 | usd_maj dollars 2 | usd_min cents 3 | gbp_maj pounds 4 | gbp_min pence 5 | eur_maj euros 6 | eur_min cents 7 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/numbers.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # English cardinal and ordinal number names are FSTs that are trained using the 5 | # algorithm reported in: 6 | # 7 | # Kyle Gorman and Richard Sproat. "Minimally supervised models for number 8 | # normalization." Transactions of the Association for Computational Linguistics. 2016. 
9 | cardinal = LoadFst['CARDINAL_NUMBER_NAME']; 10 | 11 | ordinal = LoadFst['ORDINAL_NUMBER_NAME']; 12 | 13 | d = b.kDigit; 14 | 15 | digit = d @ cardinal; 16 | 17 | export CARDINAL = cardinal; 18 | 19 | export ORDINAL = ordinal; 20 | 21 | export DIGITS = Optimize[digit (u.I[" "] digit)*]; 22 | 23 | q = u.q; 24 | 25 | # Used to allow for different numbers of spaces coming out of the serializer. 26 | s = u.s; 27 | 28 | # Removes the markup (allowing for various spacing possibilities in the 29 | # serialization) and verbalizes the remainder. 30 | cardinal_markup = 31 | u.D["cardinal|integer:"] 32 | cardinal 33 | u.D[s "|"] 34 | ; 35 | 36 | export CARDINAL_MARKUP = Optimize[cardinal_markup]; 37 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/time.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | hour = (u.D["0"]? d | (d - "0") d) @ n.CARDINAL; 13 | 14 | sigstar = b.kBytes*; 15 | 16 | # Various renditions of minutes: 17 | # 18 | # 03 -> oh three 19 | # 13 -> thirteen 20 | # 00 -> o'clock 21 | # 22 | # Note that trailing 0 is removed so that 3:03 comes in as 23 | # 24 | # hours: 3 minutes: 3 25 | minute = 26 | ( (("" : "oh ") (d @ n.CARDINAL)) 27 | | (d d) @ n.CARDINAL) 28 | @ CDRewrite["oh zero" : "o'clock", "", "", sigstar]; 29 | 30 | # Removes the markup (allowing for various spacing possibilities in the 31 | # serialization) and verbalizes the remainder. 
32 | time = 33 | u.D["time"] 34 | u.D["|hours:"] 35 | hour 36 | u.D["|minutes:"] 37 | u.I[" "] 38 | minute 39 | u.D["|"] 40 | ; 41 | 42 | export TIME = Optimize[time]; 43 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/verbalize.grm: -------------------------------------------------------------------------------- 1 | import 'date.grm' as d; 2 | import 'measure.grm' as M; 3 | import 'money.grm' as m; 4 | import 'numbers.grm' as n; 5 | import 'time.grm' as t; 6 | import 'verbatim.grm' as v; 7 | 8 | # Combines all of the semiotic classes together. 9 | 10 | export ALL = Optimize[ 11 | d.DATE | M.MEASURE | m.MONEY | n.CARDINAL_MARKUP | t.TIME | v.VERBATIM]; 12 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/verbatim.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # A verbatim grammar is needed as a backoff since if for some reason 5 | # verbalization fails, it backs off to reading the string as the literal 6 | # sequence of characters. 7 | 8 | q = u.q; 9 | # Used to allow for different numbers of spaces coming out of the serializer. 10 | s = u.s; 11 | 12 | char = b.kNotSpace u.I["_character"]; 13 | 14 | chars = char (u.I[" "] char)*; 15 | 16 | # Removes the markup (allowing for various spacing possibilities in the 17 | # serialization) and verbalizes the remainder. 18 | export VERBATIM = Optimize[u.D["verbatim|verbatim:" q?] chars u.D[q? "|"]]; 19 | -------------------------------------------------------------------------------- /documentation/grammars/sentence_boundary_exceptions.txt: -------------------------------------------------------------------------------- 1 | Mr. 2 | Dr. 3 | Mrs. 4 | St. 5 | Jan. 6 | Feb. 7 | Mar. 8 | Apr. 9 | Jun. 10 | Jul. 11 | Aug. 12 | Sep. 13 | Oct. 
14 | Nov. 15 | Dec. 16 | -------------------------------------------------------------------------------- /documentation/grammars/sparrowhawk_configuration.ascii_proto: -------------------------------------------------------------------------------- 1 | tokenizer_grammar: "tokenizer.ascii_proto" 2 | 3 | verbalizer_grammar: "verbalizer.ascii_proto" 4 | 5 | sentence_boundary_regexp: "[\\.:!\\?] " 6 | 7 | sentence_boundary_exceptions_file: "sentence_boundary_exceptions.txt" 8 | -------------------------------------------------------------------------------- /documentation/grammars/sparrowhawk_configuration_serialization.ascii_proto: -------------------------------------------------------------------------------- 1 | tokenizer_grammar: "tokenizer.ascii_proto" 2 | 3 | verbalizer_grammar: "verbalizer_serialization.ascii_proto" 4 | 5 | sentence_boundary_regexp: "[\\.:!\\?] " 6 | 7 | sentence_boundary_exceptions_file: "sentence_boundary_exceptions.txt" 8 | 9 | serialization_spec: "verbalizer_serialization_spec.ascii_proto" 10 | -------------------------------------------------------------------------------- /documentation/grammars/test.txt: -------------------------------------------------------------------------------- 1 | The train left at 3:30 from Penn Station on Jan. 3, 2010. Mr. Snookums 2 | was on the train carrying $40.25 (£30.60) of Belgian chocolate in a 3kg box that 3 | was 20cm wide. 
4 | -------------------------------------------------------------------------------- /documentation/grammars/tokenizer.ascii_proto: -------------------------------------------------------------------------------- 1 | grammar_file: "en_toy/classify/tokenize_and_classify.far" 2 | 3 | grammar_name: "TokenizerClassifier" 4 | 5 | rules { main: "TOKENIZE_AND_CLASSIFY" } 6 | -------------------------------------------------------------------------------- /documentation/grammars/verbalizer.ascii_proto: -------------------------------------------------------------------------------- 1 | grammar_file: "en_toy/verbalize/verbalize.far" 2 | 3 | grammar_name: "Verbalizer" 4 | 5 | rules { main: "ALL" redup: "REDUP" } 6 | -------------------------------------------------------------------------------- /documentation/grammars/verbalizer_serialization.ascii_proto: -------------------------------------------------------------------------------- 1 | grammar_file: "en_toy/verbalize_serialization/verbalize.far" 2 | 3 | grammar_name: "Verbalizer" 4 | 5 | rules { main: "ALL" } 6 | -------------------------------------------------------------------------------- /documentation/grammars/verbalizer_serialization_spec.ascii_proto: -------------------------------------------------------------------------------- 1 | class_spec { 2 | semiotic_class: "measure" 3 | style_spec { 4 | record_spec { 5 | field_path: "measure.decimal.integer_part" 6 | } 7 | record_spec { 8 | field_path: "measure.decimal.fractional_part" 9 | } 10 | record_spec { 11 | field_path: "measure.units" 12 | } 13 | required_fields: "measure.decimal.integer_part" 14 | } 15 | } 16 | class_spec { 17 | semiotic_class: "money" 18 | style_spec { 19 | record_spec { 20 | field_path: "money.amount.integer_part" 21 | suffix_spec { 22 | field_path: "money.currency" 23 | } 24 | } 25 | record_spec { 26 | field_path: "money.amount.fractional_part" 27 | suffix_spec { 28 | field_path: "money.currency" 29 | } 30 | } 31 | } 32 | } 33 | class_spec { 
34 | semiotic_class: "cardinal" 35 | style_spec { 36 | record_spec { 37 | field_path: "cardinal.integer" 38 | } 39 | } 40 | } 41 | class_spec { 42 | semiotic_class: "time" 43 | style_spec { 44 | record_spec { 45 | field_path: "time.hours" 46 | } 47 | record_spec { 48 | field_path: "time.minutes" 49 | } 50 | } 51 | } 52 | class_spec { 53 | semiotic_class: "date" 54 | style_spec { 55 | record_spec { 56 | field_path: "date.day" 57 | } 58 | record_spec { 59 | field_path: "date.month" 60 | } 61 | record_spec { 62 | field_path: "date.year" 63 | } 64 | } 65 | style_spec { 66 | record_spec { 67 | field_path: "date.month" 68 | } 69 | record_spec { 70 | field_path: "date.day" 71 | } 72 | record_spec { 73 | field_path: "date.year" 74 | } 75 | } 76 | } 77 | class_spec { 78 | semiotic_class: "verbatim" 79 | style_spec { 80 | record_spec { 81 | field_path: "verbatim" 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /m4/ltsugar.m4: -------------------------------------------------------------------------------- 1 | # ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc. 4 | # Written by Gary V. Vaughan, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # serial 6 ltsugar.m4 11 | 12 | # This is to help aclocal find these macros, as it can't see m4_define. 13 | AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])]) 14 | 15 | 16 | # lt_join(SEP, ARG1, [ARG2...]) 17 | # ----------------------------- 18 | # Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their 19 | # associated separator. 20 | # Needed until we can rely on m4_join from Autoconf 2.62, since all earlier 21 | # versions in m4sugar had bugs. 
22 | m4_define([lt_join], 23 | [m4_if([$#], [1], [], 24 | [$#], [2], [[$2]], 25 | [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])]) 26 | m4_define([_lt_join], 27 | [m4_if([$#$2], [2], [], 28 | [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])]) 29 | 30 | 31 | # lt_car(LIST) 32 | # lt_cdr(LIST) 33 | # ------------ 34 | # Manipulate m4 lists. 35 | # These macros are necessary as long as will still need to support 36 | # Autoconf-2.59 which quotes differently. 37 | m4_define([lt_car], [[$1]]) 38 | m4_define([lt_cdr], 39 | [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], 40 | [$#], 1, [], 41 | [m4_dquote(m4_shift($@))])]) 42 | m4_define([lt_unquote], $1) 43 | 44 | 45 | # lt_append(MACRO-NAME, STRING, [SEPARATOR]) 46 | # ------------------------------------------ 47 | # Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'. 48 | # Note that neither SEPARATOR nor STRING are expanded; they are appended 49 | # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). 50 | # No SEPARATOR is output if MACRO-NAME was previously undefined (different 51 | # than defined and empty). 52 | # 53 | # This macro is needed until we can rely on Autoconf 2.62, since earlier 54 | # versions of m4sugar mistakenly expanded SEPARATOR but not STRING. 55 | m4_define([lt_append], 56 | [m4_define([$1], 57 | m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])]) 58 | 59 | 60 | 61 | # lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...]) 62 | # ---------------------------------------------------------- 63 | # Produce a SEP delimited list of all paired combinations of elements of 64 | # PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list 65 | # has the form PREFIXmINFIXSUFFIXn. 66 | # Needed until we can rely on m4_combine added in Autoconf 2.62. 
67 | m4_define([lt_combine], 68 | [m4_if(m4_eval([$# > 3]), [1], 69 | [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl 70 | [[m4_foreach([_Lt_prefix], [$2], 71 | [m4_foreach([_Lt_suffix], 72 | ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[, 73 | [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])]) 74 | 75 | 76 | # lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ]) 77 | # ----------------------------------------------------------------------- 78 | # Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited 79 | # by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ. 80 | m4_define([lt_if_append_uniq], 81 | [m4_ifdef([$1], 82 | [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1], 83 | [lt_append([$1], [$2], [$3])$4], 84 | [$5])], 85 | [lt_append([$1], [$2], [$3])$4])]) 86 | 87 | 88 | # lt_dict_add(DICT, KEY, VALUE) 89 | # ----------------------------- 90 | m4_define([lt_dict_add], 91 | [m4_define([$1($2)], [$3])]) 92 | 93 | 94 | # lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE) 95 | # -------------------------------------------- 96 | m4_define([lt_dict_add_subkey], 97 | [m4_define([$1($2:$3)], [$4])]) 98 | 99 | 100 | # lt_dict_fetch(DICT, KEY, [SUBKEY]) 101 | # ---------------------------------- 102 | m4_define([lt_dict_fetch], 103 | [m4_ifval([$3], 104 | m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]), 105 | m4_ifdef([$1($2)], [m4_defn([$1($2)])]))]) 106 | 107 | 108 | # lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE]) 109 | # ----------------------------------------------------------------- 110 | m4_define([lt_if_dict_fetch], 111 | [m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4], 112 | [$5], 113 | [$6])]) 114 | 115 | 116 | # lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...]) 117 | # -------------------------------------------------------------- 118 | m4_define([lt_dict_filter], 119 | [m4_if([$5], [], [], 120 | [lt_join(m4_quote(m4_default([$4], [[, ]])), 
121 | lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]), 122 | [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl 123 | ]) 124 | -------------------------------------------------------------------------------- /m4/ltversion.m4: -------------------------------------------------------------------------------- 1 | # ltversion.m4 -- version numbers -*- Autoconf -*- 2 | # 3 | # Copyright (C) 2004 Free Software Foundation, Inc. 4 | # Written by Scott James Remnant, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # @configure_input@ 11 | 12 | # serial 3337 ltversion.m4 13 | # This file is part of GNU Libtool 14 | 15 | m4_define([LT_PACKAGE_VERSION], [2.4.2]) 16 | m4_define([LT_PACKAGE_REVISION], [1.3337]) 17 | 18 | AC_DEFUN([LTVERSION_VERSION], 19 | [macro_version='2.4.2' 20 | macro_revision='1.3337' 21 | _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?]) 22 | _LT_DECL(, macro_revision, 0) 23 | ]) 24 | -------------------------------------------------------------------------------- /m4/lt~obsolete.m4: -------------------------------------------------------------------------------- 1 | # lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc. 4 | # Written by Scott James Remnant, 2004. 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # serial 5 lt~obsolete.m4 11 | 12 | # These exist entirely to fool aclocal when bootstrapping libtool. 
13 | # 14 | # In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN) 15 | # which have later been changed to m4_define as they aren't part of the 16 | # exported API, or moved to Autoconf or Automake where they belong. 17 | # 18 | # The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN 19 | # in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us 20 | # using a macro with the same name in our local m4/libtool.m4 it'll 21 | # pull the old libtool.m4 in (it doesn't see our shiny new m4_define 22 | # and doesn't know about Autoconf macros at all.) 23 | # 24 | # So we provide this file, which has a silly filename so it's always 25 | # included after everything else. This provides aclocal with the 26 | # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything 27 | # because those macros already exist, or will be overwritten later. 28 | # We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 29 | # 30 | # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. 31 | # Yes, that means every name once taken will need to remain here until 32 | # we give up compatibility with versions before 1.7, at which point 33 | # we need to keep only those names which we still refer to. 34 | 35 | # This is to help aclocal find these macros, as it can't see m4_define. 
36 | AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])]) 37 | 38 | m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])]) 39 | m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])]) 40 | m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])]) 41 | m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])]) 42 | m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])]) 43 | m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])]) 44 | m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])]) 45 | m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])]) 46 | m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])]) 47 | m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])]) 48 | m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])]) 49 | m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])]) 50 | m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])]) 51 | m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])]) 52 | m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])]) 53 | m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])]) 54 | m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])]) 55 | m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])]) 56 | m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])]) 57 | m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])]) 58 | m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])]) 59 | m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])]) 60 | m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])]) 61 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])]) 62 | m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])]) 63 | 
m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])]) 64 | m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], [AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])]) 65 | m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])]) 66 | m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])]) 67 | m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])]) 68 | m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])]) 69 | m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])]) 70 | m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])]) 71 | m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])]) 72 | m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])]) 73 | m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])]) 74 | m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])]) 75 | m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])]) 76 | m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])]) 77 | m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])]) 78 | m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])]) 79 | m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])]) 80 | m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])]) 81 | m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])]) 82 | m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])]) 83 | m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])]) 84 | m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])]) 85 | m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])]) 86 | m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])]) 87 | m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])]) 88 | m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])]) 89 | 
m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])]) 90 | m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])]) 91 | m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])]) 92 | m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], [AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])]) 93 | m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])]) 94 | m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])]) 95 | m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])]) 96 | m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])]) 97 | m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])]) 98 | m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])]) 99 | -------------------------------------------------------------------------------- /missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common wrapper for a few potentially missing GNU programs. 3 | 4 | scriptversion=2013-10-28.13; # UTC 5 | 6 | # Copyright (C) 1996-2013 Free Software Foundation, Inc. 7 | # Originally written by Fran,cois Pinard , 1996. 8 | 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 
21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | if test $# -eq 0; then 28 | echo 1>&2 "Try '$0 --help' for more information" 29 | exit 1 30 | fi 31 | 32 | case $1 in 33 | 34 | --is-lightweight) 35 | # Used by our autoconf macros to check whether the available missing 36 | # script is modern enough. 37 | exit 0 38 | ;; 39 | 40 | --run) 41 | # Back-compat with the calling convention used by older automake. 42 | shift 43 | ;; 44 | 45 | -h|--h|--he|--hel|--help) 46 | echo "\ 47 | $0 [OPTION]... PROGRAM [ARGUMENT]... 48 | 49 | Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due 50 | to PROGRAM being missing or too old. 51 | 52 | Options: 53 | -h, --help display this help and exit 54 | -v, --version output version information and exit 55 | 56 | Supported PROGRAM values: 57 | aclocal autoconf autoheader autom4te automake makeinfo 58 | bison yacc flex lex help2man 59 | 60 | Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and 61 | 'g' are ignored when checking the name. 62 | 63 | Send bug reports to ." 64 | exit $? 65 | ;; 66 | 67 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 68 | echo "missing $scriptversion (GNU Automake)" 69 | exit $? 70 | ;; 71 | 72 | -*) 73 | echo 1>&2 "$0: unknown '$1' option" 74 | echo 1>&2 "Try '$0 --help' for more information" 75 | exit 1 76 | ;; 77 | 78 | esac 79 | 80 | # Run the given program, remember its exit status. 81 | "$@"; st=$? 82 | 83 | # If it succeeded, we are done. 84 | test $st -eq 0 && exit 0 85 | 86 | # Also exit now if we it failed (or wasn't found), and '--version' was 87 | # passed; such an option is passed most likely to detect whether the 88 | # program is present and works. 
89 | case $2 in --version|--help) exit $st;; esac 90 | 91 | # Exit code 63 means version mismatch. This often happens when the user 92 | # tries to use an ancient version of a tool on a file that requires a 93 | # minimum version. 94 | if test $st -eq 63; then 95 | msg="probably too old" 96 | elif test $st -eq 127; then 97 | # Program was missing. 98 | msg="missing on your system" 99 | else 100 | # Program was found and executed, but failed. Give up. 101 | exit $st 102 | fi 103 | 104 | perl_URL=http://www.perl.org/ 105 | flex_URL=http://flex.sourceforge.net/ 106 | gnu_software_URL=http://www.gnu.org/software 107 | 108 | program_details () 109 | { 110 | case $1 in 111 | aclocal|automake) 112 | echo "The '$1' program is part of the GNU Automake package:" 113 | echo "<$gnu_software_URL/automake>" 114 | echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" 115 | echo "<$gnu_software_URL/autoconf>" 116 | echo "<$gnu_software_URL/m4/>" 117 | echo "<$perl_URL>" 118 | ;; 119 | autoconf|autom4te|autoheader) 120 | echo "The '$1' program is part of the GNU Autoconf package:" 121 | echo "<$gnu_software_URL/autoconf/>" 122 | echo "It also requires GNU m4 and Perl in order to run:" 123 | echo "<$gnu_software_URL/m4/>" 124 | echo "<$perl_URL>" 125 | ;; 126 | esac 127 | } 128 | 129 | give_advice () 130 | { 131 | # Normalize program name to check for. 132 | normalized_program=`echo "$1" | sed ' 133 | s/^gnu-//; t 134 | s/^gnu//; t 135 | s/^g//; t'` 136 | 137 | printf '%s\n' "'$1' is $msg." 138 | 139 | configure_deps="'configure.ac' or m4 files included by 'configure.ac'" 140 | case $normalized_program in 141 | autoconf*) 142 | echo "You should only need it if you modified 'configure.ac'," 143 | echo "or m4 files included by it." 144 | program_details 'autoconf' 145 | ;; 146 | autoheader*) 147 | echo "You should only need it if you modified 'acconfig.h' or" 148 | echo "$configure_deps." 
149 | program_details 'autoheader' 150 | ;; 151 | automake*) 152 | echo "You should only need it if you modified 'Makefile.am' or" 153 | echo "$configure_deps." 154 | program_details 'automake' 155 | ;; 156 | aclocal*) 157 | echo "You should only need it if you modified 'acinclude.m4' or" 158 | echo "$configure_deps." 159 | program_details 'aclocal' 160 | ;; 161 | autom4te*) 162 | echo "You might have modified some maintainer files that require" 163 | echo "the 'autom4te' program to be rebuilt." 164 | program_details 'autom4te' 165 | ;; 166 | bison*|yacc*) 167 | echo "You should only need it if you modified a '.y' file." 168 | echo "You may want to install the GNU Bison package:" 169 | echo "<$gnu_software_URL/bison/>" 170 | ;; 171 | lex*|flex*) 172 | echo "You should only need it if you modified a '.l' file." 173 | echo "You may want to install the Fast Lexical Analyzer package:" 174 | echo "<$flex_URL>" 175 | ;; 176 | help2man*) 177 | echo "You should only need it if you modified a dependency" \ 178 | "of a man page." 179 | echo "You may want to install the GNU Help2man package:" 180 | echo "<$gnu_software_URL/help2man/>" 181 | ;; 182 | makeinfo*) 183 | echo "You should only need it if you modified a '.texi' file, or" 184 | echo "any other file indirectly affecting the aspect of the manual." 185 | echo "You might want to install the Texinfo package:" 186 | echo "<$gnu_software_URL/texinfo/>" 187 | echo "The spurious makeinfo call might also be the consequence of" 188 | echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" 189 | echo "want to install GNU make:" 190 | echo "<$gnu_software_URL/make/>" 191 | ;; 192 | *) 193 | echo "You might have modified some files without having the proper" 194 | echo "tools for further handling them. Check the 'README' file, it" 195 | echo "often tells you about the needed prerequisites for installing" 196 | echo "this package. 
You may also peek at any GNU archive site, in" 197 | echo "case some other package contains this missing '$1' program." 198 | ;; 199 | esac 200 | } 201 | 202 | give_advice "$1" | sed -e '1s/^/WARNING: /' \ 203 | -e '2,$s/^/ /' >&2 204 | 205 | # Propagate the correct exit status (expected to be 127 for a program 206 | # not found, 63 for a program that failed due to version mismatch). 207 | exit $st 208 | 209 | # Local variables: 210 | # eval: (add-hook 'write-file-hooks 'time-stamp) 211 | # time-stamp-start: "scriptversion=" 212 | # time-stamp-format: "%:y-%02m-%02d.%02H" 213 | # time-stamp-time-zone: "UTC" 214 | # time-stamp-end: "; # UTC" 215 | # End: 216 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = proto include lib bin 2 | 3 | 4 | -------------------------------------------------------------------------------- /src/bin/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_BIN 2 | bin_PROGRAMS = normalizer_main 3 | 4 | AM_CPPFLAGS = -I$(srcdir)/../include 5 | 6 | LDADD= ../lib/libsparrowhawk.la -L/usr/local/lib/fst -lthrax -lfstfar -lfst -lm -ldl -lprotobuf -lre2 7 | 8 | normalizer_main_SOURCES = normalizer_main.cc 9 | endif 10 | -------------------------------------------------------------------------------- /src/bin/normalizer_main.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 
3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Very simple stand-alone binary to run Sparrowhawk normalizer on a line of 15 | // text. 16 | // 17 | // It runs the sentence boundary detector on the input, and then normalizes each 18 | // sentence. 19 | // 20 | // As an example of use, build the test data here, and put them somewhere, such 21 | // as tmp/sparrowhawk_test 22 | // 23 | // Then copy the relevant fars and protos there, edit the protos and then run: 24 | // 25 | // blaze-bin/speech/tts/open_source/sparrowhawk/normalizer_main \ 26 | // --config tmp/sparrowhawk_test/sparrowhawk_configuration_af.ascii_proto 27 | // 28 | // Then input a few sentences on one line such as: 29 | // 30 | // Kameelperde het 'n kenmerkende voorkoms, met hul lang nekke en relatief \ 31 | // kort lywe. Hulle word 4,3 - 5,7m lank. Die bulle is effens langer as die \ 32 | // koeie. 
33 | 34 | #include 35 | #include 36 | #include 37 | using std::string; 38 | #include 39 | using std::vector; 40 | 41 | #include 42 | 43 | DEFINE_bool(multi_line_text, false, "Text is spread across multiple lines."); 44 | DEFINE_string(config, "", "Path to the configuration proto."); 45 | DEFINE_string(path_prefix, "./", "Optional path prefix if not relative."); 46 | 47 | void NormalizeInput(const string& input, 48 | speech::sparrowhawk::Normalizer *normalizer) { 49 | const std::vector sentences = normalizer->SentenceSplitter(input); 50 | for (const auto& sentence : sentences) { 51 | string output; 52 | normalizer->Normalize(sentence, &output); 53 | std::cout << output << std::endl; 54 | } 55 | } 56 | 57 | int main(int argc, char** argv) { 58 | using speech::sparrowhawk::Normalizer; 59 | std::set_new_handler(FailedNewHandler); 60 | SET_FLAGS(argv[0], &argc, &argv, true); 61 | std::unique_ptr normalizer; 62 | normalizer.reset(new Normalizer()); 63 | CHECK(normalizer->Setup(FLAGS_config, FLAGS_path_prefix)); 64 | string input; 65 | if (FLAGS_multi_line_text) { 66 | string line; 67 | while (std::getline(std::cin, line)) { 68 | if (!input.empty()) input += " "; 69 | input += line; 70 | } 71 | NormalizeInput(input, normalizer.get()); 72 | } else { 73 | while (std::getline(std::cin, input)) { 74 | NormalizeInput(input, normalizer.get()); 75 | } 76 | } 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /src/include/Makefile.am: -------------------------------------------------------------------------------- 1 | BUILT_SOURCES = $(srcdir)/sparrowhawk/items.pb.h $(srcdir)/sparrowhawk/links.pb.h \ 2 | $(srcdir)/sparrowhawk/rule_order.pb.h \ 3 | $(srcdir)/sparrowhawk/semiotic_classes.pb.h \ 4 | $(srcdir)/sparrowhawk/sparrowhawk_configuration.pb.h 5 | 6 | nobase_include_HEADERS = sparrowhawk/field_path.h \ 7 | sparrowhawk/io_utils.h \ 8 | sparrowhawk/logger.h \ 9 | sparrowhawk/normalizer.h \ 10 | sparrowhawk/numbers.h \ 11 
| sparrowhawk/protobuf_parser.h \ 12 | sparrowhawk/protobuf_serializer.h \ 13 | sparrowhawk/record_serializer.h \ 14 | sparrowhawk/regexp.h \ 15 | sparrowhawk/rule_system.h \ 16 | sparrowhawk/sentence_boundary.h \ 17 | sparrowhawk/spec_serializer.h \ 18 | sparrowhawk/string_utils.h \ 19 | sparrowhawk/style_serializer.h \ 20 | $(BUILT_SOURCES) 21 | 22 | sparrowhawk/items.pb.h: 23 | $(MAKE) -C $(srcdir)/../proto/ items.pb.h 24 | 25 | sparrowhawk/links.pb.h: 26 | $(MAKE) -C $(srcdir)/../proto/ links.pb.h 27 | 28 | sparrowhawk/rule_order.pb.h: 29 | $(MAKE) -C $(srcdir)/../proto/ rule_order.pb.h 30 | 31 | sparrowhawk/semiotic_classes.pb.h: 32 | $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.h 33 | 34 | sparrowhawk/serialization_spec.pb.h: 35 | $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.h 36 | 37 | sparrowhawk/sparrowhawk_configuration.pb.h: 38 | $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.h 39 | 40 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/field_path.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Utility to access specific subfields within a protocol buffer. FieldPath 15 | // objects make subfields available via Follow(). 
16 | // 17 | 18 | #ifndef SPARROWHAWK_FIELD_PATH_H_ 19 | #define SPARROWHAWK_FIELD_PATH_H_ 20 | 21 | #include 22 | #include 23 | using std::string; 24 | #include 25 | using std::vector; 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | namespace speech { 32 | namespace sparrowhawk { 33 | 34 | class FieldPath { 35 | public: 36 | // Creates and returns a FieldPath using a descriptor for the type of 37 | // messages we intend to Follow(). 38 | // Returns a null value if the input pointer is null. 39 | static std::unique_ptr Create(const google::protobuf::Descriptor *root_type); 40 | 41 | // Replaces this field_path with input path_string of type: 42 | // (message_name.)*scalar_field_name 43 | // Returns false if an error occurs with either the format of the string or 44 | // with mismatches of type (e.g. a subfield of an integer) or label (i.e. an 45 | // index is supplied when the field is not repeated.) 46 | bool Parse(const string& path_string); 47 | 48 | // Clear all fields from path. 49 | void Clear(); 50 | 51 | inline const google::protobuf::Descriptor *GetRootType() const { return root_type_; } 52 | 53 | // Number of fields on this path. Does not count the root as a field. 54 | inline int GetLength() const { return path_.size(); } 55 | 56 | // True if GetLength() == 0. 57 | inline bool IsEmpty() const { return GetLength() == 0; } 58 | 59 | // Follows the path starting from the given base message. *parent is filled 60 | // in with the immediate parent of the field at the end of the path and *field 61 | // is filled in with the terminal field's descriptor. 62 | // You can then use reflection to query the field value. 63 | // 64 | // Returns false only if the base message is incorrect (the only error that 65 | // can't be detected at parsing time); in this case *parent and *field are 66 | // unchanged. 
67 | bool Follow(const google::protobuf::Message& base, const google::protobuf::Message **parent, 68 | const google::protobuf::FieldDescriptor **field) const; 69 | 70 | private: 71 | // Only used by the factory function Create. 72 | explicit FieldPath(const google::protobuf::Descriptor *root_type) 73 | : root_type_(root_type) {} 74 | 75 | // Parse intermediate message fields from input path. The parent is initially 76 | // root_type_ and is finally set to the penultimate field's descriptor. 77 | bool TraverseIntermediateFields(std::vector path, 78 | const google::protobuf::Descriptor **parent); 79 | 80 | // Parse terminal field "field" with given parent descriptor into path_. 81 | bool ParseTerminalField(const string &terminal_field_name, 82 | const google::protobuf::Descriptor *parent); 83 | 84 | std::vector path_; 85 | const google::protobuf::Descriptor *root_type_; 86 | }; 87 | 88 | } // namespace sparrowhawk 89 | } // namespace speech 90 | 91 | #endif // SPARROWHAWK_FIELD_PATH_H_ 92 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/io_utils.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google functionality for I/O. 
15 | #ifndef SPARROWHAWK_IO_UTILS_H_ 16 | #define SPARROWHAWK_IO_UTILS_H_ 17 | 18 | #include 19 | using std::string; 20 | 21 | #include 22 | namespace speech { 23 | namespace sparrowhawk { 24 | 25 | class IOStream { 26 | public: 27 | static string LoadFileToString(const string &filename); 28 | }; 29 | 30 | } // namespace sparrowhawk 31 | } // namespace speech 32 | 33 | #endif // SPARROWHAWK_IO_UTILS_H_ 34 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/links.pb.h: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: links.proto 3 | 4 | #ifndef PROTOBUF_links_2eproto__INCLUDED 5 | #define PROTOBUF_links_2eproto__INCLUDED 6 | 7 | #include 8 | 9 | #include 10 | 11 | #if GOOGLE_PROTOBUF_VERSION < 2005000 12 | #error This file was generated by a newer version of protoc which is 13 | #error incompatible with your Protocol Buffer headers. Please update 14 | #error your headers. 15 | #endif 16 | #if 2005000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION 17 | #error This file was generated by an older version of protoc which is 18 | #error incompatible with your Protocol Buffer headers. Please 19 | #error regenerate this file with a newer version of protoc. 20 | #endif 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | // @@protoc_insertion_point(includes) 28 | 29 | namespace speech { 30 | namespace sparrowhawk { 31 | 32 | // Internal implementation detail -- do not call these. 
33 | void protobuf_AddDesc_links_2eproto(); 34 | void protobuf_AssignDesc_links_2eproto(); 35 | void protobuf_ShutdownFile_links_2eproto(); 36 | 37 | class Links; 38 | 39 | // =================================================================== 40 | 41 | class Links : public ::google::protobuf::Message { 42 | public: 43 | Links(); 44 | virtual ~Links(); 45 | 46 | Links(const Links& from); 47 | 48 | inline Links& operator=(const Links& from) { 49 | CopyFrom(from); 50 | return *this; 51 | } 52 | 53 | inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const { 54 | return _unknown_fields_; 55 | } 56 | 57 | inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() { 58 | return &_unknown_fields_; 59 | } 60 | 61 | static const ::google::protobuf::Descriptor* descriptor(); 62 | static const Links& default_instance(); 63 | 64 | void Swap(Links* other); 65 | 66 | // implements Message ---------------------------------------------- 67 | 68 | Links* New() const; 69 | void CopyFrom(const ::google::protobuf::Message& from); 70 | void MergeFrom(const ::google::protobuf::Message& from); 71 | void CopyFrom(const Links& from); 72 | void MergeFrom(const Links& from); 73 | void Clear(); 74 | bool IsInitialized() const; 75 | 76 | int ByteSize() const; 77 | bool MergePartialFromCodedStream( 78 | ::google::protobuf::io::CodedInputStream* input); 79 | void SerializeWithCachedSizes( 80 | ::google::protobuf::io::CodedOutputStream* output) const; 81 | ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const; 82 | int GetCachedSize() const { return _cached_size_; } 83 | private: 84 | void SharedCtor(); 85 | void SharedDtor(); 86 | void SetCachedSize(int size) const; 87 | public: 88 | 89 | ::google::protobuf::Metadata GetMetadata() const; 90 | 91 | // nested types ---------------------------------------------------- 92 | 93 | // accessors ------------------------------------------------------- 94 | 95 | // optional int32 
own_index = 1; 96 | inline bool has_own_index() const; 97 | inline void clear_own_index(); 98 | static const int kOwnIndexFieldNumber = 1; 99 | inline ::google::protobuf::int32 own_index() const; 100 | inline void set_own_index(::google::protobuf::int32 value); 101 | 102 | // optional int32 parent = 2; 103 | inline bool has_parent() const; 104 | inline void clear_parent(); 105 | static const int kParentFieldNumber = 2; 106 | inline ::google::protobuf::int32 parent() const; 107 | inline void set_parent(::google::protobuf::int32 value); 108 | 109 | // optional int32 first_child = 3; 110 | inline bool has_first_child() const; 111 | inline void clear_first_child(); 112 | static const int kFirstChildFieldNumber = 3; 113 | inline ::google::protobuf::int32 first_child() const; 114 | inline void set_first_child(::google::protobuf::int32 value); 115 | 116 | // optional int32 last_child = 4; 117 | inline bool has_last_child() const; 118 | inline void clear_last_child(); 119 | static const int kLastChildFieldNumber = 4; 120 | inline ::google::protobuf::int32 last_child() const; 121 | inline void set_last_child(::google::protobuf::int32 value); 122 | 123 | // @@protoc_insertion_point(class_scope:speech.sparrowhawk.Links) 124 | private: 125 | inline void set_has_own_index(); 126 | inline void clear_has_own_index(); 127 | inline void set_has_parent(); 128 | inline void clear_has_parent(); 129 | inline void set_has_first_child(); 130 | inline void clear_has_first_child(); 131 | inline void set_has_last_child(); 132 | inline void clear_has_last_child(); 133 | 134 | ::google::protobuf::UnknownFieldSet _unknown_fields_; 135 | 136 | ::google::protobuf::int32 own_index_; 137 | ::google::protobuf::int32 parent_; 138 | ::google::protobuf::int32 first_child_; 139 | ::google::protobuf::int32 last_child_; 140 | 141 | mutable int _cached_size_; 142 | ::google::protobuf::uint32 _has_bits_[(4 + 31) / 32]; 143 | 144 | friend void protobuf_AddDesc_links_2eproto(); 145 | friend void 
protobuf_AssignDesc_links_2eproto(); 146 | friend void protobuf_ShutdownFile_links_2eproto(); 147 | 148 | void InitAsDefaultInstance(); 149 | static Links* default_instance_; 150 | }; 151 | // =================================================================== 152 | 153 | 154 | // =================================================================== 155 | 156 | // Links 157 | 158 | // optional int32 own_index = 1; 159 | inline bool Links::has_own_index() const { 160 | return (_has_bits_[0] & 0x00000001u) != 0; 161 | } 162 | inline void Links::set_has_own_index() { 163 | _has_bits_[0] |= 0x00000001u; 164 | } 165 | inline void Links::clear_has_own_index() { 166 | _has_bits_[0] &= ~0x00000001u; 167 | } 168 | inline void Links::clear_own_index() { 169 | own_index_ = 0; 170 | clear_has_own_index(); 171 | } 172 | inline ::google::protobuf::int32 Links::own_index() const { 173 | return own_index_; 174 | } 175 | inline void Links::set_own_index(::google::protobuf::int32 value) { 176 | set_has_own_index(); 177 | own_index_ = value; 178 | } 179 | 180 | // optional int32 parent = 2; 181 | inline bool Links::has_parent() const { 182 | return (_has_bits_[0] & 0x00000002u) != 0; 183 | } 184 | inline void Links::set_has_parent() { 185 | _has_bits_[0] |= 0x00000002u; 186 | } 187 | inline void Links::clear_has_parent() { 188 | _has_bits_[0] &= ~0x00000002u; 189 | } 190 | inline void Links::clear_parent() { 191 | parent_ = 0; 192 | clear_has_parent(); 193 | } 194 | inline ::google::protobuf::int32 Links::parent() const { 195 | return parent_; 196 | } 197 | inline void Links::set_parent(::google::protobuf::int32 value) { 198 | set_has_parent(); 199 | parent_ = value; 200 | } 201 | 202 | // optional int32 first_child = 3; 203 | inline bool Links::has_first_child() const { 204 | return (_has_bits_[0] & 0x00000004u) != 0; 205 | } 206 | inline void Links::set_has_first_child() { 207 | _has_bits_[0] |= 0x00000004u; 208 | } 209 | inline void Links::clear_has_first_child() { 210 | 
_has_bits_[0] &= ~0x00000004u; 211 | } 212 | inline void Links::clear_first_child() { 213 | first_child_ = 0; 214 | clear_has_first_child(); 215 | } 216 | inline ::google::protobuf::int32 Links::first_child() const { 217 | return first_child_; 218 | } 219 | inline void Links::set_first_child(::google::protobuf::int32 value) { 220 | set_has_first_child(); 221 | first_child_ = value; 222 | } 223 | 224 | // optional int32 last_child = 4; 225 | inline bool Links::has_last_child() const { 226 | return (_has_bits_[0] & 0x00000008u) != 0; 227 | } 228 | inline void Links::set_has_last_child() { 229 | _has_bits_[0] |= 0x00000008u; 230 | } 231 | inline void Links::clear_has_last_child() { 232 | _has_bits_[0] &= ~0x00000008u; 233 | } 234 | inline void Links::clear_last_child() { 235 | last_child_ = 0; 236 | clear_has_last_child(); 237 | } 238 | inline ::google::protobuf::int32 Links::last_child() const { 239 | return last_child_; 240 | } 241 | inline void Links::set_last_child(::google::protobuf::int32 value) { 242 | set_has_last_child(); 243 | last_child_ = value; 244 | } 245 | 246 | 247 | // @@protoc_insertion_point(namespace_scope) 248 | 249 | } // namespace sparrowhawk 250 | } // namespace speech 251 | 252 | #ifndef SWIG 253 | namespace google { 254 | namespace protobuf { 255 | 256 | 257 | } // namespace google 258 | } // namespace protobuf 259 | #endif // SWIG 260 | 261 | // @@protoc_insertion_point(global_scope) 262 | 263 | #endif // PROTOBUF_links_2eproto__INCLUDED 264 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/logger.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 
3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google functionality for logging. 15 | #ifndef SPARROWHAWK_LOGGER_H_ 16 | #define SPARROWHAWK_LOGGER_H_ 17 | 18 | // TODO(rws): Write a more respectable logging system or link to some 19 | // open-source substitute. 20 | 21 | #include 22 | namespace speech { 23 | namespace sparrowhawk { 24 | 25 | 26 | } // namespace sparrowhawk 27 | } // namespace speech 28 | 29 | 30 | #define LoggerFormat(format) \ 31 | string(string("[%s:%s:%d] ") + format).c_str() 32 | 33 | #define LoggerMessage(type, format, ...) \ 34 | fprintf(stderr, \ 35 | LoggerFormat(format), \ 36 | type, \ 37 | __FILE__, \ 38 | __LINE__, \ 39 | ##__VA_ARGS__) 40 | 41 | #define LoggerDebug(format, ...) LoggerMessage("DEBUG", format, ##__VA_ARGS__) 42 | 43 | #define LoggerError(format, ...) LoggerMessage("ERROR", format, ##__VA_ARGS__) 44 | 45 | #define LoggerFatal(format, ...) { \ 46 | LoggerMessage("FATAL", format, ##__VA_ARGS__); \ 47 | exit(1); } \ 48 | 49 | #define LoggerInfo(format, ...) LoggerMessage("INFO", format, ##__VA_ARGS__) 50 | 51 | #define LoggerWarn(format, ...) 
LoggerMessage("WARNING", format, ##__VA_ARGS__) 52 | 53 | 54 | #endif // SPARROWHAWK_LOGGER_H_ 55 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/normalizer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // The normalizer is the main part of Sparrowhawk. Loosely speaking it follows 15 | // the discussion of the (Google-internal) Kestrel system as described in 16 | // 17 | // Ebden, Peter and Sproat, Richard. 2015. The Kestrel TTS text normalization 18 | // system. Natural Language Engineering, Issue 03, pp 333-353. 19 | // 20 | // After sentence segmentation (sentence_boundary.h), the individual sentences 21 | // are first tokenized with each token being classified, and then passed to the 22 | // normalizer. The system can output as an unannotated string of words, and 23 | // richer annotation with links between input tokens, their input string 24 | // positions, and the output words is also available. 
25 | 26 | #ifndef SPARROWHAWK_NORMALIZER_H_ 27 | #define SPARROWHAWK_NORMALIZER_H_ 28 | 29 | #include 30 | using std::string; 31 | #include 32 | using std::vector; 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | namespace speech { 42 | namespace sparrowhawk { 43 | 44 | class Normalizer { 45 | public: 46 | Normalizer(); 47 | 48 | ~Normalizer(); 49 | 50 | // The functions definitions have been split across two files, normalizer.cc 51 | // and normalizer_utils.cc, just to keep things a littler tidier. Below we 52 | // indicate where each function is found. 53 | 54 | // normalizer.cc 55 | // Method to load and set data for each derived method 56 | bool Setup(const string &configuration_proto, const string &pathname_prefix); 57 | 58 | // normalizer.cc 59 | // Interface to the normalization system for callers that want to be agnostic 60 | // about utterances. 61 | bool Normalize(const string &input, string *output) const; 62 | 63 | // normalizer.cc 64 | // Interface to the normalization system for callers that want to be agnostic 65 | // about utterances. Shows the token/word alignment. 66 | bool NormalizeAndShowLinks(const string &input, string *output) const; 67 | 68 | // normalizer_utils.cc 69 | // Helper for linearizing words from an utterance into a string 70 | string LinearizeWords(Utterance *utt) const; 71 | 72 | // normalizer_utils.cc 73 | // Helper for showing the indices of all tokens, words and their alignment 74 | // links. 75 | string ShowLinks(Utterance *utt) const; 76 | 77 | // normalizer.cc 78 | // Preprocessor to use the sentence splitter to break up text into 79 | // sentences. An application would normally call this first, and then 80 | // normalize each of the resulting sentences. 81 | std::vector SentenceSplitter(const string &input) const; 82 | 83 | private: 84 | // normalizer.cc 85 | // Internal interface to normalization. 
86 | bool Normalize(Utterance *utt, const string &input) const; 87 | 88 | // normalizer_utils.cc 89 | // As in Kestrel, adds a phrase and silence. 90 | // TODO(rws): Possibly remove this since it is actually not being used. 91 | void AddPhraseToUtt(Utterance *utt) const { AddPhraseToUtt(utt, false); } 92 | 93 | // normalizer_utils.cc 94 | void AddPhraseToUtt(Utterance *utt, bool addword) const; 95 | 96 | // normalizer_utils.cc 97 | // Adds a single word to the end of the Word stream 98 | Word* AddWord(Utterance *utt, Token *token, 99 | const string &spelling) const; 100 | 101 | // normalizer_utils.cc 102 | // Function to add the words in the string 'name' onto the 103 | // end of the Word stream. 104 | Word* AddWords(Utterance *utt, Token *token, 105 | const string &name) const; 106 | 107 | // Finds the index of the provided token. 108 | int TokenIndex(Utterance *utt, Token *token) const; 109 | // normalizer_utils.cc 110 | // As with Peter's comment in 111 | // speech/patts2/modules/kestrel/verbalize_general.cc, clear out all the mucky 112 | // fields that we don't want verbalization to see. 113 | void CleanFields(Token *markup) const; 114 | 115 | // normalizer_utils.cc 116 | // Returns the substring of the input between left and right 117 | string InputSubstring(int left, int right) const; 118 | 119 | // normalizer.cc 120 | // Performs tokenization and classification on the input utterance, the first 121 | // step of normalization 122 | bool TokenizeAndClassifyUtt(Utterance *utt, const string &input) const; 123 | 124 | // normalizer_utils.cc 125 | // Serializes the contents of a Token to a string 126 | string ToString(const Token &markup) const; 127 | 128 | // normalizer.cc 129 | // Verbalizes semiotic classes, defaulting to verbatim verbalization for 130 | // something that is marked as a semiotic class but for which the 131 | // verbalization grammar fails. 
132 | bool VerbalizeSemioticClass(const Token &markup, string *words) const; 133 | 134 | // normalizer.cc 135 | // Performs verbalization on the input utterance, the second step of 136 | // normalization 137 | bool VerbalizeUtt(Utterance *utt) const; 138 | 139 | string input_; 140 | std::unique_ptr tokenizer_classifier_rules_; 141 | std::unique_ptr verbalizer_rules_; 142 | std::unique_ptr sentence_boundary_; 143 | std::unique_ptr spec_serializer_; 144 | std::set sentence_boundary_exceptions_; 145 | 146 | DISALLOW_COPY_AND_ASSIGN(Normalizer); 147 | }; 148 | 149 | } // namespace sparrowhawk 150 | } // namespace speech 151 | 152 | #endif // SPARROWHAWK_NORMALIZER_H_ 153 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/numbers.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | // Various utilities to replace Google functionality for safe_strtoX 15 | #ifndef SPARROWHAWK_NUMBERS_H_ 16 | #define SPARROWHAWK_NUMBERS_H_ 17 | 18 | #include 19 | #include 20 | using std::string; 21 | 22 | #include 23 | namespace speech { 24 | namespace sparrowhawk { 25 | 26 | typedef int32_t int32; 27 | typedef int64_t int64; 28 | 29 | bool safe_strtof(const string &value, float *output); 30 | 31 | bool safe_strtod(const string &value, double *output); 32 | 33 | bool safe_strto32(const string &value, int32 *output); 34 | 35 | bool safe_strto64(const string &value, int64 *output); 36 | 37 | } // namespace sparrowhawk 38 | } // namespace speech 39 | 40 | #endif // SPARROWHAWK_NUMBERS_H_ 41 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/protobuf_parser.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // This is a basic parser for reading protobufs directly from 15 | // FSTs. The main advantage this offers for the moment is the 16 | // ability to track token start/end points, but later can be 17 | // extended to other types and may ultimately be portable to 18 | // Android. 19 | // 20 | // This class is not thread safe since it needs to store internal 21 | // parse state. 
The expectation is to create temporary local instances 22 | // of it rather than persisting a single shared instance. 23 | 24 | #ifndef SPARROWHAWK_PROTOBUF_PARSER_H_ 25 | #define SPARROWHAWK_PROTOBUF_PARSER_H_ 26 | 27 | #include 28 | using std::string; 29 | #include 30 | using std::vector; 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | namespace speech { 38 | namespace sparrowhawk { 39 | 40 | using thrax::GrmManager; 41 | class Utterance; 42 | class Token; 43 | 44 | class ProtobufParser { 45 | public: 46 | typedef GrmManager::Transducer Transducer; 47 | 48 | explicit ProtobufParser(const Transducer *fst); 49 | ~ProtobufParser(); 50 | 51 | // Parses tokens from the member FST into the Token stream of the 52 | // utterance. Note that, as the name suggests, it *cannot* parse 53 | // other streams such as Word, Specification, etc. 54 | // This assumes that the FST has a unique path through it 55 | // (ie. has been created via ShortestPath()) 56 | bool ParseTokensFromFST(Utterance *utt, 57 | bool set_semiotic_class = true, 58 | bool fix_lookahead = false); 59 | 60 | // Parses the given message from the member FST. 61 | // Message must have been registered with ProtobufField for this 62 | // to succeed. 63 | // This assumes that the FST has a unique path through it 64 | // (ie. has been created via ShortestPath()) 65 | bool ParseMessageFromFST(google::protobuf::Message *message); 66 | 67 | protected: 68 | typedef fst::StateIterator StateIterator; 69 | typedef fst::ArcIterator ArcIterator; 70 | typedef Transducer::Arc::Label Label; 71 | typedef Transducer::StateId StateId; 72 | 73 | // Parses a single message from the FST. The message name and opening brace 74 | // have already been consumed; this goes until the closing brace. 75 | // If eof_allowed is true then it's not a failure to reach the end of the FST 76 | // before finding a closing brace. 
77 | bool ParseMessage(bool eof_allowed, google::protobuf::Message *message); 78 | 79 | // Parses a single field value from the FST. The field name has already been 80 | // consumed, this just stores the value in the given string. 81 | bool ParseFieldValue(string *value); 82 | 83 | // As above, but deals with a quoted field which is rather trickier due to 84 | // escaping and suchforth. The first quote has already been consumed. 85 | bool ParseQuotedFieldValue(bool ignore_backslashes, string *value); 86 | 87 | // Consumes a single token label from the FST, ie. a message or field name. 88 | // Returns true if label found, false if not. 89 | bool ConsumeLabel(string *label); 90 | 91 | // Consumes any output whitespace from the FST. 92 | void ConsumeWhitespace(); 93 | 94 | // Moves to the next state in the FST. Returns true if one was found, false 95 | // if the end has been reached. 96 | bool NextState(); 97 | 98 | // Backs up to the previous state. Can only back up once, so should only be 99 | // called once between each call to NextState(). 100 | void PrevState(); 101 | 102 | // Updates start/end indices on a token that we've just parsed. 103 | void UpdateTokenIndices(Token *token, 104 | bool set_semiotic_class, 105 | bool fix_lookahead); 106 | 107 | // Logs an error message on parsing fail. 108 | void LogError(); 109 | 110 | // Records the field orders if there is a preserve_order field and it's true 111 | bool RecordFieldOrder(google::protobuf::Message *message, 112 | const std::vector &field_order); 113 | 114 | // Applies fixes to the token names caused by lookahead FSTs. 115 | void FixLookahead(Utterance *utt); 116 | 117 | // Sets the content of a field. 118 | void SetField(google::protobuf::Message *message, 119 | const google::protobuf::Reflection *reflection, 120 | const google::protobuf::FieldDescriptor *descriptor, 121 | const string &value) const; 122 | 123 | // FST we're parsing from. 124 | const Transducer *fst_; 125 | // Current state that we're up to. 
126 | StateId state_; 127 | // The previous state 128 | StateId last_state_; 129 | // Input/output labels from the last arc. 130 | Label ilabel_; 131 | Label olabel_; 132 | // Start index of the current token. 133 | int token_start_; 134 | // End index of the immediately preceding token. 135 | int last_token_end_; 136 | // Name of the current token (ie. its input text). 137 | string token_name_; 138 | // Name (input labels) of the immediately preceding token. 139 | string last_token_name_; 140 | }; 141 | 142 | } // namespace sparrowhawk 143 | } // namespace speech 144 | 145 | #endif // SPARROWHAWK_PROTOBUF_PARSER_H_ 146 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/protobuf_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // This is a class to serialize protocol buffers directly 15 | // into a FST in preparation for them to be verbalized. 16 | // The main advantage of this for us is that we produce a FST with multiple 17 | // orderings which the verbalizer can consume however it wants; this 18 | // removes the necessity for the prior reordering hacks etc. 
19 | // 20 | // As with ProtobufParser, this class is not threadsafe as it stores 21 | // internal state; the expectation is to create temporary local instances 22 | // of it rather than persisting a single shared instance. 23 | 24 | #ifndef SPARROWHAWK_PROTOBUF_SERIALIZER_H_ 25 | #define SPARROWHAWK_PROTOBUF_SERIALIZER_H_ 26 | 27 | #include 28 | using std::vector; 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | namespace speech { 37 | namespace sparrowhawk { 38 | 39 | using thrax::GrmManager; 40 | class Utterance; 41 | 42 | class ProtobufSerializer { 43 | public: 44 | typedef GrmManager::MutableTransducer MutableTransducer; 45 | typedef MutableTransducer::Arc::StateId StateId; 46 | 47 | // Serializes message into given fst. 48 | ProtobufSerializer(const google::protobuf::Message *message, 49 | MutableTransducer *fst); 50 | ~ProtobufSerializer(); 51 | 52 | // Serializes the message into the FST. 53 | void SerializeToFst(); 54 | 55 | // Serializes the message into a string 56 | string SerializeToString() const; 57 | 58 | protected: 59 | typedef google::protobuf::FieldDescriptor FieldDescriptor; 60 | typedef std::vector FieldDescriptorVector; 61 | typedef fst::StateIterator StateIterator; 62 | typedef fst::ArcIterator ArcIterator; 63 | typedef MutableTransducer::Arc Arc; 64 | typedef Arc::Label Label; 65 | 66 | // Internal constructor that allows selecting the state to begin from. 67 | ProtobufSerializer(const google::protobuf::Message *message, 68 | MutableTransducer *fst, 69 | StateId state); 70 | 71 | // Serializes the entire message into the FST, and returns the final state id. 72 | StateId SerializeToFstInternal(); 73 | 74 | // Serializes a single permutation into the FST. 75 | void SerializePermutation(const FieldDescriptorVector &fields); 76 | 77 | // Serializes a single field into the FST. 
78 | StateId SerializeField(const FieldDescriptor *field, 79 | int index, 80 | StateId state); 81 | 82 | // Serializes a string into the FST. 83 | StateId SerializeString(const string &str, StateId state); 84 | 85 | // As above, allowing control of whether quotes are optional or not. 86 | StateId SerializeString(const string &str, 87 | StateId state, 88 | bool optional_quotes); 89 | 90 | // Serializes a single character into the FST. 91 | StateId SerializeChar(char c, StateId state); 92 | 93 | // Links the last arc that has a non-space output symbol to the new final 94 | // state by adding an epsilon arc from this arc's destination state to the new 95 | // final state, cutting out unnecessary whitespace and connecting multiple 96 | // permutations to a common destination. 97 | void StripTrailingSpace(StateId new_final_state); 98 | 99 | const google::protobuf::Message *message_; 100 | const google::protobuf::Reflection *reflection_; 101 | MutableTransducer *fst_; 102 | const StateId initial_state_; 103 | static const RE2 kReTrailingZeroes; 104 | static const int kReNumMatchGroups; 105 | 106 | private: 107 | DISALLOW_COPY_AND_ASSIGN(ProtobufSerializer); 108 | }; 109 | 110 | } // namespace sparrowhawk 111 | } // namespace speech 112 | 113 | #endif // SPARROWHAWK_PROTOBUF_SERIALIZER_H_ 114 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/record_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Recursively serializes a single record in the spec and concatenates onto a 15 | // transducer. 16 | // 17 | // Typically the serialized field content looks like 18 | // :| 19 | // Note that nothing is serialized if the field corresponding to the record_spec 20 | // field_path is missing in the token. 21 | // 22 | // This is used by the StyleSerializer for serializing all the records in a 23 | // given style. It constructs the RecordSerializer for each record in the 24 | // style_spec. Given a token it sequentially invokes the Serialize function of 25 | // the records in the style being serialized. 26 | 27 | #ifndef SPARROWHAWK_RECORD_SERIALIZER_H_ 28 | #define SPARROWHAWK_RECORD_SERIALIZER_H_ 29 | 30 | #include 31 | #include 32 | using std::vector; 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | namespace speech { 42 | namespace sparrowhawk { 43 | 44 | class RecordSerializer { 45 | public: 46 | typedef fst::StdVectorFst MutableTransducer; 47 | 48 | // Creates and returns a RecordSerializer from the record_spec by noting the 49 | // field path and path label the record and recursively building 50 | // record_serializers for prefix and suffix specs. 51 | // Returns a null value if the spec is not well-formed. 52 | static std::unique_ptr Create( 53 | const RecordSpec &record_spec); 54 | 55 | // Serializes a token using the record spec, returns true only if the token 56 | // serializes correctly as per the record spec. For the input token, it 57 | // recursively traverses field_paths noted in the record_serializer and its 58 | // affix_serializers and concatenates serialized field content onto the 59 | // input fst. 
60 | bool Serialize(const Token &token, MutableTransducer *fst) const; 61 | 62 | private: 63 | typedef MutableTransducer::Arc Arc; 64 | typedef Arc::StateId StateId; 65 | typedef Arc::Weight Weight; 66 | typedef fst::StringCompiler StringCompiler; 67 | 68 | // Only used by the factory function Create. 69 | RecordSerializer(); 70 | 71 | // Serializers for prefix specs in the specification. 72 | std::vector> prefix_serializers_; 73 | 74 | // Serializers for suffix specs in the specification. 75 | std::vector> suffix_serializers_; 76 | 77 | // Field path for the record_spec field. 78 | std::unique_ptr field_path_; 79 | 80 | // String denoting the terminating field's name for the record spec. 81 | string field_name_; 82 | 83 | // Default value to be emitted when field is not set. 84 | string default_value_; 85 | 86 | // Pattern to be escaped - record_separator or escape_character. 87 | RE2 escape_re_; 88 | 89 | // Replacement string for escape pattern - prepended escape_character. 90 | string escape_replacement_; 91 | 92 | // String Compiler for making fsts from strings. 93 | StringCompiler string_compiler_; 94 | 95 | // Serializes a record, escaping record_separator and escape_character. 96 | // Also serializes various factorizations as parallel arcs into the FST. 97 | void SerializeRecord(string *value, 98 | MutableTransducer *fst) const; 99 | 100 | // Assumes that the (non-repeated) field is set for the parent, and checks 101 | // that it corresponds to a scalar value. Also, in this case, adds an arc to 102 | // fst between states start and end, optionally adding a new state for end if 103 | // a sentinel is passed for end. It is an error to invoke this with a 104 | // repeated field. 
105 | bool SerializeToFst(const google::protobuf::Message &parent, 106 | const google::protobuf::FieldDescriptor &field, 107 | MutableTransducer *fst) const; 108 | 109 | // Assumes that the (repeated) field is set for the parent, and checks that it 110 | // corresponds to a scalar value. Also, in this case, adds an arc to 111 | // fst between states start and end, optionally adding a new state for end if 112 | // a sentinel is passed for end. It is an error to invoke this with a 113 | // non-repeated field. 114 | bool SerializeToFstRepeated(const google::protobuf::Message &parent, 115 | const google::protobuf::FieldDescriptor &field, 116 | const int index, 117 | MutableTransducer *fst) const; 118 | 119 | // Recursively serializes prefix and suffix records into respective 120 | // transducers using appropriate record serializers. 121 | bool SerializeAffixes(const Token &token, 122 | MutableTransducer *prefix_fst, 123 | MutableTransducer *suffix_fst) const; 124 | 125 | DISALLOW_COPY_AND_ASSIGN(RecordSerializer); 126 | }; 127 | 128 | } // namespace sparrowhawk 129 | } // namespace speech 130 | 131 | #endif // SPARROWHAWK_RECORD_SERIALIZER_H_ 132 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/regexp.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 
12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google wrapper for re2. 15 | // Wrapper class regex library, it's very basic, and wraps the re2 16 | // regexp library. 17 | 18 | #ifndef SPARROWHAWK_REGEXP_H_ 19 | #define SPARROWHAWK_REGEXP_H_ 20 | 21 | #include 22 | using std::string; 23 | #include 24 | using std::vector; 25 | 26 | #include 27 | #include 28 | 29 | namespace speech { 30 | namespace sparrowhawk { 31 | 32 | // A regmatch is one match result - there may be one or more per string. 33 | struct RegMatch { 34 | int start_char; 35 | int end_char; 36 | string full_str; 37 | // number of sub-expressions 38 | int n_sub; 39 | int len; 40 | // if the regexp contained subexpressions 41 | std::vector sub_str; 42 | std::vector sub_start; 43 | std::vector sub_end; 44 | }; 45 | 46 | class Regexp { 47 | public: 48 | Regexp(); 49 | ~Regexp(); 50 | 51 | // Compiles a regexp. Returns true if compile successful. 52 | bool Compile(const string &pattern); 53 | 54 | // The number of sub expressions for this regexp. 55 | int nsubexp() const; 56 | 57 | // Checks for any match at all. Returns true if match. 58 | bool CheckFullMatch(const string &input) const; 59 | 60 | // Checks for any match at all. Returns true if match. 61 | bool CheckMatch(const string &input) const; 62 | 63 | // Checks for any match at all. Returns true if match. 64 | static bool CheckMatch(const string &input, const string &pattern); 65 | 66 | // Gets vector of start and end chars for all matching string parts 67 | // returns number of matches. Fills the matches vector with RegMatch objects. 68 | int GetAllMatches(const string &input, 69 | std::vector *matches) const; 70 | 71 | // Accessor for boolean whether this has been successfully compiled 72 | bool ok() const; 73 | 74 | // Deletes and resets internal data. 
75 | void Clear(); 76 | 77 | private: 78 | // The underlying compiled regexp object internal structure 79 | RE2 *re_; 80 | 81 | int32 nsubexp_; 82 | 83 | DISALLOW_COPY_AND_ASSIGN(Regexp); 84 | }; 85 | 86 | } // namespace sparrowhawk 87 | } // namespace speech 88 | 89 | #endif // SPARROWHAWK_REGEXP_H_ 90 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/rule_system.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // A rule system consists of a cascaded set of grammar targets defined by 15 | // Thrax. See rule_order.proto for a description of what each rule complex can 16 | // contain. 
17 | #ifndef SPARROWHAWK_RULE_SYSTEM_H_ 18 | #define SPARROWHAWK_RULE_SYSTEM_H_ 19 | 20 | #include 21 | #include 22 | #include 23 | using std::string; 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | namespace speech { 31 | namespace sparrowhawk { 32 | 33 | using fst::LabelLookAheadRelabeler; 34 | using fst::StdArcLookAheadFst; 35 | using fst::StdILabelLookAheadFst; 36 | using fst::StdOLabelLookAheadFst; 37 | using thrax::GrmManager; 38 | 39 | typedef fst::Fst Transducer; 40 | typedef fst::VectorFst MutableTransducer; 41 | 42 | typedef StdILabelLookAheadFst LookaheadFst; 43 | 44 | class RuleSystem { 45 | public: 46 | RuleSystem() { } 47 | ~RuleSystem(); 48 | 49 | // Loads a protobuf containing the filename of the grammar far 50 | // and the rule specifications as defined in rule_order.proto. 51 | bool LoadGrammar(const string& filename, const string& prefix); 52 | 53 | // This one returns the epsilon-free output projection of all 54 | // paths. use_lookahead constructs a lookahead FST for the composition. 55 | bool ApplyRules(const Transducer& input, 56 | MutableTransducer* output, 57 | bool use_lookahead) const; 58 | 59 | // These two return the string of the shortest path. 60 | bool ApplyRules(const string& input, 61 | string* output, 62 | bool use_lookahead) const; 63 | 64 | bool ApplyRules(const Transducer& input, 65 | string* output, 66 | bool use_lookahead) const; 67 | 68 | // Find the named transducer or NULL if nonexistent. 
69 | const Transducer* FindRule(const string& name) const; 70 | 71 | const string& grammar_name() const { return grammar_name_; } 72 | 73 | private: 74 | Grammar grammar_; 75 | string grammar_name_; 76 | std::unique_ptr grm_; 77 | // Precomputed lookahead transducers 78 | mutable std::map lookaheads_; 79 | }; 80 | 81 | } // namespace sparrowhawk 82 | } // namespace speech 83 | 84 | #endif // SPARROWHAWK_RULE_SYSTEM_H_ 85 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/sentence_boundary.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Simple interface for splitting text into sentences. Uses a regular expression 15 | // to define plausible end-of-sentence markers, and allows for a list of 16 | // exceptions --- e.g. abbreviations that end in periods that would not normally 17 | // signal a sentence boundary. 
18 | #ifndef SPARROWHAWK_SENTENCE_BOUNDARY_H_ 19 | #define SPARROWHAWK_SENTENCE_BOUNDARY_H_ 20 | 21 | #include 22 | #include 23 | using std::string; 24 | #include 25 | using std::vector; 26 | 27 | #include 28 | #include 29 | 30 | namespace speech { 31 | namespace sparrowhawk { 32 | 33 | class SentenceBoundary { 34 | public: 35 | explicit SentenceBoundary(const string ®exp); 36 | 37 | // Loads exceptions, such as abbreviations that end in periods, things like 38 | // "Y!", or whatever. Note that these are all case sensitive, so one must 39 | // provide alternate forms if one expects that the form may be cased 40 | // differently. 41 | bool LoadSentenceBoundaryExceptions(const string &filename); 42 | 43 | std::vector ExtractSentences(const string &input_text) const; 44 | 45 | // If true, then prefixes each exception in the exception list with a space, 46 | // so that it when matching against a potential end-of-sentence position, it 47 | // will force the match to occur only when there is a preceding space, or at 48 | // the beginning of the string. 49 | void set_pad_exceptions_with_space_prefix(bool 50 | pad_exceptions_with_space_prefix) { 51 | pad_exceptions_with_space_prefix_ = pad_exceptions_with_space_prefix; 52 | } 53 | 54 | private: 55 | // Returns true if the candidate position is a plausible sentence 56 | // boundary. Currently uses the regexp and the sentence boundary exceptions 57 | // list, but could be replaced with something learned. 
58 | bool EvaluateCandidate(const string &input_text, const string &marker) const; 59 | 60 | std::unique_ptr regexp_; 61 | std::vector sentence_boundary_exceptions_; 62 | bool pad_exceptions_with_space_prefix_; 63 | DISALLOW_COPY_AND_ASSIGN(SentenceBoundary); 64 | }; 65 | 66 | } // namespace sparrowhawk 67 | } // namespace speech 68 | 69 | #endif // SPARROWHAWK_SENTENCE_BOUNDARY_H_ 70 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/spec_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Serializes a token based on a given spec for simple, fast verbalization. 15 | // Iteratively serializes the styles in a class_spec which are concatenated as 16 | // parallel arcs onto a transducer, which is returned as output. 
17 | 18 | #ifndef SPARROWHAWK_SPEC_SERIALIZER_H_ 19 | #define SPARROWHAWK_SPEC_SERIALIZER_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | using std::vector; 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace speech { 34 | namespace sparrowhawk { 35 | 36 | class Serializer { 37 | public: 38 | typedef fst::StdVectorFst MutableTransducer; 39 | 40 | // Creates and returns a Serializer from the serialize_spec by creating 41 | // style_serializers for all its style_specs and storing the name of the 42 | // semiotic class. 43 | // Returns a null value if the spec is not well-formed. 44 | static std::unique_ptr Create( 45 | const SerializeSpec &serialize_spec); 46 | 47 | // Serializes a token using the serialization spec, i.e. builds an fst 48 | // corresponding to the serialization of the token. Appends a label for the 49 | // semiotic class name at the front and then adds parallel arcs for the 50 | // different valid style_specs. 51 | MutableTransducer Serialize(const Token &token) const; 52 | 53 | private: 54 | typedef MutableTransducer::Arc Arc; 55 | typedef fst::StringCompiler StringCompiler; 56 | 57 | // Only used by the factory function Create. 58 | Serializer() : string_compiler_(fst::StringTokenType::BYTE) {} 59 | 60 | // String Compiler for making fsts from strings. 61 | StringCompiler string_compiler_; 62 | 63 | // Map to store the serialization indexed by field descriptors. 
64 | std::map>> serializers_; 66 | 67 | DISALLOW_COPY_AND_ASSIGN(Serializer); 68 | }; 69 | 70 | } // namespace sparrowhawk 71 | } // namespace speech 72 | 73 | #endif // SPARROWHAWK_SPEC_SERIALIZER_H_ 74 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/string_utils.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google functionality for strings. 15 | #ifndef SPARROWHAWK_STRING_UTILS_H_ 16 | #define SPARROWHAWK_STRING_UTILS_H_ 17 | 18 | #include 19 | using std::string; 20 | #include 21 | using std::vector; 22 | 23 | #include 24 | namespace speech { 25 | namespace sparrowhawk { 26 | 27 | // Splits string s by sep and returns a vector of strings. 28 | std::vector SplitString(const string &s, const string &delims); 29 | 30 | // Splits string s by sep and returns a vector of strings, skipping empties. 
31 | std::vector SplitString(const string &s, 32 | const string &delims, 33 | bool skip_empty); 34 | 35 | // Strips whitespace off the beginning and end 36 | string StripWhitespace(const string &s); 37 | 38 | } // namespace sparrowhawk 39 | } // namespace speech 40 | 41 | #endif // SPARROWHAWK_STRING_UTILS_H_ 42 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/style_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Iteratively serializes the records in a style_spec which are serially 15 | // concatenated onto a transducer. 16 | // 17 | // Typically the serialized field content looks like 18 | // (:|)* 19 | // where each unit is the serialization of a record. 20 | // 21 | // This is used by the Serializer for serializing all the styles in a given 22 | // semiotic class. It constructs the StyleSerializer for each style in the 23 | // class_spec permitted by the prohibited/requested values. Given a token it 24 | // sequentially invokes the Serialize function of the styles in the class being 25 | // serialized. 
26 | 27 | #ifndef SPARROWHAWK_STYLE_SERIALIZER_H_ 28 | #define SPARROWHAWK_STYLE_SERIALIZER_H_ 29 | 30 | #include 31 | #include 32 | using std::vector; 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | namespace speech { 43 | namespace sparrowhawk { 44 | 45 | class StyleSerializer { 46 | public: 47 | typedef fst::StdVectorFst MutableTransducer; 48 | 49 | // Creates and returns a StyleSerializer from the style_spec by creating 50 | // record_serializers for all its record_specs and storing field_paths of 51 | // required and prohibited fields. 52 | // Returns a null value if the spec is not well-formed. 53 | static std::unique_ptr Create(const StyleSpec &style_spec); 54 | 55 | // Serializes a token using the style spec, returns true only for valid 56 | // styles satisfying required/prohibited field constraints. If so, all the 57 | // records in the style are serialized onto the input fst. 58 | bool Serialize(const Token &token, MutableTransducer *serialization) const; 59 | 60 | private: 61 | // Only used by the factory function Create. 62 | StyleSerializer() {} 63 | 64 | // Populates record_serializers_ using style_spec. 65 | static bool CreateRecordSerializers(const StyleSpec &style_spec, 66 | const std::unique_ptr &style_serializer); 67 | 68 | // Populates required_fields_ using style_spec. 69 | static bool SetRequiredFieldPaths(const StyleSpec &style_spec, 70 | const std::unique_ptr &style_serializer); 71 | 72 | // Populates prohibited_fields_ using style_spec. 73 | static bool SetProhibitedFieldPaths(const StyleSpec &style_spec, 74 | const std::unique_ptr &style_serializer); 75 | 76 | // Checks required_fields_ in token. 77 | bool CheckRequiredFields(const Token &token) const; 78 | 79 | // Checks prohibited_fields_ in token. 80 | bool CheckProhibitedFields(const Token &token) const; 81 | 82 | // FieldPaths to required fields in the specification. 
83 | std::vector> required_fields_; 84 | 85 | // FieldPaths to prohibited fields in the specification. 86 | std::vector prohibited_fields_; 87 | 88 | // Record serializers for the record specs in the style. 89 | std::vector> record_serializers_; 90 | 91 | // Takes as input a message and a target field path ending in a scalar field 92 | // to within the input message and returns true if the field at the end of the 93 | // path is set. It further assumes that all the intermediate messages are 94 | // non-repeated, although the terminating field itself may be repeated. 95 | bool IsFieldSet(const google::protobuf::Message &root, 96 | const FieldPath &field_path) const; 97 | 98 | DISALLOW_COPY_AND_ASSIGN(StyleSerializer); 99 | }; 100 | 101 | } // namespace sparrowhawk 102 | } // namespace speech 103 | 104 | #endif // SPARROWHAWK_STYLE_SERIALIZER_H_ 105 | -------------------------------------------------------------------------------- /src/lib/Makefile.am: -------------------------------------------------------------------------------- 1 | # Need both because of the output of protoc 2 | AM_CPPFLAGS = -I$(srcdir)/../include -I$(srcdir)/../include/sparrowhawk 3 | 4 | lib_LTLIBRARIES = libsparrowhawk.la 5 | proto_sources = items.pb.cc \ 6 | links.pb.cc \ 7 | rule_order.pb.cc \ 8 | semiotic_classes.pb.cc \ 9 | serialization_spec.pb.cc \ 10 | sparrowhawk_configuration.pb.cc 11 | 12 | libsparrowhawk_la_SOURCES = field_path.cc \ 13 | io_utils.cc \ 14 | normalizer.cc \ 15 | normalizer_utils.cc \ 16 | numbers.cc \ 17 | protobuf_parser.cc \ 18 | protobuf_serializer.cc \ 19 | record_serializer.cc \ 20 | regexp.cc \ 21 | rule_system.cc \ 22 | sentence_boundary.cc \ 23 | spec_serializer.cc \ 24 | string_utils.cc \ 25 | style_serializer.cc \ 26 | $(proto_sources) 27 | 28 | libsparrowhawk_la_LDFLAGS = -version-info 0:0:0 29 | 30 | items.pb.cc: 31 | $(MAKE) -C $(srcdir)/../proto/ items.pb.cc 32 | 33 | links.pb.cc: 34 | $(MAKE) -C $(srcdir)/../proto/ links.pb.cc 35 | 36 | 
rule_order.pb.cc: 37 | $(MAKE) -C $(srcdir)/../proto/ rule_order.pb.cc 38 | 39 | semiotic_classes.pb.cc: 40 | $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.cc 41 | 42 | serialization_spec.pb.cc: 43 | $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.cc 44 | 45 | sparrowhawk_configuration.pb.cc: 46 | $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.cc 47 | 48 | -------------------------------------------------------------------------------- /src/lib/field_path.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | #include 20 | using std::vector; 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | namespace speech { 27 | namespace sparrowhawk { 28 | 29 | using google::protobuf::Descriptor; 30 | using google::protobuf::FieldDescriptor; 31 | using google::protobuf::Message; 32 | 33 | std::unique_ptr FieldPath::Create( 34 | const Descriptor *root_type) { 35 | if (root_type == nullptr) { 36 | return nullptr; 37 | } else { 38 | std::unique_ptr field_path(new FieldPath(root_type)); 39 | return field_path; 40 | } 41 | } 42 | 43 | void FieldPath::Clear() { 44 | path_.clear(); 45 | } 46 | 47 | bool FieldPath::Follow(const Message &base, const Message **parent, 48 | const FieldDescriptor **field) const { 49 | if (base.GetDescriptor() != root_type_) { 50 | LOG(ERROR) << "Input Message to Follow is of type " 51 | << base.GetDescriptor()->name() 52 | << " while the field_path root type is " << root_type_->name(); 53 | return false; 54 | } 55 | const Message *inner_message = &base; 56 | int size = path_.size(); 57 | for (int i = 0; i < size - 1; ++i) { 58 | // Iterating over singular messages. 59 | inner_message = &inner_message->GetReflection()->GetMessage(*inner_message, 60 | path_[i]); 61 | } 62 | *parent = inner_message; 63 | *field = path_[size - 1]; 64 | return true; 65 | } 66 | 67 | // Helper function to go through the intermediate message fields. 
68 | bool FieldPath::TraverseIntermediateFields( 69 | std::vector path, 70 | const google::protobuf::Descriptor **parent) { 71 | for (int i = 0; i < path.size() - 1; ++i) { 72 | string &field_name = path[i]; 73 | const FieldDescriptor *field = (*parent)->FindFieldByName(field_name); 74 | if (field == nullptr) { 75 | LOG(ERROR) << (*parent)->full_name() 76 | << " does not contain a field named '" 77 | << field_name << "'."; 78 | return false; 79 | } 80 | if (field->type() != FieldDescriptor::TYPE_MESSAGE) { 81 | LOG(ERROR) << "Non-terminal field " << field->full_name() 82 | << " is not a message."; 83 | return false; 84 | } 85 | path_.push_back(field); 86 | *parent = field->message_type(); 87 | } 88 | return true; 89 | } 90 | 91 | // Helper function to parse the terminal scalar field. 92 | bool FieldPath::ParseTerminalField(const string &terminal_field_name, 93 | const Descriptor *parent) { 94 | const FieldDescriptor *terminal_field = 95 | parent->FindFieldByName(terminal_field_name); 96 | if (terminal_field == nullptr) { 97 | LOG(ERROR) << parent->full_name() << " does not contain a field named '" 98 | << terminal_field_name << "'."; 99 | return false; 100 | } else if (terminal_field->type() == FieldDescriptor::TYPE_MESSAGE) { 101 | LOG(ERROR) << "Terminal field " << terminal_field->full_name() 102 | << " is a message."; 103 | return false; 104 | } else { 105 | path_.push_back(terminal_field); 106 | } 107 | return true; 108 | } 109 | 110 | bool FieldPath::Parse(const string &path_string) { 111 | // Overwriting without clearing the field_path is illegal. 112 | if (!IsEmpty()) { 113 | LOG(ERROR) << "Cannot overwrite field_path. 
Use Clear() to reset."; 114 | return false; 115 | } 116 | std::vector path = SplitString(path_string, "."); 117 | const Descriptor *parent = root_type_; 118 | if (TraverseIntermediateFields(path, &parent) && 119 | ParseTerminalField(path.back(), parent)) { 120 | return true; 121 | } 122 | Clear(); 123 | return false; 124 | } 125 | 126 | } // namespace sparrowhawk 127 | } // namespace speech 128 | -------------------------------------------------------------------------------- /src/lib/io_utils.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | #include 15 | 16 | #include 17 | #include 18 | using std::ifstream; 19 | #include 20 | 21 | #include 22 | 23 | namespace speech { 24 | namespace sparrowhawk { 25 | 26 | string IOStream::LoadFileToString(const string &filename) { 27 | std::ifstream strm(filename.c_str(), std::ios_base::in); 28 | if (!strm) { 29 | LoggerFatal("Error opening file %s", filename.c_str()); 30 | } 31 | strm.seekg(0, strm.end); 32 | int length = strm.tellg(); 33 | strm.seekg(0, strm.beg); 34 | std::unique_ptr data(new char[length + 1], 35 | std::default_delete()); 36 | strm.read(data.get(), length); 37 | if (strm.fail()) { 38 | LoggerFatal("Error loading from file %s", filename.c_str()); 39 | } 40 | data.get()[length] = 0; 41 | return string(data.get(), length); 42 | } 43 | 44 | } // namespace sparrowhawk 45 | } // namespace speech 46 | -------------------------------------------------------------------------------- /src/lib/normalizer.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
// NOTE(review): This region is a line-wrapped render of src/lib/normalizer.cc.
// The "NN |" markers are rendering artifacts, and the renderer also ate every
// #include target and all template arguments (e.g. std::unique_ptr<...>,
// fst::StringCompiler<...>); restore those from the original source -- do not
// guess them from this dump.
//
// Normalizer::Setup: reads the SparrowhawkConfiguration text proto from
// pathname_prefix/configuration_proto, builds the tokenizer-classifier and
// verbalizer RuleSystems (returning false if either grammar fails to load;
// a missing grammar field is only logged here), installs the sentence
// boundary regexp (config value, else kDefaultSentenceBoundaryRegexp), and
// optionally loads boundary exceptions (failure logged, not fatal).
14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace speech { 33 | namespace sparrowhawk { 34 | 35 | // TODO(rws): We actually need to do something with this. 36 | const char kDefaultSentenceBoundaryRegexp[] = "[\\.:!\\?] "; 37 | 38 | Normalizer::Normalizer() { } 39 | 40 | Normalizer::~Normalizer() { } 41 | 42 | bool Normalizer::Setup(const string &configuration_proto, 43 | const string &pathname_prefix) { 44 | SparrowhawkConfiguration configuration; 45 | string proto_string = IOStream::LoadFileToString(pathname_prefix + 46 | "/" + configuration_proto); 47 | if (!google::protobuf::TextFormat::ParseFromString(proto_string, &configuration)) 48 | return false; 49 | if (!(configuration.has_tokenizer_grammar())) 50 | LoggerError("Configuration does not define a tokenizer-classifier grammar"); 51 | if (!(configuration.has_verbalizer_grammar())) 52 | LoggerError("Configuration does not define a verbalizer grammar"); 53 | tokenizer_classifier_rules_.reset(new RuleSystem); 54 | if (!tokenizer_classifier_rules_->LoadGrammar( 55 | configuration.tokenizer_grammar(), 56 | pathname_prefix)) 57 | return false; 58 | verbalizer_rules_.reset(new RuleSystem); 59 | if (!verbalizer_rules_->LoadGrammar(configuration.verbalizer_grammar(), 60 | pathname_prefix)) 61 | return false; 62 | string sentence_boundary_regexp; 63 | if (configuration.has_sentence_boundary_regexp()) { 64 | sentence_boundary_regexp = configuration.sentence_boundary_regexp(); 65 | } else { 66 | sentence_boundary_regexp = kDefaultSentenceBoundaryRegexp; 67 | } 68 | sentence_boundary_.reset(new SentenceBoundary(sentence_boundary_regexp)); 69 | if (configuration.has_sentence_boundary_exceptions_file()) { 70 | if (!sentence_boundary_->LoadSentenceBoundaryExceptions( 71 | configuration.sentence_boundary_exceptions_file())) { 72 |
// Setup continues: a serialization spec, when configured, must load and parse
// or Setup fails. Normalize(string, string*): full pipeline into a scratch
// Utterance, then linearizes the word stream. NormalizeAndShowLinks: same but
// dumps token/word links. TokenizeAndClassifyUtt: compiles the input as a
// byte-mode string FST and applies tokenizer rules WITH lookahead.
LoggerError("Cannot load sentence boundary exceptions file: %s", 73 | configuration.sentence_boundary_exceptions_file().c_str()); 74 | } 75 | } 76 | if (configuration.has_serialization_spec()) { 77 | string spec_string = IOStream::LoadFileToString( 78 | pathname_prefix + "/" + configuration.serialization_spec()); 79 | SerializeSpec spec; 80 | if (spec_string.empty() || 81 | !google::protobuf::TextFormat::ParseFromString(spec_string, &spec) || 82 | (spec_serializer_ = Serializer::Create(spec)) == nullptr) { 83 | LoggerError("Failed to load a valid serialization spec from file: %s", 84 | configuration.serialization_spec().c_str()); 85 | return false; 86 | } 87 | } 88 | return true; 89 | } 90 | 91 | bool Normalizer::Normalize(const string &input, string *output) const { 92 | std::unique_ptr utt; 93 | utt.reset(new Utterance); 94 | if (!Normalize(utt.get(), input)) return false; 95 | *output = LinearizeWords(utt.get()); 96 | return true; 97 | } 98 | 99 | bool Normalizer::Normalize(Utterance *utt, const string &input) const { 100 | return TokenizeAndClassifyUtt(utt, input) && VerbalizeUtt(utt); 101 | } 102 | 103 | bool Normalizer::NormalizeAndShowLinks( 104 | const string &input, string *output) const { 105 | std::unique_ptr utt; 106 | utt.reset(new Utterance); 107 | if (!Normalize(utt.get(), input)) return false; 108 | *output = ShowLinks(utt.get()); 109 | return true; 110 | } 111 | 112 | bool Normalizer::TokenizeAndClassifyUtt(Utterance *utt, 113 | const string &input) const { 114 | typedef fst::StringCompiler Compiler; 115 | Compiler compiler(fst::StringTokenType::BYTE); 116 | MutableTransducer input_fst, output; 117 | if (!compiler(input, &input_fst)) { 118 | LoggerError("Failed to compile input string \"%s\"", input.c_str()); 119 | return false; 120 | } 121 | if (!tokenizer_classifier_rules_->ApplyRules(input_fst, 122 | &output, 123 | true /* use_lookahead */)) { 124 | LoggerError("Failed to tokenize \"%s\"", input.c_str()); 125 | return false; 126 | } 127 |
// TokenizeAndClassifyUtt continues: takes the shortest path of the rule
// output and parses Token protos out of the FST.
MutableTransducer shortest_path; 128 | fst::ShortestPath(output, &shortest_path); 129 | ProtobufParser parser(&shortest_path); 130 | if (!parser.ParseTokensFromFST(utt, true /* set SEMIOTIC_CLASS */)) { 131 | LoggerError("Failed to parse tokens from FST for \"%s\"", input.c_str()); 132 | return false; 133 | } 134 | return true; 135 | } 136 | 137 | // As in Kestrel's Run(), this processes each token in turn and creates the Word 138 | // stream, adding words each with a unique wordid. Takes a different action on 139 | // the type: 140 | // 141 | // PUNCT: do nothing 142 | // SEMIOTIC_CLASS: call verbalizer FSTs 143 | // WORD: add to word stream 144 | bool Normalizer::VerbalizeUtt(Utterance *utt) const { 145 | for (int i = 0; i < utt->linguistic().tokens_size(); ++i) { 146 | Token *token = utt->mutable_linguistic()->mutable_tokens(i); 147 | string token_form = ToString(*token); 148 | token->set_first_daughter(-1); // Sets to default unset. 149 | token->set_last_daughter(-1); // Sets to default unset. 150 | // Add a single silence for punctuation that forms phrase breaks. This is 151 | // set via the grammar, though ultimately we'd like a proper phrasing 152 | // module.
// VerbalizeUtt body: PUNCT phrase breaks add at most one "sil" word (the
// words_size()/id() check prevents doubled silences); SEMIOTIC_CLASS tokens
// are verbalized, falling back to a verbatim reading of token->name() when
// the first pass fails; WORD tokens require a wordid. VerbalizeUtt always
// returns true -- per-token failures are only logged.
153 | if (token->type() == Token::PUNCT) { 154 | if (token->phrase_break() && 155 | (utt->linguistic().words_size() == 0 || 156 | utt->linguistic().words( 157 | utt->linguistic().words_size() - 1).id() != "sil")) { 158 | AddWord(utt, token, "sil"); 159 | } 160 | } else if (token->type() == Token::SEMIOTIC_CLASS) { 161 | if (!token->skip()) { 162 | LoggerDebug("Verbalizing: [%s]\n", token_form.c_str()); 163 | string words; 164 | if (VerbalizeSemioticClass(*token, &words)) { 165 | AddWords(utt, token, words); 166 | } else { 167 | LoggerWarn("First-pass verbalization FAILED for [%s]", 168 | token_form.c_str()); 169 | // Back off to verbatim reading 170 | string original_token = token->name(); 171 | token->Clear(); 172 | token->set_name(original_token); 173 | token->set_verbatim(original_token); 174 | if (VerbalizeSemioticClass(*token, &words)) { 175 | LoggerWarn("Reversion to verbatim succeeded for [%s]", 176 | original_token.c_str()); 177 | AddWords(utt, token, words); 178 | } else { 179 | // If we've done our checks right, we should never get here 180 | LoggerError("Verbalization FAILED for [%s]", token_form.c_str()); 181 | } 182 | } 183 | } 184 | } else if (token->type() == Token::WORD) { 185 | if (token->has_wordid()) { 186 | AddWord(utt, token, token->wordid()); 187 | } else { 188 | LoggerError("Token [%s] has type WORD but there is no word id", 189 | token_form.c_str()); 190 | } 191 | } else { 192 | LoggerError("No type found for [%s]", token_form.c_str()); 193 | } 194 | } 195 | LoggerDebug("Verbalize output: Words\n%s\n\n", LinearizeWords(utt).c_str()); 196 | return true; 197 | } 198 | 199 | bool Normalizer::VerbalizeSemioticClass(const Token &markup, 200 | string *words) const { 201 | Token local(markup); 202 | CleanFields(&local); 203 | MutableTransducer input_fst; 204 | if (spec_serializer_ == nullptr) { 205 | ProtobufSerializer serializer(&local, &input_fst); 206 | serializer.SerializeToFst(); 207 | } else { 208 | input_fst =
// VerbalizeSemioticClass continues: applies verbalizer rules WITHOUT
// lookahead on the serialized token copy. The tail of this region is the
// license/header of src/lib/normalizer_utils.cc (its #include targets were
// also lost in rendering).
spec_serializer_->Serialize(local); 209 | } 210 | if (!verbalizer_rules_->ApplyRules(input_fst, 211 | words, 212 | false /* use_lookahead */)) { 213 | LoggerError("Failed to verbalize \"%s\"", ToString(local).c_str()); 214 | return false; 215 | } 216 | return true; 217 | } 218 | 219 | std::vector Normalizer::SentenceSplitter(const string &input) const { 220 | return sentence_boundary_->ExtractSentences(input); 221 | } 222 | 223 | } // namespace sparrowhawk 224 | } // namespace speech 225 | -------------------------------------------------------------------------------- /src/lib/normalizer_utils.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // TODO(rws): This is small enough now that maybe we don't really need this 15 | // separate file. 16 | // 17 | // More definitions for the Normalizer class, put here because they are icky 18 | // low-level hanky panky. 19 | 20 | // utt->AppendToken() 21 | // utt->AppendWord() 22 | 23 | #include 24 | #include 25 | using std::string; 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace speech { 33 | namespace sparrowhawk { 34 | 35 | // Same as in Kestrel: add a phrase boundary at the beginning and ending of the 36 | // utterance.
// NOTE(review): wrapped render of src/lib/normalizer_utils.cc; template
// arguments (e.g. std::vector<string> in AddWords) were eaten by the
// renderer.
//
// AddPhraseToUtt: appends a PUNCT phrase-break token with empty name,
// optionally adding a "sil" word under it. TokenIndex: linear scan for the
// token's index in the utterance; -1 if not found. AddWord: appends a Word,
// maintains the owning token's first/last daughter indices, and sets both
// spelling and id to `spelling`.
37 | 38 | void Normalizer::AddPhraseToUtt(Utterance* utt, bool addword) const { 39 | Token* token = utt->mutable_linguistic()->add_tokens(); 40 | token->set_type(Token::PUNCT); 41 | token->set_name(""); 42 | token->set_phrase_break(true); 43 | if (addword) AddWord(utt, token, "sil"); 44 | } 45 | 46 | int Normalizer::TokenIndex(Utterance* utt, Token *token) const { 47 | for (int i = 0; i < utt->linguistic().tokens_size(); ++i) { 48 | const class Token *t = &(utt->linguistic().tokens(i)); 49 | if (t == token) { 50 | return i; 51 | } 52 | } 53 | return -1; 54 | } 55 | 56 | Word* Normalizer::AddWord(Utterance* utt, 57 | Token* token, 58 | const string& spelling) const { 59 | Word* word = utt->mutable_linguistic()->add_words(); 60 | int word_index = utt->linguistic().words_size() - 1; 61 | if (!token->has_first_daughter() || token->first_daughter() == -1) { 62 | token->set_first_daughter(word_index); 63 | } 64 | token->set_last_daughter(word_index); 65 | word->set_parent(TokenIndex(utt, token)); 66 | word->set_spelling(spelling); 67 | word->set_id(spelling); 68 | return word; 69 | } 70 | 71 | // Similar to Kestrel, but without the lexicon().ContainsWordId(spelling) logic, 72 | // which we want to shunt to later processing. 73 | // We assume that if someone puts a "," in the verbalization grammar, they mean 74 | // for this to represent a phrase boundary, so we add in logic here fore that. 75 | 76 | Word* Normalizer::AddWords(Utterance* utt, Token* token, 77 | const string& words) const { 78 | std::vector word_names = SplitString(words, " \t\n"); 79 | Word* word = NULL; 80 | 81 | for (int i = 0; i < word_names.size(); ++i) { 82 | if (word_names[i] == ",") 83 | word = AddWord(utt, token, "sil"); 84 | else 85 | word = AddWord(utt, token, word_names[i]); 86 | } 87 | return word; // return last word added.
// AddWords returns NULL when `words` splits to nothing. CleanFields: clears
// bookkeeping fields from a token copy before serialization. InputSubstring:
// inclusive [left, right] slice of input_, "" when out of range.
// LinearizeWords: space-joined word spellings. ShowLinks: tab-separated
// debug dump of tokens (name, start/end index, daughter span) and words
// (spelling, parent token index).
88 | } 89 | 90 | void Normalizer::CleanFields(Token* markup) const { 91 | markup->clear_first_daughter(); 92 | markup->clear_last_daughter(); 93 | markup->clear_type(); 94 | markup->clear_skip(); 95 | markup->clear_next_space(); 96 | markup->clear_phrase_break(); 97 | markup->clear_start_index(); 98 | markup->clear_end_index(); 99 | markup->clear_name(); 100 | } 101 | 102 | string Normalizer::InputSubstring(int left, int right) const { 103 | if (left < 0 || right >= input_.size() || left > right) return ""; 104 | return input_.substr(left, right - left + 1); 105 | } 106 | 107 | string Normalizer::LinearizeWords(Utterance* utt) const { 108 | string output; 109 | for (int i = 0; i < utt->linguistic().words_size(); ++i) { 110 | if (i) output.append(" "); 111 | output.append(utt->linguistic().words(i).spelling()); 112 | } 113 | return output; 114 | } 115 | 116 | string Normalizer::ShowLinks(Utterance *utt) const { 117 | string output; 118 | for (int i = 0; i < utt->linguistic().tokens_size(); ++i) { 119 | output.append("Token:\t" + std::to_string(i) + "\t"); 120 | output.append(utt->linguistic().tokens(i).name() + "\t"); 121 | // Start and end positions in the input string. 122 | output.append(std::to_string(utt->linguistic().tokens(i).start_index())); 123 | output.append(","); 124 | output.append(std::to_string(utt->linguistic().tokens(i).end_index())); 125 | output.append("\t"); 126 | // First and last word daughters.
// ToString: token rendered through ProtobufSerializer::SerializeToString.
// The tail of this region is the license header of src/lib/numbers.cc.
127 | output.append(std::to_string(utt->linguistic().tokens(i).first_daughter())); 128 | output.append(","); 129 | output.append(std::to_string(utt->linguistic().tokens(i).last_daughter())); 130 | output.append("\n"); 131 | } 132 | for (int i = 0; i < utt->linguistic().words_size(); ++i) { 133 | output.append("Word:\t" + std::to_string(i) + "\t"); 134 | output.append(utt->linguistic().words(i).spelling()); 135 | output.append("\t" + std::to_string(utt->linguistic().words(i).parent())); 136 | output.append("\n"); 137 | } 138 | return output; 139 | } 140 | 141 | string Normalizer::ToString(const Token& markup) const { 142 | ProtobufSerializer serializer(&markup, NULL); 143 | return serializer.SerializeToString(); 144 | } 145 | 146 | } // namespace sparrowhawk 147 | } // namespace speech 148 | -------------------------------------------------------------------------------- /src/lib/numbers.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc.
14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | using std::string; 20 | 21 | namespace speech { 22 | namespace sparrowhawk { 23 | 24 | #define CONVERT(value, output) \ 25 | char *endptr; \ 26 | *output = strtof(value.c_str(), &endptr); \ 27 | if (errno == ERANGE) return false; \ 28 | if (endptr < value.c_str() + value.size()) return false; \ 29 | return true; 30 | 31 | bool safe_strtof(const string &value, float *output) { 32 | CONVERT(value, output); 33 | } 34 | 35 | bool safe_strtod(const string &value, double *output) { 36 | CONVERT(value, output); 37 | } 38 | 39 | bool safe_strto32(const string &value, int32 *output) { 40 | CONVERT(value, output); 41 | } 42 | 43 | bool safe_strto64(const string &value, int64 *output) { 44 | CONVERT(value, output); 45 | } 46 | 47 | #undef CONVERT 48 | 49 | } // namespace sparrowhawk 50 | } // namespace speech 51 | -------------------------------------------------------------------------------- /src/lib/regexp.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | #include 15 | 16 | #include 17 | #include 18 | 19 | namespace speech { 20 | namespace sparrowhawk { 21 | 22 | Regexp::Regexp() { 23 | re_ = nullptr; 24 | nsubexp_ = -1; 25 | } 26 | 27 | Regexp::~Regexp() { 28 | Clear(); 29 | } 30 | 31 | void Regexp::Clear() { 32 | delete re_; 33 | re_ = nullptr; 34 | nsubexp_ = -1; 35 | } 36 | 37 | int Regexp::nsubexp() const { 38 | return nsubexp_; 39 | } 40 | 41 | bool Regexp::ok() const { 42 | return re_ != nullptr && re_->ok(); 43 | } 44 | 45 | bool Regexp::Compile(const string &pattern) { 46 | Clear(); 47 | RE2::Options options; 48 | options.set_longest_match(true); 49 | options.set_log_errors(false); 50 | re_ = new RE2(pattern, options); 51 | 52 | if (re_ == nullptr) { 53 | LoggerError("Error in allocating regexp \"%s\"", pattern.c_str()); 54 | return false; 55 | } 56 | if (!re_->ok()) { 57 | LoggerError("Error in allocating regexp \"%s\": %s", 58 | pattern.c_str(), 59 | re_->error().c_str()); 60 | Clear(); 61 | return false; 62 | } 63 | nsubexp_ = re_->NumberOfCapturingGroups(); 64 | return true; 65 | } 66 | 67 | bool Regexp::CheckMatch(const string &input) const { 68 | if (ok()) { 69 | return RE2::PartialMatch(input, *re_); 70 | } else { 71 | return false; 72 | } 73 | } 74 | 75 | bool Regexp::CheckFullMatch(const string &input) const { 76 | if (ok()) { 77 | return RE2::FullMatch(input, *re_); 78 | } else { 79 | return false; 80 | } 81 | } 82 | 83 | bool Regexp::CheckMatch(const string &input, const string &pattern) { 84 | return RE2::PartialMatch(input, pattern); 85 | } 86 | 87 | int Regexp::GetAllMatches(const string &input, 88 | std::vector *matches) const { 89 | if (!ok()) { 90 | return 0; 91 | } 92 | int nmatches = 0; 93 | int offset = 0; 94 | int end_pos = input.size(); 95 | matches->clear(); 96 | re2::StringPiece input_piece(input); 97 | 98 | std::unique_ptr matched_pieces(new re2::StringPiece[1 + nsubexp_]); 99 | bool result = re_->Match(input_piece, 100 | offset, 101 | end_pos, 102 | RE2::UNANCHORED, 103 | 
matched_pieces.get(), 104 | 1 + nsubexp_); 105 | RegMatch re_info; 106 | while (result) { 107 | nmatches++; 108 | re_info.sub_str.clear(); 109 | re_info.sub_start.clear(); 110 | re_info.sub_end.clear(); 111 | re_info.full_str = ""; 112 | re_info.n_sub = nsubexp_; 113 | int match_offset = matched_pieces[0].data() - input.c_str(); 114 | int match_length = matched_pieces[0].length(); 115 | 116 | re_info.start_char = match_offset; 117 | re_info.end_char = match_offset + match_length; 118 | re_info.len = match_length; 119 | re_info.full_str = matched_pieces[0].as_string(); 120 | 121 | for (int i = 1; i <= nsubexp_; ++i) { 122 | re_info.sub_str.push_back(matched_pieces[i].as_string()); 123 | int sub_match_start = matched_pieces[i].data() - input.c_str(); 124 | re_info.sub_start.push_back(sub_match_start); 125 | re_info.sub_end.push_back(sub_match_start + matched_pieces[i].length()); 126 | } 127 | 128 | matches->push_back(re_info); 129 | offset = re_info.end_char; 130 | result = re_->Match(input_piece, 131 | offset, 132 | end_pos, 133 | RE2::UNANCHORED, 134 | matched_pieces.get(), 135 | 1 + nsubexp_); 136 | } 137 | return nmatches; 138 | } 139 | 140 | } // namespace sparrowhawk 141 | } // namespace speech 142 | -------------------------------------------------------------------------------- /src/lib/rule_system.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// NOTE(review): wrapped render of src/lib/rule_system.cc; #include targets
// and template arguments (std::map<string, LookaheadFst*>,
// fst::ComposeFst<...>, fst::StringCompiler<...>, fst::StringPrinter<...>)
// were eaten by the renderer -- restore from the original source.
//
// ~RuleSystem: deletes the lazily-built lookahead FST cache. LoadGrammar:
// parses the Grammar text proto at prefix+filename, loads the FST archive,
// then verifies every rule named in the ordering exists.
10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | namespace speech { 21 | namespace sparrowhawk { 22 | 23 | using fst::LabelLookAheadRelabeler; 24 | using fst::StdArc; 25 | 26 | RuleSystem::~RuleSystem() { 27 | std::map::iterator iter; 28 | for (iter = lookaheads_.begin(); iter != lookaheads_.end(); iter++) { 29 | delete iter->second; 30 | } 31 | } 32 | 33 | bool RuleSystem::LoadGrammar(const string& filename, const string& prefix) { 34 | // This is the contents of filename. 35 | string proto_string = IOStream::LoadFileToString(prefix + filename); 36 | if (!google::protobuf::TextFormat::ParseFromString(proto_string, &grammar_)) 37 | return false; 38 | string grm_file = prefix + grammar_.grammar_file(); 39 | grammar_name_ = grammar_.grammar_name(); 40 | grm_.reset(new GrmManager); 41 | if (!grm_->LoadArchive(grm_file)) { 42 | LoggerError("Error loading archive \"%s\" from \"%s\"", 43 | grammar_name_.c_str(), grm_file.c_str()); 44 | return false; 45 | } 46 | // Verifies that the rules named in the rule ordering all exist in the 47 | // grammar.
// LoadGrammar validation covers each rule's main(), parens(), and redup()
// FSTs. ApplyRules(Transducer, MutableTransducer*, bool): applies each rule
// in grammar order; a redup rule is best-effort (its Rewrite failure is
// ignored) and, when it succeeds, a doubled copy is unioned into the input.
48 | for (int i = 0; i < grammar_.rules_size(); ++i) { 49 | Rule rule = grammar_.rules(i); 50 | if (grm_->GetFst(rule.main()) == NULL) { 51 | LoggerError("Rule \"%s\" not found in \"%s\"", 52 | rule.main().c_str(), grammar_name_.c_str()); 53 | return false; 54 | } 55 | if (rule.has_parens() && grm_->GetFst(rule.parens()) == NULL) { 56 | LoggerError("Rule \"%s\" not found in \"%s\"", 57 | rule.parens().c_str(), grammar_name_.c_str()); 58 | return false; 59 | } 60 | if (rule.has_redup() && grm_->GetFst(rule.redup()) == NULL) { 61 | LoggerError("Rule \"%s\" not found in \"%s\"", 62 | rule.redup().c_str(), grammar_name_.c_str()); 63 | return false; 64 | } 65 | } 66 | return true; 67 | } 68 | 69 | bool RuleSystem::ApplyRules(const Transducer& input, 70 | MutableTransducer* output, 71 | bool use_lookahead) const { 72 | MutableTransducer mutable_input(input); 73 | for (int i = 0; i < grammar_.rules_size(); ++i) { 74 | Rule rule = grammar_.rules(i); 75 | if (rule.has_redup()) { 76 | const string& redup_rule = rule.redup(); 77 | MutableTransducer redup1; 78 | // Not an error if it fails. 79 | if (grm_->Rewrite(redup_rule, mutable_input, &redup1, "")) { 80 | MutableTransducer redup2(redup1); 81 | fst::Concat(redup1, &redup2); 82 | fst::Union(&mutable_input, redup2); 83 | fst::RmEpsilon(&mutable_input); 84 | } 85 | } 86 | const string& rule_name = rule.main(); 87 | string parens_rule = rule.has_parens() ?
// Lookahead composition is used only for rules without parens (i.e. not
// (M)PDTs); the lookahead FST is built on first use and cached in
// lookaheads_. NOTE(review): lookaheads_ is mutated inside a const method --
// presumably declared mutable in the header; this is not thread-safe --
// confirm. An empty result (NumStates() == 0) is treated as rule failure,
// and each rule's output feeds the next rule's input.
rule.parens() : ""; 88 | // Only use lookahead on non (M)PDT's 89 | bool success = true; 90 | if (parens_rule.empty() 91 | && use_lookahead) { 92 | std::map::iterator iter = 93 | lookaheads_.find(rule_name); 94 | LookaheadFst *lookahead_rule_fst; 95 | if (iter == lookaheads_.end()) { 96 | const Transducer *rule_fst = grm_->GetFst(rule_name); 97 | lookahead_rule_fst = new LookaheadFst(*rule_fst); 98 | lookaheads_[rule_name] = lookahead_rule_fst; 99 | } else { 100 | lookahead_rule_fst = iter->second; 101 | } 102 | LabelLookAheadRelabeler::Relabel(&mutable_input, 103 | *lookahead_rule_fst, 104 | false); 105 | fst::ComposeFst tmp_output(mutable_input, 106 | *lookahead_rule_fst); 107 | *output = tmp_output; 108 | if (output->NumStates() == 0) { 109 | success = false; 110 | } 111 | // Otherwise we just use the regular rewrite mechanism 112 | } else if (!grm_->Rewrite(rule_name, 113 | mutable_input, 114 | output, 115 | parens_rule 116 | ) 117 | || output->NumStates() == 0) { 118 | success = false; 119 | } 120 | if (!success) { 121 | LoggerError("Application of rule \"%s\" failed", rule_name.c_str()); 122 | return false; 123 | } 124 | mutable_input = *output; 125 | } 126 | // NB: We do NOT want to Project in this case because this will be the input 127 | // to the ProtobufParser, which needs the input-side epsilons in order to keep 128 | // track of positions in the input.
// String-facing overloads: compile the input as a byte-mode FST, apply the
// rules, then shortest-path / project-to-output / remove epsilons / print.
// FindRule: direct lookup of a rule FST by name in the grammar archive.
129 | fst::RmEpsilon(output); 130 | return true; 131 | } 132 | 133 | typedef fst::StringCompiler Compiler; 134 | typedef fst::StringPrinter Printer; 135 | 136 | bool RuleSystem::ApplyRules(const string& input, 137 | string* output, 138 | bool use_lookahead) const { 139 | Compiler compiler(fst::StringTokenType::BYTE); 140 | MutableTransducer input_fst, output_fst; 141 | if (!compiler.operator()(input, &input_fst)) { 142 | LoggerError("Failed to compile input string \"%s\"", input.c_str()); 143 | return false; 144 | } 145 | if (!ApplyRules(input_fst, &output_fst, use_lookahead)) return false; 146 | MutableTransducer shortest_path; 147 | fst::ShortestPath(output_fst, &shortest_path); 148 | fst::Project(&shortest_path, fst::PROJECT_OUTPUT); 149 | fst::RmEpsilon(&shortest_path); 150 | Printer printer(fst::StringTokenType::BYTE); 151 | if (!printer.operator()(shortest_path, output)) { 152 | LoggerError("Failed to print output string"); 153 | return false; 154 | } 155 | return true; 156 | } 157 | 158 | bool RuleSystem::ApplyRules(const Transducer& input, 159 | string* output, 160 | bool use_lookahead) const { 161 | MutableTransducer output_fst; 162 | if (!ApplyRules(input, &output_fst, use_lookahead)) return false; 163 | MutableTransducer shortest_path; 164 | fst::ShortestPath(output_fst, &shortest_path); 165 | fst::Project(&shortest_path, fst::PROJECT_OUTPUT); 166 | fst::RmEpsilon(&shortest_path); 167 | Printer printer(fst::StringTokenType::BYTE); 168 | if (!printer.operator()(shortest_path, output)) { 169 | LoggerError("Failed to print to output string"); 170 | return false; 171 | } 172 | return true; 173 | } 174 | 175 | const Transducer* RuleSystem::FindRule(const string& name) const { 176 | return grm_->GetFst(name); 177 | } 178 | 179 | 180 | } // namespace sparrowhawk 181 | } // namespace speech 182 | -------------------------------------------------------------------------------- /src/lib/sentence_boundary.cc:
-------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | #include 20 | using std::vector; 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace speech { 28 | namespace sparrowhawk { 29 | 30 | SentenceBoundary::SentenceBoundary(const string ®exp) : 31 | pad_exceptions_with_space_prefix_(true) { 32 | regexp_.reset(new Regexp); 33 | if (!regexp_->Compile(regexp)) { 34 | LoggerFatal("SentenceBoundary failed with bad regexp: %s", regexp.c_str()); 35 | } 36 | } 37 | 38 | bool SentenceBoundary::LoadSentenceBoundaryExceptions(const string &filename) { 39 | string raw = IOStream::LoadFileToString(filename); 40 | std::vector tokens = SplitString(raw, "\n", true /* skip_empty */); 41 | for (auto token : tokens) { 42 | token = StripWhitespace(token); 43 | // Having it as an unordered list is of course not very efficient for 44 | // search, but we do not expect these lists to be very long. 45 | // We pad with a space before it since most scripts that use end-of-sentence 46 | // markers ambiguously to denote abbreviations also use spaces to delimit 47 | // words. 48 | // TODO(rws): We should extend this to regexps to handle things like German 49 | // ordinals. 
50 | if (pad_exceptions_with_space_prefix_) 51 | sentence_boundary_exceptions_.push_back(" " + token); 52 | } 53 | return true; 54 | } 55 | 56 | std::vector SentenceBoundary::ExtractSentences( 57 | const string &input_text) const { 58 | std::vector potentials; 59 | regexp_->GetAllMatches(input_text, &potentials); 60 | std::vector cutpoints; 61 | int last = 0, i; 62 | for (i = 0; i < potentials.size(); ++i) { 63 | const int start = potentials[i].start_char; 64 | const int end = potentials[i].end_char; 65 | const string text_before = input_text.substr(last, start - last); 66 | const string marker = input_text.substr(start, end - start); 67 | const string text_after = input_text.substr(end); 68 | if (EvaluateCandidate(text_before, marker)) { 69 | cutpoints.push_back(end); 70 | last = end; 71 | } 72 | } 73 | std::vector result; 74 | last = 0; 75 | string sentence; 76 | for (int i = 0; i < cutpoints.size(); ++i) { 77 | sentence = StripWhitespace(input_text.substr(last, cutpoints[i] - last)); 78 | if (!sentence.empty()) result.push_back(sentence); 79 | last = cutpoints[i]; 80 | } 81 | sentence = StripWhitespace(input_text.substr(last)); 82 | if (!sentence.empty()) result.push_back(sentence); 83 | return result; 84 | } 85 | 86 | bool SentenceBoundary::EvaluateCandidate(const string &input_text, 87 | const string &marker) const { 88 | // Gets the previous sentence and the marker, minus any trailing whitespace. 89 | string previous = StripWhitespace(input_text + marker); 90 | int previous_length = previous.size(); 91 | for (const auto &exception : sentence_boundary_exceptions_) { 92 | int length = exception.size(); 93 | if (length <= previous_length && 94 | previous.substr(previous_length - length, length) == exception) 95 | return false; 96 | // If the exception starts with a space because we have added one, then also 97 | // check to see if this was the first token --- i.e. matches the entire 98 | // previous "sentence". 
99 | if (pad_exceptions_with_space_prefix_) { 100 | string stripped_exception = StripWhitespace(exception); 101 | if (previous == stripped_exception) return false; 102 | } 103 | } 104 | return true; 105 | } 106 | 107 | } // namespace sparrowhawk 108 | } // namespace speech 109 | -------------------------------------------------------------------------------- /src/lib/spec_serializer.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
// NOTE(review): wrapped render of src/lib/spec_serializer.cc; #include
// targets and template arguments (std::unique_ptr<Serializer>,
// std::vector<std::unique_ptr<StyleSerializer>>) were eaten by the renderer.
//
// Serializer::Create: for each ClassSpec, resolves semiotic_class against
// the Token descriptor's field names (nullptr on an unknown field) and
// builds one StyleSerializer per StyleSpec (nullptr if any fails);
// serializers_ is keyed by the resolved FieldDescriptor.
14 | #include 15 | 16 | #include 17 | #include 18 | using std::vector; 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | namespace speech { 27 | namespace sparrowhawk { 28 | 29 | using google::protobuf::Descriptor; 30 | using google::protobuf::FieldDescriptor; 31 | using google::protobuf::Reflection; 32 | 33 | namespace { 34 | 35 | typedef Serializer::MutableTransducer MutableTransducer; 36 | const char kClassSeparator[] = "|"; 37 | 38 | } // namespace 39 | 40 | std::unique_ptr Serializer::Create( 41 | const SerializeSpec &serialize_spec) { 42 | std::unique_ptr serializer(new Serializer()); 43 | const Descriptor *token_descriptor = Token::descriptor(); 44 | for (const ClassSpec &class_spec : serialize_spec.class_spec()) { 45 | const FieldDescriptor *class_descriptor = 46 | token_descriptor->FindFieldByName(class_spec.semiotic_class()); 47 | if (class_descriptor == nullptr) { 48 | LOG(ERROR) << "Cannot find " << class_spec.semiotic_class() 49 | << " field in Token proto"; 50 | return nullptr; 51 | } 52 | std::vector> &styles = 53 | serializer->serializers_[class_descriptor]; 54 | for (const StyleSpec &style_spec : class_spec.style_spec()) { 55 | auto style_serializer = StyleSerializer::Create(style_spec); 56 | if (style_serializer) { 57 | styles.push_back(std::move(style_serializer)); 58 | } else { 59 | return nullptr; 60 | } 61 | } 62 | } 63 | return serializer; 64 | } 65 | 66 | MutableTransducer Serializer::Serialize(const Token &token) const { 67 | MutableTransducer fst; 68 | const Reflection *reflection = token.GetReflection(); 69 | for (const auto &candidate_class : serializers_) { 70 | if (reflection->HasField(token, candidate_class.first)) { 71 | string_compiler_(candidate_class.first->name() + kClassSeparator, 72 | &fst); 73 | MutableTransducer fst_styles; 74 | for (const auto &candidate_style : candidate_class.second) { 75 | MutableTransducer fst_style; 76 | fst_style.SetStart(fst_style.AddState()); 77 |
// Serialize: for each class field set on the token, compiles
// "<field-name>|" into the result FST, then concatenates the union of all
// successful style serializations (a style whose Serialize returns false is
// silently skipped). The tail of this region is the license header of
// src/lib/string_utils.cc.
fst_style.SetFinal(0, 1); 78 | if (candidate_style->Serialize(token, &fst_style)) { 79 | Union(&fst_styles, fst_style); 80 | } 81 | } 82 | Concat(&fst, fst_styles); 83 | } 84 | } 85 | return fst; 86 | } 87 | 88 | } // namespace sparrowhawk 89 | } // namespace speech 90 | -------------------------------------------------------------------------------- /src/lib/string_utils.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc.
14 | #include 15 | 16 | #include 17 | using std::string; 18 | #include 19 | using std::vector; 20 | 21 | namespace speech { 22 | namespace sparrowhawk { 23 | 24 | std::vector SplitString(const string &s, const string &delims) { 25 | return SplitString(s, delims, false); 26 | } 27 | 28 | std::vector SplitString(const string &s, 29 | const string &delims, 30 | bool skip_empty) { 31 | std::vector out; 32 | if (s.empty()) { 33 | return out; 34 | } 35 | 36 | string::size_type len = s.length(), i = 0, pos = 0; 37 | do { 38 | if ((i = s.find_first_of(delims, pos)) == string::npos) { 39 | string substring = s.substr(pos); 40 | if (skip_empty && substring.empty()) continue; 41 | out.push_back(substring); 42 | } else { 43 | if (pos != i) { 44 | string substring = s.substr(pos, i - pos); 45 | if (skip_empty && substring.empty()) continue; 46 | out.push_back(substring); 47 | } 48 | pos = i + 1; 49 | } 50 | } while (i != string::npos && pos < len); 51 | return out; 52 | } 53 | 54 | string StripWhitespace(const string &s) { 55 | int start = s.find_first_not_of(" \t\n"); 56 | if (start == string::npos) return ""; 57 | int end = s.find_last_not_of(" \t\n"); 58 | return s.substr(start, end - start + 1); 59 | } 60 | 61 | } // namespace sparrowhawk 62 | } // namespace speech 63 | -------------------------------------------------------------------------------- /src/lib/style_serializer.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | #include 20 | using std::vector; 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace speech { 32 | namespace sparrowhawk { 33 | 34 | using google::protobuf::Descriptor; 35 | using google::protobuf::FieldDescriptor; 36 | using google::protobuf::Reflection; 37 | using google::protobuf::TextFormat; 38 | using google::protobuf::Message; 39 | 40 | bool StyleSerializer::CreateRecordSerializers( 41 | const StyleSpec &style_spec, 42 | const std::unique_ptr &style_serializer) { 43 | for (const RecordSpec &record_spec : style_spec.record_spec()) { 44 | auto record_serializer = RecordSerializer::Create(record_spec); 45 | if (record_serializer) { 46 | style_serializer->record_serializers_.push_back( 47 | std::move(record_serializer)); 48 | } else { 49 | return false; 50 | } 51 | } 52 | return true; 53 | } 54 | 55 | bool StyleSerializer::SetRequiredFieldPaths( 56 | const StyleSpec &style_spec, 57 | const std::unique_ptr &style_serializer) { 58 | const Descriptor *token_descriptor = Token::descriptor(); 59 | for (const string &required_fields : style_spec.required_fields()) { 60 | std::vector any_of; 61 | for (const auto &required_field : 62 | SplitString(required_fields, "|")) { 63 | std::unique_ptr field_path = 64 | FieldPath::Create(token_descriptor); 65 | any_of.push_back(*field_path); 66 | if (!any_of.back().Parse(required_field)) { 67 | LOG(ERROR) << "FieldPath failed to parse for required field: " 68 | << required_field; 69 | return false; 70 | } 71 | } 72 | style_serializer->required_fields_.push_back(std::move(any_of)); 73 | } 74 | return true; 75 | } 76 | 77 | bool StyleSerializer::SetProhibitedFieldPaths( 78 | const StyleSpec &style_spec, 79 
| const std::unique_ptr &style_serializer) { 80 | const Descriptor *token_descriptor = Token::descriptor(); 81 | for (const string &prohibited_field : style_spec.prohibited_fields()) { 82 | std::vector &prohibited_fields = 83 | style_serializer->prohibited_fields_; 84 | std::unique_ptr field_path = 85 | FieldPath::Create(token_descriptor); 86 | prohibited_fields.push_back(*field_path); 87 | if (!prohibited_fields.back().Parse(prohibited_field)) { 88 | LOG(ERROR) << "FieldPath failed to parse for prohibited field: " 89 | << prohibited_field; 90 | return false; 91 | } 92 | } 93 | return true; 94 | } 95 | 96 | std::unique_ptr StyleSerializer::Create( 97 | const StyleSpec &style_spec) { 98 | std::unique_ptr style_serializer(new StyleSerializer()); 99 | if (!CreateRecordSerializers(style_spec, style_serializer) || 100 | !SetRequiredFieldPaths(style_spec, style_serializer) || 101 | !SetProhibitedFieldPaths(style_spec, style_serializer)) { 102 | return nullptr; 103 | } 104 | return style_serializer; 105 | } 106 | 107 | bool StyleSerializer::IsFieldSet(const Message &root, 108 | const FieldPath &field_path) const { 109 | const Message *parent; 110 | const FieldDescriptor *field; 111 | if (!field_path.Follow(root, &parent, &field)) { 112 | LOG(ERROR) << "FieldPath traversal failed for input Message " 113 | << root.DebugString(); 114 | return false; 115 | } 116 | const Reflection *parent_reflection = parent->GetReflection(); 117 | if (field->label() == FieldDescriptor::LABEL_REPEATED) { 118 | // The field is assumed to be a scalar here. 
119 | if (parent_reflection->FieldSize(*parent, field) == 0) { 120 | return false; 121 | } 122 | } else if (!parent_reflection->HasField(*parent, field)) { 123 | return false; 124 | } 125 | return true; 126 | } 127 | 128 | bool StyleSerializer::CheckRequiredFields(const Token &token) const { 129 | for (const std::vector &field_paths : required_fields_) { 130 | bool found = false; 131 | for (const FieldPath &field_path : field_paths) { 132 | if (IsFieldSet(token, field_path)) { 133 | found = true; 134 | break; 135 | } 136 | } 137 | if (!found) { 138 | return false; 139 | } 140 | } 141 | return true; 142 | } 143 | 144 | bool StyleSerializer::CheckProhibitedFields(const Token &token) const { 145 | for (const FieldPath &field_path : prohibited_fields_) { 146 | if (IsFieldSet(token, field_path)) { 147 | return false; 148 | } 149 | } 150 | return true; 151 | } 152 | 153 | bool StyleSerializer::Serialize(const Token &token, 154 | MutableTransducer *serialization) const { 155 | if (!CheckRequiredFields(token) || !CheckProhibitedFields(token)) { 156 | return false; 157 | } 158 | for (const auto &record_serializer : record_serializers_) { 159 | if (!record_serializer->Serialize(token, serialization)) { 160 | LOG(ERROR) << "Record serialization failure for token " + token.name(); 161 | return false; 162 | } 163 | } 164 | return true; 165 | } 166 | 167 | 168 | } // namespace sparrowhawk 169 | } // namespace speech 170 | -------------------------------------------------------------------------------- /src/proto/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_noinst_DATA = items.proto \ 2 | links.proto \ 3 | rule_order.proto \ 4 | semiotic_classes.proto \ 5 | serialization_spec.proto \ 6 | sparrowhawk_configuration.proto 7 | 8 | CC_OUT = $(srcdir)/../lib 9 | H_OUT = $(srcdir)/../include/sparrowhawk 10 | 11 | %.pb.cc %.pb.h: %.proto 12 | $(PROTOC) --proto_path=$(srcdir) --cpp_out=$(srcdir) $^ 13 | cp $*.pb.h $(H_OUT) 14 | cp 
	cp $*.pb.cc $(CC_OUT)
# NOTE(review): the leading "cp" of the recipe line above arrived on the
# previous physical line of this dump; re-joined here so the rule stays valid.

# Generated protobuf sources/headers; removed by "make mostlyclean".
MOSTLYCLEANFILES = items.pb.h items.pb.cc \
                   links.pb.h links.pb.cc \
                   rule_order.pb.h rule_order.pb.cc \
                   semiotic_classes.pb.h semiotic_classes.pb.cc \
                   serialization_spec.pb.h serialization_spec.pb.cc \
                   sparrowhawk_configuration.pb.h sparrowhawk_configuration.pb.cc

all: $(MOSTLYCLEANFILES)

--------------------------------------------------------------------------------
/src/proto/items.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
syntax = "proto2";

// TODO(rws): Probably phase out links since we are not using it.
import "links.proto";
import "semiotic_classes.proto";

package speech.sparrowhawk;

// Message containing the contents for a single token as determined by the
// tokenizer. Roughly speaking, a token corresponds to a single verbalizable
// entity, such as a single word, or single semiotic object such as "$15.60".
message Token {
  // Describes the kind of entity this token represents.
  enum Type {
    // A known word which is present in the lexicon.
    WORD = 1;

    // A semiotic class.
    SEMIOTIC_CLASS = 2;

    // Punctuation which is not expected to be pronounced.
    PUNCT = 3;

    // A word, but requires some further verbalization work.
    // For example, Thai words with a trailing repetition character.
    WORD_NEEDS_VERBALIZATION = 4;
  }

  // General pause duration lengths.
  enum PauseLength {
    PAUSE_NONE = 0;    // No pause.
    PAUSE_SHORT = 1;   // Brief pause, eg. for brackets or quotes.
    PAUSE_MEDIUM = 2;  // Longer pause, for a comma or similar.
    PAUSE_LONG = 3;    // Longest pause, for a fullstop or phrase break.
  }

  // Structural relationships. The children are words.
  // TODO(rws): Probably phase out links since we are not using it.
  optional Links links = 1;

  // Optional information about where this token came from in the
  // original input.
  // Indices are given in Unicode codepoints (*not* byte indices).
  optional uint32 start_index = 2;
  optional uint32 end_index = 3;

  // The name of the token, which is generally the original unnormalized text
  // the token was generated from.
  //
  // Voice Building Note: This field appears on ScriptLine protos that serve as
  // input to voice building.
  optional string name = 4;

  // Basic type of the token (see enum comments).
  //
  // Voice Building Note: This field appears on ScriptLine protos that serve as
  // input to voice building.
  optional Type type = 5;

  // The wordid of the token, when a single one is known.
  // Set when type == WORD
  optional string wordid = 6;

  // If the token is a word, this represents the regular lower-cased spelling of
  // that word.
  optional string spelling = 7;

  // If true, this token is a phrase break break.
  optional bool phrase_break = 8;

  // Indicates a pause of given length, in seconds. Used when pause given from
  // markup.
  // Currently unused.
  optional float pause_duration = 9;

  // If set, indicates a general length of pause that should be introduced
  // for synthesis. For example, a fullstop would generally generate
  // a longer pause than a comma.
  // Currently unused.
  optional PauseLength pause_length = 10 [default = PAUSE_NONE];

  // This is used to store spelling with stress mark produced
  // by stress assigner or provided in input text.
  // Currently unused.
  optional string spelling_with_stress = 11;

  // If true, don't verbalize this token. Used to skip tokens that are part of a
  // multi-token semiotic class, or bypass homograph resolution when explicit
  // wordids are provided.
  optional bool skip = 12;

  // Is true if a space follows this token. E.g., after tokenization in
  // Chinese/Japanese.
  // Currently unused.
  optional bool next_space = 13;

  // All the following (fields in the range [14-27]) are used when
  // the token represents a semiotic class. In such a case, one of these
  // is filled by the output from the classifier/parser stage.
  // Alternatively, if part of the input was given as markup, it will
  // be copied from the input to these fields.
  optional Cardinal cardinal = 14;
  optional Ordinal ordinal = 15;
  optional string digit = 16;
  optional Decimal decimal = 17;
  optional Fraction fraction = 18;
  optional Time time = 19;
  optional Measure measure = 20;
  optional Decimal percent = 21;
  optional Date date = 22;
  optional Telephone telephone = 23;
  optional Money money = 24;
  optional Electronic electronic = 25;
  optional string verbatim = 26;
  optional string letters = 27;

  // Tokens defined by things they connect, for example "-" in "Mon-Fri",
  // ":" in "1:1", etc.
  optional Connector connector = 28;

  // Abbreviations, intended for languages where they may inflect depending
  // on case etc.
  optional Abbreviation abbreviation = 29;

  // Indices of the first and last words.
  optional int32 first_daughter = 30;

  optional int32 last_daughter = 31;

  extensions 1000 to max;
}

// A single word
message Word {
  // Structural relationships. The parent items are tokens.
  // TODO(rws): Probably phase out links since we are not using it.
  optional Links links = 1;

  // The id of the word, predominantly used as a key into the lexicon.
  optional string id = 2;

  // The conventional spelling of the word.
  // There can be several spellings matching one id in the lexicon
  // (e.g. colour, color correspond to the same wordid) and vice versa
  // (spelling "project" maps to ids "project_nou" and "project_vrb").
  optional string spelling = 3;

  // If set, indicates the length of pause that should be generated for
  // this word, in seconds. Only applies to the special word "sil".
  // Currently unused.
  optional float pause_length = 4;

  // True when the prosodic_features have specified that there should (value
  // true) or should not (value false) be a pause just after this word, either
  // because contains_pause was specified in an utterance in which this was the
  // penultimate word, or because precedes_pause was specified in an utterance
  // in which this was the last word.
  // Currently unused.
  optional bool precedes_pause = 5;

  // Parent token
  optional int32 parent = 6;

  extensions 1000 to max;

}

// A single utterance's linguistic structure
message LinguisticStructure {
  // ID uniquely identifying this utterance. If used in asynchronous mode
  // the utterance IDs can be used to match multiply emitted utterances
  // generated from a single source. 64-bit integer is used to
  // accommodate utterance ID as a timestamp.
  optional int64 id = 1;
  optional string input = 2;  // The original sentence.
  repeated Token tokens = 3;
  repeated Word words = 4;

  extensions 1000 to max;
}

// An utterance
message Utterance {
  // An arbitrary identifier used to identify the utterance for debugging
  // purposes. The controller assigns this id internally when it creates an
  // utterance, and the id is unique (with high probability) within the process.
  // Currently unused.
  optional uint64 id = 1;

  // If loaded from file, the filename (usually without a path). Mainly intended
  // as a human-readable identifier for debugging purposes.
  // Currently unused.
  optional string filename = 2;

  // This field can be mutated by various text pre-processing streams, such as
  // character segmenters and text filters.
  // Currently unused.
  optional string sentence = 3;

  // Copy of the original sentence that is guaranteed not to be changed by the
  // pipeline.
  // Currently unused.
  optional string original_sentence = 4;

  // If segmentation was applied on the original sentences, the following field
  // will contain the results of the segmentation. Each string corresponds to
  // an individual sentence.
  // Currently unused.
  repeated string segmenter_output = 5;

  // Linguistic streams, words, tokens etc.
  optional LinguisticStructure linguistic = 6;

  extensions 1000 to max;
}

--------------------------------------------------------------------------------
/src/proto/links.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// Defines the relationship between items in the linguistic structure and other
// parts of the utterance, maintaining hierarchies such as syllables being
// parents of phonemes, words being parents of syllables etc.
//
// Unless otherwise noted, we use global 0-based indices within an utterance.
// For example, the 20th phoneme in the utterance will have index 19, even
// though it may be the 1st phoneme in its syllable.

syntax = "proto2";

package speech.sparrowhawk;

message Links {
  // The index of this entity; mainly useful for debugging purposes.
  optional int32 own_index = 1;

  // The index of the parent entity of the current entity.
  optional int32 parent = 2;

  // The index of the first child of the current entity.
  optional int32 first_child = 3;

  // The index of the last child of the current entity.
  optional int32 last_child = 4;
}

--------------------------------------------------------------------------------
/src/proto/rule_order.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// Definition of ordering of rules to be applied. Each rule, and an optional
// pushdown-transducer parenthesis set, is applied (composed) to the input in
// the order specified.
//
// If a reduplication rule (also a Thrax target) is supplied, then that will
// optionally copy any matching input. This is most useful in cases where it is
// desirable to copy the entire to-be-verbalized token. For example, with money
// where one can copy an entire money token, and then read the major currency
// off the first copy and the minor currency off the second. See
//
// Ebden, Peter and Sproat, Richard. 2015. The Kestrel TTS text normalization
// system. Natural Language Engineering, Issue 03, pp 333-353.
//
// for further discussion.
//
// See the Thrax documentation at
// http://www.openfst.org/twiki/bin/view/GRM/ThraxQuickTour for discussion of
// PDTs and (coming soon) MPTDs.

syntax = "proto2";

package speech.sparrowhawk;

// A single normalization rule: the Thrax FST to compose with the input,
// plus optional PDT/MPDT companions and an optional reduplication rule
// (see the file-header comment above for how redup is used).
message Rule {
  required string main = 1;         // Main normalization rule.
  optional string parens = 2;       // Optional PDT parens.
  optional string assignments = 3;  // Optional MPDT assignments.
  optional string redup = 4;        // Optional reduplication rule.
};

// A grammar archive plus the ordered list of rules to apply from it.
message Grammar {
  required string grammar_file = 1;
  required string grammar_name = 2;  // Name for this grammar.
  repeated Rule rules = 3;
};

--------------------------------------------------------------------------------
/src/proto/serialization_spec.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// Proto messages describing specifications for serializing semiotic classes.
// These serializations determine the input to the verbalization grammars.
// TODO(drasha) consider changing the name to serialize_spec.proto for
// consistency.

syntax = "proto2";

package speech.sparrowhawk;

// Specification for serializing a sub-part of a semiotic class. RecordSpecs may
// be simple, such as a single field, or recursively combine additional
// RecordSpecs to specify more elaborate formats.
// For a repeated scalar field, we simply serialize all the values in the
// token for this field in an identical fashion, respecting the original
// order.
// NB. Assumes there are no repeated embedded messages in semiotic_classes.proto
message RecordSpec {
  // The serialization for these RecordSpecs will be emitted prior to every
  // instance of the main field for this spec.
  repeated RecordSpec prefix_spec = 1;

  // The serialization for these RecordSpecs will be emitted after every
  // instance of the main field for this spec.
  repeated RecordSpec suffix_spec = 2;

  // Field serialization specification: the fields below are used to include a
  // value from the input proto in the serialization. This record will only be
  // included in the output serialization if this field is present in the input,
  // a default value is supplied, or a one_of field is given.

  // The path (from the top-level token, in proto_path.h format) to this field.
  // If the label field is not set, the terminal portion of this will be used as
  // the label in the serialized output.
  optional string field_path = 3;

  // Defines the record label in the serialization. This should be set only to
  // override the use of the terminal field name from the field path as the
  // default label.
  optional string label = 4;

  // String defining the value to be used for the field in case it is not set.
  // Note that prefix and suffix records with default values will not be
  // serialized if the parent record is missing. The default value is
  // well-defined only for singular fields and is ignored otherwise.
  optional string default_value = 5;
}

// Specification for serializing a semiotic class in a particular style.
// StyleSpecs provide required and prohibited fields to help determine the style
// to be used for verbalization.
message StyleSpec {
  // Gives the specification for how tokens should be serialized in this style.
  // The serialization components for this style will be emitted in the same
  // order as the record specs in this field.
  repeated RecordSpec record_spec = 1;

  // When more than one serialization style is used for a semiotic class, it may
  // be possible to infer that a serialization is inappropriate due to the
  // presence or absence of a particular field. The following fields provide a
  // mechanism to do this.

  // This serialization will not be emitted unless all of the fields referred to
  // here are present. A single instance can have multiple fields (separated by
  // "|") from which at least one field is required for serialization.
  repeated string required_fields = 2;

  // This serialization will not be emitted if any of the fields referred to
  // here are present.
  repeated string prohibited_fields = 3;
}

// Specification of a serialization format for a particular semiotic class.
message ClassSpec {
  // Indicates the type of token that may be serialized by this spec: those with
  // this field present, e.g. "cardinal" or "measure".
  optional /* required */ string semiotic_class = 1;

  // Denotes the style within the semiotic class. StyleSpecs augment ClassSpec
  // by enabling multiple ways of verbalizing the same semiotic class.
  repeated StyleSpec style_spec = 2;
}

// Collection of all serialization specs for a language. A single semiotic class
// may have more than one specification, and all matching serializations for
// that class will be included as paths in the output.
message SerializeSpec {
  repeated ClassSpec class_spec = 1;
}

--------------------------------------------------------------------------------
/src/proto/sparrowhawk_configuration.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// The SparrowhawkConfiguration stores information about the grammars, and rules
// to run.

syntax = "proto2";

package speech.sparrowhawk;

message SparrowhawkConfiguration {
  optional string tokenizer_grammar = 1;  // Tokenizer-classifier.

  optional string verbalizer_grammar = 2;

  // Regular expression for sentence boundary detector. This is a set
  // of possible end-of-sentence markers.
  optional string sentence_boundary_regexp = 3;

  // Optional file specifying tokens that end in a possible end-of-sentence
  // marker that should *not* usually induce an end-of-sentence decision
  // e.g. “Mr.”
  optional string sentence_boundary_exceptions_file = 4;

  // Optional file with SerializeSpec for verbalizer as a text proto. If the
  // field is not set, we resort to the protobuf serializer.
  optional string serialization_spec = 5;
}