├── AUTHORS ├── INSTALL ├── LICENSE ├── Makefile.am ├── Makefile.in ├── NEWS ├── README ├── aclocal.m4 ├── ar-lib ├── compile ├── config.guess ├── config.sub ├── configure ├── configure.ac ├── data └── festival-2.4_sparrowhawk.patch ├── depcomp ├── documentation ├── README.md └── grammars │ ├── en_toy │ ├── README │ ├── byte.far │ ├── byte.grm │ ├── classify │ │ ├── cardinal.grm │ │ ├── date.grm │ │ ├── measure.grm │ │ ├── measure.tsv │ │ ├── money.grm │ │ ├── money.tsv │ │ ├── months.tsv │ │ ├── punctuation.grm │ │ ├── time.grm │ │ ├── tokenize_and_classify.grm │ │ └── word.grm │ ├── util.far │ ├── util.grm │ ├── verbalize │ │ ├── CARDINAL_NUMBER_NAME │ │ ├── ORDINAL_NUMBER_NAME │ │ ├── date.grm │ │ ├── measure.grm │ │ ├── money.grm │ │ ├── money.tsv │ │ ├── numbers.grm │ │ ├── time.grm │ │ ├── verbalize.grm │ │ └── verbatim.grm │ └── verbalize_serialization │ │ ├── CARDINAL_NUMBER_NAME │ │ ├── ORDINAL_NUMBER_NAME │ │ ├── date.grm │ │ ├── measure.grm │ │ ├── money.grm │ │ ├── money.tsv │ │ ├── numbers.grm │ │ ├── time.grm │ │ ├── verbalize.grm │ │ └── verbatim.grm │ ├── sentence_boundary_exceptions.txt │ ├── sparrowhawk_configuration.ascii_proto │ ├── sparrowhawk_configuration_serialization.ascii_proto │ ├── test.txt │ ├── tokenizer.ascii_proto │ ├── verbalizer.ascii_proto │ ├── verbalizer_serialization.ascii_proto │ └── verbalizer_serialization_spec.ascii_proto ├── install-sh ├── ltmain.sh ├── m4 ├── libtool.m4 ├── ltoptions.m4 ├── ltsugar.m4 ├── ltversion.m4 └── lt~obsolete.m4 ├── missing └── src ├── Makefile.am ├── Makefile.in ├── bin ├── Makefile.am ├── Makefile.in └── normalizer_main.cc ├── include ├── Makefile.am ├── Makefile.in └── sparrowhawk │ ├── field_path.h │ ├── io_utils.h │ ├── items.pb.h │ ├── links.pb.h │ ├── logger.h │ ├── normalizer.h │ ├── numbers.h │ ├── protobuf_parser.h │ ├── protobuf_serializer.h │ ├── record_serializer.h │ ├── regexp.h │ ├── rule_order.pb.h │ ├── rule_system.h │ ├── semiotic_classes.pb.h │ ├── sentence_boundary.h │ ├── 
sparrowhawk_configuration.pb.h │ ├── spec_serializer.h │ ├── string_utils.h │ └── style_serializer.h ├── lib ├── Makefile.am ├── Makefile.in ├── field_path.cc ├── io_utils.cc ├── items.pb.cc ├── links.pb.cc ├── normalizer.cc ├── normalizer_utils.cc ├── numbers.cc ├── protobuf_parser.cc ├── protobuf_serializer.cc ├── record_serializer.cc ├── regexp.cc ├── rule_order.pb.cc ├── rule_system.cc ├── semiotic_classes.pb.cc ├── sentence_boundary.cc ├── serialization_spec.pb.cc ├── sparrowhawk_configuration.pb.cc ├── spec_serializer.cc ├── string_utils.cc └── style_serializer.cc └── proto ├── Makefile.am ├── Makefile.in ├── items.proto ├── links.proto ├── rule_order.proto ├── semiotic_classes.proto ├── serialization_spec.proto └── sparrowhawk_configuration.proto /AUTHORS: -------------------------------------------------------------------------------- 1 | Google Inc. 2 | 3 | -------------------------------------------------------------------------------- /INSTALL: -------------------------------------------------------------------------------- 1 | Installation Instructions 2 | ************************* 3 | 4 | Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, 5 | 2006, 2007 Free Software Foundation, Inc. 6 | 7 | This file is free documentation; the Free Software Foundation gives 8 | unlimited permission to copy, distribute and modify it. 9 | 10 | Basic Installation 11 | ================== 12 | 13 | Briefly, the shell commands `./configure; make; make install' should 14 | configure, build, and install this package. The following 15 | more-detailed instructions are generic; see the `README' file for 16 | instructions specific to this package. 17 | 18 | The `configure' shell script attempts to guess correct values for 19 | various system-dependent variables used during compilation. It uses 20 | those values to create a `Makefile' in each directory of the package. 21 | It may also create one or more `.h' files containing system-dependent 22 | definitions. 
Finally, it creates a shell script `config.status' that 23 | you can run in the future to recreate the current configuration, and a 24 | file `config.log' containing compiler output (useful mainly for 25 | debugging `configure'). 26 | 27 | It can also use an optional file (typically called `config.cache' 28 | and enabled with `--cache-file=config.cache' or simply `-C') that saves 29 | the results of its tests to speed up reconfiguring. Caching is 30 | disabled by default to prevent problems with accidental use of stale 31 | cache files. 32 | 33 | If you need to do unusual things to compile the package, please try 34 | to figure out how `configure' could check whether to do them, and mail 35 | diffs or instructions to the address given in the `README' so they can 36 | be considered for the next release. If you are using the cache, and at 37 | some point `config.cache' contains results you don't want to keep, you 38 | may remove or edit it. 39 | 40 | The file `configure.ac' (or `configure.in') is used to create 41 | `configure' by a program called `autoconf'. You need `configure.ac' if 42 | you want to change it or regenerate `configure' using a newer version 43 | of `autoconf'. 44 | 45 | The simplest way to compile this package is: 46 | 47 | 1. `cd' to the directory containing the package's source code and type 48 | `./configure' to configure the package for your system. 49 | 50 | Running `configure' might take a while. While running, it prints 51 | some messages telling which features it is checking for. 52 | 53 | 2. Type `make' to compile the package. 54 | 55 | 3. Optionally, type `make check' to run any self-tests that come with 56 | the package. 57 | 58 | 4. Type `make install' to install the programs and any data files and 59 | documentation. 60 | 61 | 5. You can remove the program binaries and object files from the 62 | source code directory by typing `make clean'. 
To also remove the 63 | files that `configure' created (so you can compile the package for 64 | a different kind of computer), type `make distclean'. There is 65 | also a `make maintainer-clean' target, but that is intended mainly 66 | for the package's developers. If you use it, you may have to get 67 | all sorts of other programs in order to regenerate files that came 68 | with the distribution. 69 | 70 | 6. Often, you can also type `make uninstall' to remove the installed 71 | files again. 72 | 73 | Compilers and Options 74 | ===================== 75 | 76 | Some systems require unusual options for compilation or linking that the 77 | `configure' script does not know about. Run `./configure --help' for 78 | details on some of the pertinent environment variables. 79 | 80 | You can give `configure' initial values for configuration parameters 81 | by setting variables in the command line or in the environment. Here 82 | is an example: 83 | 84 | ./configure CC=c99 CFLAGS=-g LIBS=-lposix 85 | 86 | *Note Defining Variables::, for more details. 87 | 88 | Compiling For Multiple Architectures 89 | ==================================== 90 | 91 | You can compile the package for more than one kind of computer at the 92 | same time, by placing the object files for each architecture in their 93 | own directory. To do this, you can use GNU `make'. `cd' to the 94 | directory where you want the object files and executables to go and run 95 | the `configure' script. `configure' automatically checks for the 96 | source code in the directory that `configure' is in and in `..'. 97 | 98 | With a non-GNU `make', it is safer to compile the package for one 99 | architecture at a time in the source code directory. After you have 100 | installed the package for one architecture, use `make distclean' before 101 | reconfiguring for another architecture. 
102 | 103 | Installation Names 104 | ================== 105 | 106 | By default, `make install' installs the package's commands under 107 | `/usr/local/bin', include files under `/usr/local/include', etc. You 108 | can specify an installation prefix other than `/usr/local' by giving 109 | `configure' the option `--prefix=PREFIX'. 110 | 111 | You can specify separate installation prefixes for 112 | architecture-specific files and architecture-independent files. If you 113 | pass the option `--exec-prefix=PREFIX' to `configure', the package uses 114 | PREFIX as the prefix for installing programs and libraries. 115 | Documentation and other data files still use the regular prefix. 116 | 117 | In addition, if you use an unusual directory layout you can give 118 | options like `--bindir=DIR' to specify different values for particular 119 | kinds of files. Run `configure --help' for a list of the directories 120 | you can set and what kinds of files go in them. 121 | 122 | If the package supports it, you can cause programs to be installed 123 | with an extra prefix or suffix on their names by giving `configure' the 124 | option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. 125 | 126 | Optional Features 127 | ================= 128 | 129 | Some packages pay attention to `--enable-FEATURE' options to 130 | `configure', where FEATURE indicates an optional part of the package. 131 | They may also pay attention to `--with-PACKAGE' options, where PACKAGE 132 | is something like `gnu-as' or `x' (for the X Window System). The 133 | `README' should mention any `--enable-' and `--with-' options that the 134 | package recognizes. 135 | 136 | For packages that use the X Window System, `configure' can usually 137 | find the X include and library files automatically, but if it doesn't, 138 | you can use the `configure' options `--x-includes=DIR' and 139 | `--x-libraries=DIR' to specify their locations. 
140 | 141 | Specifying the System Type 142 | ========================== 143 | 144 | There may be some features `configure' cannot figure out automatically, 145 | but needs to determine by the type of machine the package will run on. 146 | Usually, assuming the package is built to be run on the _same_ 147 | architectures, `configure' can figure that out, but if it prints a 148 | message saying it cannot guess the machine type, give it the 149 | `--build=TYPE' option. TYPE can either be a short name for the system 150 | type, such as `sun4', or a canonical name which has the form: 151 | 152 | CPU-COMPANY-SYSTEM 153 | 154 | where SYSTEM can have one of these forms: 155 | 156 | OS KERNEL-OS 157 | 158 | See the file `config.sub' for the possible values of each field. If 159 | `config.sub' isn't included in this package, then this package doesn't 160 | need to know the machine type. 161 | 162 | If you are _building_ compiler tools for cross-compiling, you should 163 | use the option `--target=TYPE' to select the type of system they will 164 | produce code for. 165 | 166 | If you want to _use_ a cross compiler, that generates code for a 167 | platform different from the build platform, you should specify the 168 | "host" platform (i.e., that on which the generated programs will 169 | eventually be run) with `--host=TYPE'. 170 | 171 | Sharing Defaults 172 | ================ 173 | 174 | If you want to set default values for `configure' scripts to share, you 175 | can create a site shell script called `config.site' that gives default 176 | values for variables like `CC', `cache_file', and `prefix'. 177 | `configure' looks for `PREFIX/share/config.site' if it exists, then 178 | `PREFIX/etc/config.site' if it exists. Or, you can set the 179 | `CONFIG_SITE' environment variable to the location of the site script. 180 | A warning: not all `configure' scripts look for a site script. 
181 | 182 | Defining Variables 183 | ================== 184 | 185 | Variables not defined in a site shell script can be set in the 186 | environment passed to `configure'. However, some packages may run 187 | configure again during the build, and the customized values of these 188 | variables may be lost. In order to avoid this problem, you should set 189 | them in the `configure' command line, using `VAR=value'. For example: 190 | 191 | ./configure CC=/usr/local2/bin/gcc 192 | 193 | causes the specified `gcc' to be used as the C compiler (unless it is 194 | overridden in the site shell script). 195 | 196 | Unfortunately, this technique does not work for `CONFIG_SHELL' due to 197 | an Autoconf bug. Until the bug is fixed you can use this workaround: 198 | 199 | CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash 200 | 201 | `configure' Invocation 202 | ====================== 203 | 204 | `configure' recognizes the following options to control how it operates. 205 | 206 | `--help' 207 | `-h' 208 | Print a summary of the options to `configure', and exit. 209 | 210 | `--version' 211 | `-V' 212 | Print the version of Autoconf used to generate the `configure' 213 | script, and exit. 214 | 215 | `--cache-file=FILE' 216 | Enable the cache: use and save the results of the tests in FILE, 217 | traditionally `config.cache'. FILE defaults to `/dev/null' to 218 | disable caching. 219 | 220 | `--config-cache' 221 | `-C' 222 | Alias for `--cache-file=config.cache'. 223 | 224 | `--quiet' 225 | `--silent' 226 | `-q' 227 | Do not print messages saying which checks are being made. To 228 | suppress all normal output, redirect it to `/dev/null' (any error 229 | messages will still be shown). 230 | 231 | `--srcdir=DIR' 232 | Look for the package's source code in directory DIR. Usually 233 | `configure' can determine that directory automatically. 234 | 235 | `configure' also accepts some other, not widely useful, options. Run 236 | `configure --help' for more details. 
237 | 238 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = src 2 | ACLOCAL_AMFLAGS = -I m4 3 | EXTRA_DIST = LICENSE data documentation 4 | 5 | 6 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | Sparrowhawk - Release 0.1 2 | 3 | This is the alpha version. 4 | 5 | Sparrowhawk - Release 1.0 6 | 7 | * Added new verbalizer serialization, with accompanying grammars. 8 | 9 | 10 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Sparrowhawk - Release 1.0 2 | 3 | Sparrowhawk is an open-source implementation of Google's Kestrel text-to-speech 4 | text normalization system. It follows the discussion of the Kestrel system as 5 | described in: 6 | 7 | Ebden, Peter and Sproat, Richard. 2015. The Kestrel TTS text normalization 8 | system. Natural Language Engineering, Issue 03, pp 333-353. 9 | 10 | After sentence segmentation (sentence_boundary.h), the individual sentences are 11 | first tokenized with each token being classified, and then passed to the 12 | normalizer. The system can output as an unannotated string of words, and richer 13 | annotation with links between input tokens, their input string positions, and 14 | the output words is also available. 15 | 16 | REQUIREMENTS: 17 | 18 | This version is known to work under Linux using g++ (>= 4.6) and 19 | MacOS X using XCode 5. Expected to work wherever adequate POSIX 20 | (dlopen, ssize_t, basename), c99 (snprintf, strtoll, ), 21 | and C++11 (, , ) support 22 | are available. 
23 | 24 | You must have installed the following packages: 25 | 26 | - OpenFst 1.5.4 or higher (www.openfst.org) 27 | - Thrax 1.2.2 or higher (http://www.openfst.org/twiki/bin/view/GRM/Thrax) 28 | - re2 (https://github.com/google/re2) 29 | - protobuf (http://protobuf.googlecode.com/files/protobuf-2.5.0.tar.gz --- 30 | see e.g. http://jugnu-life.blogspot.com/2013/09/install-protobuf-25-on-ubuntu.html) 31 | 32 | INSTALLATION: 33 | Follow the generic GNU build system instructions in ./INSTALL. We 34 | recommend configuring with --enable-static=no for faster 35 | compiles. 36 | 37 | NOTE: In some versions of Mac OS-X we have noticed a problem with configure 38 | whereby it fails to find fst.h. If this occurs, try configuring as follows: 39 | 40 | CPPFLAGS=-I/usr/local/include LDFLAGS=-L/usr/local/lib ./configure 41 | 42 | USAGE: 43 | Assuming you've installed under the default /usr/local, the library will be 44 | in /usr/local/lib, and the headers in /usr/local/include/sparrowhawk. 45 | 46 | To use in your own program, include and compile 47 | with '-I /usr/local/include'. The compiler must support C++11 (for g++ add the 48 | flag "-std=c++11"). Link against /usr/local/lib/libsparrowhawk.so and 49 | -ldl. Set your LD_LIBRARY_PATH (or equivalent) to contain /usr/local/lib. The 50 | linking is, by default, dynamic so that the Fst and Arc type DSO extensions 51 | can be used correctly if desired. 52 | 53 | DOCUMENTATION: 54 | See ./NEWS for updates since the last release. 55 | -------------------------------------------------------------------------------- /ar-lib: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Wrapper for Microsoft lib.exe 3 | 4 | me=ar-lib 5 | scriptversion=2012-03-01.08; # UTC 6 | 7 | # Copyright (C) 2010-2013 Free Software Foundation, Inc. 8 | # Written by Peter Rosin . 
9 | # 10 | # This program is free software; you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation; either version 2, or (at your option) 13 | # any later version. 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with this program. If not, see . 22 | 23 | # As a special exception to the GNU General Public License, if you 24 | # distribute this file as part of a program that contains a 25 | # configuration script generated by Autoconf, you may include it under 26 | # the same distribution terms that you use for the rest of that program. 27 | 28 | # This file is maintained in Automake, please report 29 | # bugs to or send patches to 30 | # . 31 | 32 | 33 | # func_error message 34 | func_error () 35 | { 36 | echo "$me: $1" 1>&2 37 | exit 1 38 | } 39 | 40 | file_conv= 41 | 42 | # func_file_conv build_file 43 | # Convert a $build file to $host form and store it in $file 44 | # Currently only supports Windows hosts. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv in 65 | mingw) 66 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 67 | ;; 68 | cygwin) 69 | file=`cygpath -m "$file" || echo "$file"` 70 | ;; 71 | wine) 72 | file=`winepath -w "$file" || echo "$file"` 73 | ;; 74 | esac 75 | ;; 76 | esac 77 | } 78 | 79 | # func_at_file at_file operation archive 80 | # Iterate over all members in AT_FILE performing OPERATION on ARCHIVE 81 | # for each of them. 82 | # When interpreting the content of the @FILE, do NOT use func_file_conv, 83 | # since the user would need to supply preconverted file names to 84 | # binutils ar, at least for MinGW. 85 | func_at_file () 86 | { 87 | operation=$2 88 | archive=$3 89 | at_file_contents=`cat "$1"` 90 | eval set x "$at_file_contents" 91 | shift 92 | 93 | for member 94 | do 95 | $AR -NOLOGO $operation:"$member" "$archive" || exit $? 96 | done 97 | } 98 | 99 | case $1 in 100 | '') 101 | func_error "no command. Try '$0 --help' for more information." 102 | ;; 103 | -h | --h*) 104 | cat <. 8 | # 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | # 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 
18 | # 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | # This file is maintained in Automake, please report 28 | # bugs to or send patches to 29 | # . 30 | 31 | nl=' 32 | ' 33 | 34 | # We need space, tab and new line, in precisely that order. Quoting is 35 | # there to prevent tools from complaining about whitespace usage. 36 | IFS=" "" $nl" 37 | 38 | file_conv= 39 | 40 | # func_file_conv build_file lazy 41 | # Convert a $build file to $host form and store it in $file 42 | # Currently only supports Windows hosts. If the determined conversion 43 | # type is listed in (the comma separated) LAZY, no conversion will 44 | # take place. 
45 | func_file_conv () 46 | { 47 | file=$1 48 | case $file in 49 | / | /[!/]*) # absolute file, and not a UNC file 50 | if test -z "$file_conv"; then 51 | # lazily determine how to convert abs files 52 | case `uname -s` in 53 | MINGW*) 54 | file_conv=mingw 55 | ;; 56 | CYGWIN*) 57 | file_conv=cygwin 58 | ;; 59 | *) 60 | file_conv=wine 61 | ;; 62 | esac 63 | fi 64 | case $file_conv/,$2, in 65 | *,$file_conv,*) 66 | ;; 67 | mingw/*) 68 | file=`cmd //C echo "$file " | sed -e 's/"\(.*\) " *$/\1/'` 69 | ;; 70 | cygwin/*) 71 | file=`cygpath -m "$file" || echo "$file"` 72 | ;; 73 | wine/*) 74 | file=`winepath -w "$file" || echo "$file"` 75 | ;; 76 | esac 77 | ;; 78 | esac 79 | } 80 | 81 | # func_cl_dashL linkdir 82 | # Make cl look for libraries in LINKDIR 83 | func_cl_dashL () 84 | { 85 | func_file_conv "$1" 86 | if test -z "$lib_path"; then 87 | lib_path=$file 88 | else 89 | lib_path="$lib_path;$file" 90 | fi 91 | linker_opts="$linker_opts -LIBPATH:$file" 92 | } 93 | 94 | # func_cl_dashl library 95 | # Do a library search-path lookup for cl 96 | func_cl_dashl () 97 | { 98 | lib=$1 99 | found=no 100 | save_IFS=$IFS 101 | IFS=';' 102 | for dir in $lib_path $LIB 103 | do 104 | IFS=$save_IFS 105 | if $shared && test -f "$dir/$lib.dll.lib"; then 106 | found=yes 107 | lib=$dir/$lib.dll.lib 108 | break 109 | fi 110 | if test -f "$dir/$lib.lib"; then 111 | found=yes 112 | lib=$dir/$lib.lib 113 | break 114 | fi 115 | if test -f "$dir/lib$lib.a"; then 116 | found=yes 117 | lib=$dir/lib$lib.a 118 | break 119 | fi 120 | done 121 | IFS=$save_IFS 122 | 123 | if test "$found" != yes; then 124 | lib=$lib.lib 125 | fi 126 | } 127 | 128 | # func_cl_wrapper cl arg... 
129 | # Adjust compile command to suit cl 130 | func_cl_wrapper () 131 | { 132 | # Assume a capable shell 133 | lib_path= 134 | shared=: 135 | linker_opts= 136 | for arg 137 | do 138 | if test -n "$eat"; then 139 | eat= 140 | else 141 | case $1 in 142 | -o) 143 | # configure might choose to run compile as 'compile cc -o foo foo.c'. 144 | eat=1 145 | case $2 in 146 | *.o | *.[oO][bB][jJ]) 147 | func_file_conv "$2" 148 | set x "$@" -Fo"$file" 149 | shift 150 | ;; 151 | *) 152 | func_file_conv "$2" 153 | set x "$@" -Fe"$file" 154 | shift 155 | ;; 156 | esac 157 | ;; 158 | -I) 159 | eat=1 160 | func_file_conv "$2" mingw 161 | set x "$@" -I"$file" 162 | shift 163 | ;; 164 | -I*) 165 | func_file_conv "${1#-I}" mingw 166 | set x "$@" -I"$file" 167 | shift 168 | ;; 169 | -l) 170 | eat=1 171 | func_cl_dashl "$2" 172 | set x "$@" "$lib" 173 | shift 174 | ;; 175 | -l*) 176 | func_cl_dashl "${1#-l}" 177 | set x "$@" "$lib" 178 | shift 179 | ;; 180 | -L) 181 | eat=1 182 | func_cl_dashL "$2" 183 | ;; 184 | -L*) 185 | func_cl_dashL "${1#-L}" 186 | ;; 187 | -static) 188 | shared=false 189 | ;; 190 | -Wl,*) 191 | arg=${1#-Wl,} 192 | save_ifs="$IFS"; IFS=',' 193 | for flag in $arg; do 194 | IFS="$save_ifs" 195 | linker_opts="$linker_opts $flag" 196 | done 197 | IFS="$save_ifs" 198 | ;; 199 | -Xlinker) 200 | eat=1 201 | linker_opts="$linker_opts $2" 202 | ;; 203 | -*) 204 | set x "$@" "$1" 205 | shift 206 | ;; 207 | *.cc | *.CC | *.cxx | *.CXX | *.[cC]++) 208 | func_file_conv "$1" 209 | set x "$@" -Tp"$file" 210 | shift 211 | ;; 212 | *.c | *.cpp | *.CPP | *.lib | *.LIB | *.Lib | *.OBJ | *.obj | *.[oO]) 213 | func_file_conv "$1" mingw 214 | set x "$@" "$file" 215 | shift 216 | ;; 217 | *) 218 | set x "$@" "$1" 219 | shift 220 | ;; 221 | esac 222 | fi 223 | shift 224 | done 225 | if test -n "$linker_opts"; then 226 | linker_opts="-link$linker_opts" 227 | fi 228 | exec "$@" $linker_opts 229 | exit 1 230 | } 231 | 232 | eat= 233 | 234 | case $1 in 235 | '') 236 | echo "$0: No command. 
Try '$0 --help' for more information." 1>&2 237 | exit 1; 238 | ;; 239 | -h | --h*) 240 | cat <<\EOF 241 | Usage: compile [--help] [--version] PROGRAM [ARGS] 242 | 243 | Wrapper for compilers which do not understand '-c -o'. 244 | Remove '-o dest.o' from ARGS, run PROGRAM with the remaining 245 | arguments, and rename the output as expected. 246 | 247 | If you are trying to build a whole package this is not the 248 | right script to run: please start by reading the file 'INSTALL'. 249 | 250 | Report bugs to . 251 | EOF 252 | exit $? 253 | ;; 254 | -v | --v*) 255 | echo "compile $scriptversion" 256 | exit $? 257 | ;; 258 | cl | *[/\\]cl | cl.exe | *[/\\]cl.exe ) 259 | func_cl_wrapper "$@" # Doesn't return... 260 | ;; 261 | esac 262 | 263 | ofile= 264 | cfile= 265 | 266 | for arg 267 | do 268 | if test -n "$eat"; then 269 | eat= 270 | else 271 | case $1 in 272 | -o) 273 | # configure might choose to run compile as 'compile cc -o foo foo.c'. 274 | # So we strip '-o arg' only if arg is an object. 275 | eat=1 276 | case $2 in 277 | *.o | *.obj) 278 | ofile=$2 279 | ;; 280 | *) 281 | set x "$@" -o "$2" 282 | shift 283 | ;; 284 | esac 285 | ;; 286 | *.c) 287 | cfile=$1 288 | set x "$@" "$1" 289 | shift 290 | ;; 291 | *) 292 | set x "$@" "$1" 293 | shift 294 | ;; 295 | esac 296 | fi 297 | shift 298 | done 299 | 300 | if test -z "$ofile" || test -z "$cfile"; then 301 | # If no '-o' option was seen then we might have been invoked from a 302 | # pattern rule where we don't need one. That is ok -- this is a 303 | # normal compilation that the losing compiler can handle. If no 304 | # '.c' file was seen then we are probably linking. That is also 305 | # ok. 306 | exec "$@" 307 | fi 308 | 309 | # Name of file we expect compiler to create. 310 | cofile=`echo "$cfile" | sed 's|^.*[\\/]||; s|^[a-zA-Z]:||; s/\.c$/.o/'` 311 | 312 | # Create the lock directory. 313 | # Note: use '[/\\:.-]' here to ensure that we don't use the same name 314 | # that we are using for the .o file. 
Also, base the name on the expected 315 | # object file name, since that is what matters with a parallel build. 316 | lockdir=`echo "$cofile" | sed -e 's|[/\\:.-]|_|g'`.d 317 | while true; do 318 | if mkdir "$lockdir" >/dev/null 2>&1; then 319 | break 320 | fi 321 | sleep 1 322 | done 323 | # FIXME: race condition here if user kills between mkdir and trap. 324 | trap "rmdir '$lockdir'; exit 1" 1 2 15 325 | 326 | # Run the compile. 327 | "$@" 328 | ret=$? 329 | 330 | if test -f "$cofile"; then 331 | test "$cofile" = "$ofile" || mv "$cofile" "$ofile" 332 | elif test -f "${cofile}bj"; then 333 | test "${cofile}bj" = "$ofile" || mv "${cofile}bj" "$ofile" 334 | fi 335 | 336 | rmdir "$lockdir" 337 | exit $ret 338 | 339 | # Local Variables: 340 | # mode: shell-script 341 | # sh-indentation: 2 342 | # eval: (add-hook 'write-file-hooks 'time-stamp) 343 | # time-stamp-start: "scriptversion=" 344 | # time-stamp-format: "%:y-%02m-%02d.%02H" 345 | # time-stamp-time-zone: "UTC" 346 | # time-stamp-end: "; # UTC" 347 | # End: 348 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | AC_INIT([Sparrowhawk], [1.0.0], [rws@google.com]) 2 | AM_INIT_AUTOMAKE([foreign nostdinc -Wall -Werror]) 3 | 4 | AM_PROG_AR 5 | 6 | CPPFLAGS="$CPPFLAGS -funsigned-char" 7 | CXXFLAGS="$CXXFLAGS -std=c++11" 8 | 9 | AC_PROG_CXX 10 | AC_DISABLE_STATIC 11 | AC_PROG_LIBTOOL 12 | 13 | AC_CONFIG_SRCDIR([src/lib/normalizer.cc]) 14 | AC_CONFIG_FILES([ 15 | Makefile 16 | src/Makefile 17 | src/include/Makefile 18 | src/lib/Makefile 19 | src/proto/Makefile 20 | src/bin/Makefile 21 | ]) 22 | 23 | AC_CONFIG_MACRO_DIR([m4]) 24 | AC_LANG([C++]) 25 | 26 | AC_CHECK_HEADER([fst/fst.h], [], 27 | [AC_MSG_ERROR([fst/fst.h header not found])] 28 | ) 29 | 30 | AC_CHECK_HEADER([fst/extensions/far/far.h], [], 31 | [AC_MSG_ERROR([fst/extensions/far/far.h header not found])] 32 | ) 33 | 34 | 
AC_CHECK_HEADER([fst/extensions/pdt/pdt.h], [], 35 | [AC_MSG_ERROR([fst/extensions/pdt/pdt.h header not found])] 36 | ) 37 | 38 | dnl TODO(rws): add mpdt at some point 39 | dnl AC_CHECK_HEADER([fst/extensions/mpdt/mpdt.h], [], 40 | dnl [AC_MSG_ERROR([fst/extensions/mpdt/mpdt.h header not found])] 41 | dnl ) 42 | 43 | AC_CHECK_HEADER([thrax/thrax.h], [], 44 | [AC_MSG_ERROR([thrax/thrax.h header not found])] 45 | ) 46 | 47 | AC_CHECK_HEADER([re2/re2.h], [], 48 | [AC_MSG_ERROR([re2/re2.h header not found])] 49 | ) 50 | 51 | AC_CHECK_HEADER([google/protobuf/message.h], [], 52 | [AC_MSG_ERROR([google/protobuf/message.h header not found])] 53 | ) 54 | 55 | AC_ARG_ENABLE( 56 | [bin], 57 | [AS_HELP_STRING( 58 | [--enable-bin], 59 | [enable command-line binaries])], 60 | [], 61 | [enable_bin=yes]) 62 | AM_CONDITIONAL([HAVE_BIN], [test "x$enable_bin" != xno]) 63 | 64 | AC_CHECK_PROG([PROTOC], [protoc], [protoc]) 65 | AS_IF([test "x${PROTOC}" == "x"], 66 | [AC_MSG_ERROR([ProtoBuf compiler "protoc" not found.])]) 67 | 68 | AC_OUTPUT 69 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/README: -------------------------------------------------------------------------------- 1 | This is a toy classifier for English whose sole purpose is to accompany the 2 | documentation on GitHub, illustrating how to write grammars for Sparrowhawk. 3 | 4 | It is assumed you have OpenGrm Thrax version 1.2.1 or higher installed. If you 5 | do not then the first thing to do is go to: 6 | 7 | http://openfst.cs.nyu.edu/twiki/bin/view/GRM/Thrax 8 | 9 | and follow the instructions there. 
10 | 11 | Paul Dixon's Windows port can be found at: 12 | 13 | http://openfst.cs.nyu.edu/twiki/bin/view/Contrib/OpenGrmThraxWin 14 | 15 | Once Thrax is installed you should go into the classify directory and do: 16 | 17 | thraxmakedep tokenize_and_classify.grm 18 | make 19 | 20 | Similarly in the verbalize directory: 21 | 22 | thraxmakedep verbalize.grm 23 | make 24 | 25 | And finally in the verbalize_serialization directory, do the same as above. 26 | 27 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/byte.far: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/byte.far -------------------------------------------------------------------------------- /documentation/grammars/en_toy/byte.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2005-2011 Google, Inc. 14 | 15 | # Standard constants for ASCII (byte) based strings. This mirrors the 16 | # functions provided by C/C++'s ctype.h library. 17 | 18 | # Note that [0] is missing. Matching the string-termination character is kinda weird. 
19 | export kBytes = Optimize[ 20 | "[1]" | "[2]" | "[3]" | "[4]" | "[5]" | "[6]" | "[7]" | "[8]" | "[9]" | "[10]" | 21 | "[11]" | "[12]" | "[13]" | "[14]" | "[15]" | "[16]" | "[17]" | "[18]" | "[19]" | "[20]" | 22 | "[21]" | "[22]" | "[23]" | "[24]" | "[25]" | "[26]" | "[27]" | "[28]" | "[29]" | "[30]" | 23 | "[31]" | "[32]" | "[33]" | "[34]" | "[35]" | "[36]" | "[37]" | "[38]" | "[39]" | "[40]" | 24 | "[41]" | "[42]" | "[43]" | "[44]" | "[45]" | "[46]" | "[47]" | "[48]" | "[49]" | "[50]" | 25 | "[51]" | "[52]" | "[53]" | "[54]" | "[55]" | "[56]" | "[57]" | "[58]" | "[59]" | "[60]" | 26 | "[61]" | "[62]" | "[63]" | "[64]" | "[65]" | "[66]" | "[67]" | "[68]" | "[69]" | "[70]" | 27 | "[71]" | "[72]" | "[73]" | "[74]" | "[75]" | "[76]" | "[77]" | "[78]" | "[79]" | "[80]" | 28 | "[81]" | "[82]" | "[83]" | "[84]" | "[85]" | "[86]" | "[87]" | "[88]" | "[89]" | "[90]" | 29 | "[91]" | "[92]" | "[93]" | "[94]" | "[95]" | "[96]" | "[97]" | "[98]" | "[99]" | "[100]" | 30 | "[101]" | "[102]" | "[103]" | "[104]" | "[105]" | "[106]" | "[107]" | "[108]" | "[109]" | "[110]" | 31 | "[111]" | "[112]" | "[113]" | "[114]" | "[115]" | "[116]" | "[117]" | "[118]" | "[119]" | "[120]" | 32 | "[121]" | "[122]" | "[123]" | "[124]" | "[125]" | "[126]" | "[127]" | "[128]" | "[129]" | "[130]" | 33 | "[131]" | "[132]" | "[133]" | "[134]" | "[135]" | "[136]" | "[137]" | "[138]" | "[139]" | "[140]" | 34 | "[141]" | "[142]" | "[143]" | "[144]" | "[145]" | "[146]" | "[147]" | "[148]" | "[149]" | "[150]" | 35 | "[151]" | "[152]" | "[153]" | "[154]" | "[155]" | "[156]" | "[157]" | "[158]" | "[159]" | "[160]" | 36 | "[161]" | "[162]" | "[163]" | "[164]" | "[165]" | "[166]" | "[167]" | "[168]" | "[169]" | "[170]" | 37 | "[171]" | "[172]" | "[173]" | "[174]" | "[175]" | "[176]" | "[177]" | "[178]" | "[179]" | "[180]" | 38 | "[181]" | "[182]" | "[183]" | "[184]" | "[185]" | "[186]" | "[187]" | "[188]" | "[189]" | "[190]" | 39 | "[191]" | "[192]" | "[193]" | "[194]" | "[195]" | "[196]" | "[197]" | 
"[198]" | "[199]" | "[200]" | 40 | "[201]" | "[202]" | "[203]" | "[204]" | "[205]" | "[206]" | "[207]" | "[208]" | "[209]" | "[210]" | 41 | "[211]" | "[212]" | "[213]" | "[214]" | "[215]" | "[216]" | "[217]" | "[218]" | "[219]" | "[220]" | 42 | "[221]" | "[222]" | "[223]" | "[224]" | "[225]" | "[226]" | "[227]" | "[228]" | "[229]" | "[230]" | 43 | "[231]" | "[232]" | "[233]" | "[234]" | "[235]" | "[236]" | "[237]" | "[238]" | "[239]" | "[240]" | 44 | "[241]" | "[242]" | "[243]" | "[244]" | "[245]" | "[246]" | "[247]" | "[248]" | "[249]" | "[250]" | 45 | "[251]" | "[252]" | "[253]" | "[254]" | "[255]" 46 | ]; 47 | 48 | export kDigit = Optimize[ 49 | "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" 50 | ]; 51 | 52 | export kLower = Optimize[ 53 | "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | 54 | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" 55 | ]; 56 | export kUpper = Optimize[ 57 | "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | 58 | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" 59 | ]; 60 | export kAlpha = Optimize[kLower | kUpper]; 61 | 62 | export kAlnum = Optimize[kDigit | kAlpha]; 63 | 64 | export kSpace = Optimize[ 65 | " " | "\t" | "\n" | "\r" 66 | ]; 67 | export kNotSpace = Optimize[kBytes - kSpace]; 68 | 69 | export kPunct = Optimize[ 70 | "!" | "\"" | "#" | "$" | "%" | "&" | "'" | "(" | ")" | "*" | "+" | "," | 71 | "-" | "." | "/" | ":" | ";" | "<" | "=" | ">" | "?" 
| "@" | "\[" | "\\" | 72 | "\]" | "^" | "_" | "`" | "{" | "|" | "}" | "~" 73 | ]; 74 | 75 | export kGraph = Optimize[kAlnum | kPunct]; 76 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/cardinal.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | 7 | # 300 -> cardinal { integer: "300"} 8 | 9 | cardinal = 10 | u.I["cardinal { "] 11 | u.I["integer: " q] 12 | d+ 13 | u.I[q] 14 | u.I[" }"] 15 | ; 16 | 17 | export CARDINAL = Optimize[cardinal]; 18 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/date.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | # The weight is to override the analysis of "Jan." etc. as a separate word (see 7 | # word.grm). 8 | month_file = StringFile['months.tsv'] <-20>; 9 | 10 | # Allows both abbreviations and full names: 11 | month = month_file | Project[month_file, 'output']; 12 | 13 | # Any number from 1-31: 14 | 15 | day = (d - "0") | "1" d | "2" d | "30" | "31"; 16 | 17 | # Any four digit number beginning with 1 or 2 18 | 19 | year = ("1" | "2") d{3}; 20 | 21 | # Maps input of the form 22 | # 23 | # Jan. 3,? 1980 24 | # 25 | # or 26 | # 27 | # 3 Jan.,? 1980 28 | # 29 | # into 30 | # 31 | # date { month: "January" day: "3" year: "1980" } 32 | # 33 | # Etc. 34 | 35 | mdy = 36 | u.I["date { "] 37 | u.I["month: " q] 38 | month 39 | u.D[" "+] 40 | u.I[q " day: " q] 41 | day 42 | u.D[","]? 43 | u.D[" "+] 44 | u.I[q " year: " q] 45 | year 46 | u.I[q] 47 | u.I[" preserve_order: true"] 48 | u.I[" }"]; 49 | 50 | dmy = 51 | u.I["date { "] 52 | u.I["day: " q] 53 | day 54 | u.D[" "+] 55 | u.I[q " month: " q] 56 | month 57 | u.D[","]? 
58 | u.D[" "+] 59 | u.I[q " year: " q] 60 | year 61 | u.I[q] 62 | u.I[" preserve_order: true"] 63 | u.I[" }"]; 64 | 65 | export DATE = Optimize[mdy | dmy]; 66 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/measure.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | measures = StringFile['measure.tsv']; 7 | 8 | # Maps input of the form 9 | # 10 | # 2.5kg 11 | # 12 | # into 13 | # 14 | # measure { decimal { integer_part: "2" fractional_part: "5" } units: "kilogram" } 15 | # 16 | # The fractional part is optional. 17 | 18 | measure = 19 | u.I["measure { "] 20 | u.I[" decimal { "] 21 | u.I["integer_part: " q] 22 | d+ 23 | u.I[q] 24 | (u.D["."] 25 | u.I[" fractional_part: " q] 26 | d+ 27 | u.I[q])? 28 | u.I["}"] 29 | u.I[" units: " q] 30 | u.D[" "*] # We allow spaces between the number and the measure. 31 | measures 32 | u.I[q] 33 | u.I["}"] 34 | ; 35 | 36 | export MEASURE = Optimize[measure]; 37 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/measure.tsv: -------------------------------------------------------------------------------- 1 | kg kilogram 2 | cm centimeter 3 | ° degree 4 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/money.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | currencies = StringFile['money.tsv']; 7 | 8 | # Maps input of the form 9 | # 10 | # $2.50 11 | # 12 | # into 13 | # 14 | # money { currency: "usd" amount { integer_part: "2" fractional_part: "50"} } } 15 | # 16 | # The fractional part is optional. 
17 | 18 | money = 19 | u.I["money { "] 20 | u.I["currency: " q] 21 | currencies 22 | u.I[q] 23 | u.I[" amount { "] 24 | u.I["integer_part: " q] 25 | d+ 26 | u.I[q] 27 | (u.D["."] 28 | u.I[" fractional_part: " q] 29 | d{2} 30 | u.I[q])? 31 | u.I["} }"] 32 | ; 33 | 34 | export MONEY = Optimize[money]; 35 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/money.tsv: -------------------------------------------------------------------------------- 1 | $ usd 2 | £ gbp 3 | € eur 4 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/months.tsv: -------------------------------------------------------------------------------- 1 | Jan. January 2 | Feb. February 3 | Mar. March 4 | Apr. April 5 | May 6 | Jun. June 7 | Jul. July 8 | Aug. August 9 | Sep. September 10 | Oct. October 11 | Nov. November 12 | Dec. December 13 | Jan January 14 | Feb February 15 | Mar March 16 | Apr April 17 | Jun June 18 | Jul July 19 | Aug August 20 | Sep September 21 | Oct October 22 | Nov November 23 | Dec December 24 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/punctuation.grm: -------------------------------------------------------------------------------- 1 | import '../util.grm' as u; 2 | 3 | # The trick of inserting this material and then replacing the "[PUNCT]" with the 4 | # relevant punctuation symbols works because of the way that Replace replaces 5 | # the arcs in the replacement fst. 6 | 7 | # phrase_break: true sets it so that a silence (sil) will be inserted. 
8 | 9 | medium = 10 | u.I["tokens { name: \"[PUNCT]\" pause_length: " 11 | "PAUSE_MEDIUM phrase_break: true type: PUNCT }"] 12 | ; 13 | 14 | long = 15 | u.I["tokens { name: \"[PUNCT]\" pause_length: " 16 | "PAUSE_LONG phrase_break: true type: PUNCT }"] 17 | ; 18 | 19 | medium_punct = "," | ";" | "(" | ")"; 20 | long_punct = "." | "!" | "?" | ":"; 21 | 22 | export PUNCT = Optimize[ 23 | Replace["[ROOT][PUNCT]", medium, medium_punct] 24 | | Replace["[ROOT][PUNCT]", long, long_punct]] 25 | ; 26 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/time.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | d = b.kDigit; 5 | q = u.q; 6 | 7 | # Maps input of the form 8 | # 9 | # 3:30 10 | # 11 | # into 12 | # 13 | # time { hour: 3 minute: 30 } 14 | # 15 | # Hours and minutes fields are defined as ints, so here we don't use quotes. 16 | 17 | 18 | hour = 19 | "0"? 
d 20 | | "1" d 21 | | "2" ("0" | "1" | "2" | "3") 22 | ; 23 | 24 | minute = ("0" | "1" | "2" | "3" | "4" | "5") d; 25 | 26 | time = 27 | u.I["time { "] 28 | u.I["hours: "] 29 | hour 30 | u.D[":"] 31 | u.I[" minutes: "] 32 | minute 33 | u.I["}"] 34 | ; 35 | 36 | export TIME = Optimize[time]; 37 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/tokenize_and_classify.grm: -------------------------------------------------------------------------------- 1 | import 'cardinal.grm' as c; 2 | import 'date.grm' as d; 3 | import 'measure.grm' as M; 4 | import 'money.grm' as m; 5 | import 'punctuation.grm' as p; 6 | import 'time.grm' as t; 7 | import '../util.grm' as u; 8 | import 'word.grm' as w; 9 | 10 | types = c.CARDINAL | d.DATE | M.MEASURE | m.MONEY | w.WORD | t.TIME; 11 | 12 | token = u.I["tokens { "] types u.I[" }"]; 13 | 14 | token_plus_punct = (p.PUNCT u.I[" "])* token (u.I[" "] p.PUNCT)*; 15 | 16 | # Collection of all possible semiotic classes, including ordinary words. 17 | 18 | export TOKENIZE_AND_CLASSIFY = 19 | Optimize[token_plus_punct (" " token_plus_punct)*] 20 | ; 21 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/classify/word.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # Make sure we don't treat periods at the end of abbreviations in 5 | # sentence_boundary_exceptions.txt as punctuation: 6 | exceptions = StringFile['../../sentence_boundary_exceptions.txt'] <-10>; 7 | 8 | # Markup for ordinary tokens that don't match anything else. 9 | # output is a "name:" token. Cost is to make this analysis more expensive. 
10 | 11 | word = u.I["name: " u.q] ((b.kNotSpace <1>)+ | exceptions) u.I[u.q]; 12 | 13 | export WORD = Optimize[word]; 14 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/util.far: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/util.far -------------------------------------------------------------------------------- /documentation/grammars/en_toy/util.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | 3 | func I[expr] { 4 | return "" : expr; 5 | } 6 | 7 | func D[expr] { 8 | return expr : ""; 9 | } 10 | 11 | export q = "\""; 12 | 13 | # Allows for arbitrary numbers of spaces on the verbalization side between 14 | # elements of the semiotic class. 15 | export s = " "*; 16 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/CARDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize/CARDINAL_NUMBER_NAME -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/ORDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize/ORDINAL_NUMBER_NAME -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/date.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import 
'../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # quotation mark 6 | q = u.q; 7 | 8 | # Used to allow for different numbers of spaces coming out of the serializer. 9 | s = u.s; 10 | 11 | month = b.kAlpha+; 12 | 13 | day = n.ORDINAL; 14 | 15 | d = b.kDigit; 16 | D = b.kDigit - "0"; 17 | 18 | two_digit = 19 | ((D d) @ n.CARDINAL) 20 | | ("0" : "oh ") (D @ n.CARDINAL) 21 | | ("00" : "hundred") 22 | ; 23 | 24 | # Years are not read as cardinals, generally: 25 | year = 26 | (("19" @ n.CARDINAL) u.I[" "] two_digit) 27 | | (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit)) 28 | | (("200" d) @ n.CARDINAL) 29 | ; 30 | 31 | # Remove these if they occur 32 | 33 | field = (b.kAlpha | "_")+; 34 | preserve_order = "preserve_order:" s "true" s; 35 | field_order = "field_order:" s q field q s; 36 | field_order_specs = (preserve_order | field_order)*; 37 | 38 | # Verbalization for MDY 39 | mdy = 40 | u.D["date" s "{" s] 41 | u.D["month:" s q] 42 | month 43 | u.I[" the "] 44 | u.D[q s] 45 | u.D["day:" s q] 46 | day 47 | u.I[" "] 48 | u.D[q s "year:" s q] 49 | year 50 | u.D[q s] 51 | u.D[field_order_specs]? 52 | u.D["}"] 53 | ; 54 | 55 | # Verbalization for DMY 56 | dmy = 57 | u.D["date" s "{" s] 58 | u.I["the "] 59 | u.D["day:" s q] 60 | day 61 | u.D[q s] 62 | u.I[" of "] 63 | u.D["month:" s q] 64 | month 65 | u.I[" "] 66 | u.D[q s "year:" s q] 67 | year 68 | u.D[q s] 69 | u.D[field_order_specs]? 70 | u.D["}"] 71 | ; 72 | 73 | export DATE = Optimize[mdy | dmy]; 74 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/measure.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # Except with exactly 1, the plural form is used, so we map to that form, and 6 | # then singularize below. 
7 | measures = 8 | ("centimeter" : "centimeters") 9 | | ("kilogram" : "kilograms") 10 | | ("degree" : "degrees") 11 | ; 12 | 13 | # quotation mark 14 | q = u.q; 15 | 16 | # Used to allow for different numbers of spaces coming out of the serializer. 17 | s = u.s; 18 | 19 | # Removes the markup (allowing for various spacing possibilities in the 20 | # serialization) and verbalizes the remainder. 21 | measure = 22 | u.D["measure" s "{" s] 23 | u.D[s "decimal" s "{" s] 24 | u.D["integer_part:" s q] 25 | n.CARDINAL 26 | u.D[q] 27 | (u.D[s "fractional_part:" s q] 28 | u.I[" point "] 29 | n.DIGITS 30 | u.D[q])? 31 | u.D[s "}" s] 32 | u.D[s "units:" s q] 33 | u.I[" "] 34 | measures 35 | u.D[q] 36 | u.D[s "}" s] 37 | ; 38 | 39 | sigstar = b.kBytes*; 40 | 41 | # Uses the singular form after exactly "one". 42 | singularize = CDRewrite[Invert[measures], "[BOS]one ", "", sigstar]; 43 | 44 | export MEASURE = Optimize[measure @ singularize]; 45 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/money.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | currencies = StringFile['money.tsv']; 13 | 14 | # Simple currency amounts such as: 15 | # money { amount { integer_part: "3"} currency: "usd" } 16 | 17 | sigstar = b.kBytes*; 18 | 19 | # Rules to insert "_maj" and "_min" at the end of the currency terms. 20 | ins_maj = CDRewrite[u.I["_maj"], "", "[EOS]", sigstar]; 21 | ins_min = CDRewrite[u.I["_min"], "", "[EOS]", sigstar]; 22 | 23 | # Removes the markup (allowing for various spacing possibilities in the 24 | # serialization) and verbalizes the remainder. 
25 | money_whole = 26 | u.D["money" s "{" s ""] 27 | u.D["amount" s "{" s ""] 28 | u.D["integer_part:" s q] 29 | n.CARDINAL 30 | u.D[q] 31 | u.D[s "}" s] 32 | u.D["currency: " q] 33 | u.I[" "] 34 | (ins_maj @ currencies) 35 | u.D[q] 36 | u.D[s "}"] 37 | ; 38 | 39 | del_zero = CDRewrite[u.D["0"], "[BOS]", "", sigstar]; 40 | 41 | # money { amount { integer_part: "3" fractional_part: "50"} currency: "usd" } 42 | # Here wa assume that the input has been reduplicated (see REDUP), and then on 43 | # the lefthand side we delete the minor currency and on the righthand side the 44 | # major currency. The reduplication is done IN CODE (see 45 | # RuleSystem::ApplyRules() in rule_system.cc). 46 | # 47 | # Removes the markup (allowing for various spacing possibilities in the 48 | # serialization) and verbalizes the remainder. 49 | 50 | money_all = 51 | u.D["money" s "{" s] 52 | u.D["amount" s "{" s] 53 | u.D["integer_part:" s q] 54 | n.CARDINAL 55 | u.D[q] 56 | u.D[" fractional_part:" s q] 57 | u.D[d+] 58 | u.D[q] 59 | u.D[s "}" s] 60 | u.D["currency:" s q] 61 | u.I[" "] 62 | (ins_maj @ currencies) 63 | u.D[q] 64 | u.D[s "}" s] 65 | u.I[" and "] 66 | u.D["money" s "{" s] 67 | u.D["amount" s "{" s] 68 | u.D["integer_part:" s q] 69 | u.D[d+] 70 | u.D[q] 71 | u.D[s "fractional_part:" s q] 72 | (del_zero @ n.CARDINAL) 73 | u.D[q] 74 | u.D[s "}" s] 75 | u.D["currency:" s q] 76 | u.I[" "] 77 | (ins_min @ currencies) 78 | u.D[q] 79 | u.D[s "}"] 80 | ; 81 | 82 | # Singularize after "one" (as in measures). 83 | 84 | singulars = 85 | ("dollars" : "dollar") 86 | | ("cents" : "cent") 87 | | ("pounds" : "pound") 88 | | ("pence" : "penny") 89 | | ("euros" : "euro") 90 | ; 91 | 92 | singularize = 93 | CDRewrite[singulars, "[BOS]one " | "and one ", "", sigstar] 94 | ; 95 | 96 | export MONEY = 97 | Optimize[ 98 | (money_whole | money_all) 99 | @ singularize] 100 | ; 101 | 102 | # keep one space 103 | ks = s : " "; 104 | 105 | # Pattern that matches the currency expression to be reduplicated. 
106 | redup = 107 | "money" ks "{" ks 108 | "amount" ks "{" ks 109 | "integer_part:" ks q 110 | d+ 111 | q 112 | ks "fractional_part:" ks q 113 | d+ 114 | q 115 | ks "}" ks 116 | "currency:" ks q 117 | # Match to ins_maj then project back to the 3-letter code. 118 | Project[ins_maj @ currencies, 'input'] 119 | q 120 | ks "}" 121 | ; 122 | 123 | export REDUP = redup; 124 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/money.tsv: -------------------------------------------------------------------------------- 1 | usd_maj dollars 2 | usd_min cents 3 | gbp_maj pounds 4 | gbp_min pence 5 | eur_maj euros 6 | eur_min cents 7 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/numbers.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # English cardinal and ordinal number names are FSTs that are trained using the 5 | # algorithm reported in: 6 | # 7 | # Kyle Gorman and Richard Sproat. "Minimally supervised models for number 8 | # normalization." Transactions of the Association for Computational Linguistics. 2016. 9 | cardinal = LoadFst['CARDINAL_NUMBER_NAME']; 10 | 11 | ordinal = LoadFst['ORDINAL_NUMBER_NAME']; 12 | 13 | d = b.kDigit; 14 | 15 | digit = d @ cardinal; 16 | 17 | export CARDINAL = cardinal; 18 | 19 | export ORDINAL = ordinal; 20 | 21 | export DIGITS = Optimize[digit (u.I[" "] digit)*]; 22 | 23 | q = u.q; 24 | 25 | # Used to allow for different numbers of spaces coming out of the serializer. 26 | s = u.s; 27 | 28 | # Removes the markup (allowing for various spacing possibilities in the 29 | # serialization) and verbalizes the remainder. 
30 | cardinal_markup = 31 | u.D["cardinal" s "{" s] 32 | u.D["integer:" s q] 33 | cardinal 34 | u.D[q] 35 | u.D[s "}"] 36 | ; 37 | 38 | export CARDINAL_MARKUP = Optimize[cardinal_markup]; 39 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/time.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | hour = (u.D["0"]? d | (d - "0") d) @ n.CARDINAL; 13 | 14 | sigstar = b.kBytes*; 15 | 16 | # Various renditions of minutes: 17 | # 18 | # 03 -> oh three 19 | # 13 -> thirteen 20 | # 00 -> o'clock 21 | # 22 | # Note that trailing 0 is removed so that 3:03 comes in as 23 | # 24 | # hours: 3 minutes: 3 25 | minute = 26 | ( (("" : "oh ") (d @ n.CARDINAL)) 27 | | (d d) @ n.CARDINAL) 28 | @ CDRewrite["oh zero" : "o'clock", "", "", sigstar]; 29 | 30 | # Removes the markup (allowing for various spacing possibilities in the 31 | # serialization) and verbalizes the remainder. 32 | time = 33 | u.D["time" s "{" s] 34 | u.D["hours:" s] 35 | hour 36 | u.D[s "minutes:" s] 37 | u.I[" "] 38 | minute 39 | u.D[s "}"] 40 | ; 41 | 42 | export TIME = Optimize[time]; 43 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/verbalize.grm: -------------------------------------------------------------------------------- 1 | import 'date.grm' as d; 2 | import 'measure.grm' as M; 3 | import 'money.grm' as m; 4 | import 'numbers.grm' as n; 5 | import 'time.grm' as t; 6 | import 'verbatim.grm' as v; 7 | 8 | # Combines all of the semiotic classes together. 
9 | 10 | export ALL = Optimize[ 11 | d.DATE | M.MEASURE | m.MONEY | n.CARDINAL_MARKUP | t.TIME | v.VERBATIM]; 12 | 13 | # Exports the REDUP from money. 14 | export REDUP = m.REDUP; 15 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize/verbatim.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # A verbatim grammar is needed as a backoff since if for some reason 5 | # verbalization fails, it backs off to reading the string as the literal 6 | # sequence of characters. 7 | 8 | q = u.q; 9 | # Used to allow for different numbers of spaces coming out of the serializer. 10 | s = u.s; 11 | 12 | char = b.kNotSpace u.I["_character"]; 13 | 14 | chars = char (u.I[" "] char)*; 15 | 16 | # Removes the markup (allowing for various spacing possibilities in the 17 | # serialization) and verbalizes the remainder. 18 | export VERBATIM = Optimize[u.D["verbatim:" s q] chars u.D[q]]; 19 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize_serialization/CARDINAL_NUMBER_NAME -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/sparrowhawk/a0503e26a433fbd3a9ff81ba7a08819e4a3bb668/documentation/grammars/en_toy/verbalize_serialization/ORDINAL_NUMBER_NAME -------------------------------------------------------------------------------- 
/documentation/grammars/en_toy/verbalize_serialization/date.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # quotation mark 6 | q = u.q; 7 | 8 | # Used to allow for different numbers of spaces coming out of the serializer. 9 | s = u.s; 10 | 11 | month = b.kAlpha+; 12 | 13 | day = n.ORDINAL; 14 | 15 | d = b.kDigit; 16 | D = b.kDigit - "0"; 17 | 18 | two_digit = 19 | ((D d) @ n.CARDINAL) 20 | | ("0" : "oh ") (D @ n.CARDINAL) 21 | | ("00" : "hundred") 22 | ; 23 | 24 | # Years are not read as cardinals, generally: 25 | year = 26 | (("19" @ n.CARDINAL) u.I[" "] two_digit) 27 | | (("20" @ n.CARDINAL) u.I[" "] ((D d) @ two_digit)) 28 | | (("200" d) @ n.CARDINAL) 29 | ; 30 | 31 | # Remove these if they occur 32 | 33 | field = (b.kAlpha | "_")+; 34 | preserve_order = "preserve_order:true"; 35 | field_order = "field_order:" field; 36 | field_order_specs = (preserve_order | field_order)*; 37 | 38 | # Verbalization for MDY 39 | mdy = 40 | u.D["date"] 41 | u.D["|month:"] 42 | month 43 | u.I[" the "] 44 | u.D["|day:"] 45 | day 46 | u.I[" "] 47 | u.D["|year:"] 48 | year 49 | u.D[field_order_specs]? 50 | u.D["|"] 51 | ; 52 | 53 | # Verbalization for DMY 54 | dmy = 55 | u.D["date"] 56 | u.I["the "] 57 | u.D["|day:"] 58 | day 59 | u.I[" of "] 60 | u.D["|month:"] 61 | month 62 | u.D["|year:"] 63 | u.I[" "] 64 | year 65 | u.D[field_order_specs]? 66 | u.D["|"] 67 | ; 68 | 69 | export DATE = Optimize[mdy | dmy]; 70 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/measure.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | # Except with exactly 1, the plural form is used, so we map to that form, and 6 | # then singularize below. 
7 | measures = 8 | ("centimeter" : "centimeters") 9 | | ("kilogram" : "kilograms") 10 | | ("degree" : "degrees") 11 | ; 12 | 13 | # quotation mark 14 | q = u.q; 15 | 16 | # Used to allow for different numbers of spaces coming out of the serializer. 17 | s = u.s; 18 | 19 | # Removes the markup (allowing for various spacing possibilities in the 20 | # serialization) and verbalizes the remainder. 21 | measure = 22 | u.D["measure"] 23 | u.D["|integer_part:"] 24 | n.CARDINAL 25 | (u.D["|fractional_part:"] 26 | u.I[" point "] 27 | n.DIGITS)? 28 | u.I[" "] 29 | u.D["|units:"] 30 | measures 31 | u.D["|"] 32 | ; 33 | 34 | sigstar = b.kBytes*; 35 | 36 | # Uses the singular form after exactly "one". 37 | singularize = CDRewrite[Invert[measures], "[BOS]one ", "", sigstar]; 38 | 39 | export MEASURE = Optimize[measure @ singularize]; 40 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/money.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | currencies = StringFile['money.tsv']; 13 | 14 | # Simple currency amounts such as: 15 | # money|integer_part:3|currency:usd| 16 | 17 | sigstar = b.kBytes*; 18 | 19 | # Rules to insert "_maj" and "_min" at the end of the currency terms. 20 | ins_maj = CDRewrite[u.I["_maj"], "", "[EOS]", sigstar]; 21 | ins_min = CDRewrite[u.I["_min"], "", "[EOS]", sigstar]; 22 | 23 | del_zero = CDRewrite[u.D["0"], "[BOS]", "", sigstar]; 24 | 25 | # money|integer_part:3|currency:usd|fractional_part:50|currency:usd| 26 | # Here wa assume that the input has been reduplicated (see REDUP), and then on 27 | # the lefthand side we delete the minor currency and on the righthand side the 28 | # major currency. 
The reduplication is done IN CODE (see 29 | # RuleSystem::ApplyRules() in rule_system.cc). 30 | # 31 | # Removes the markup (allowing for various spacing possibilities in the 32 | # serialization) and verbalizes the remainder. 33 | 34 | money = 35 | u.D["money"] 36 | u.D["|integer_part:"] 37 | n.CARDINAL 38 | u.D["|currency:"] 39 | u.I[" "] 40 | (ins_maj @ currencies) 41 | (u.I[" and "] 42 | u.D["|fractional_part:"] 43 | (del_zero @ n.CARDINAL) 44 | u.D["|currency:"] 45 | u.I[" "] 46 | (ins_min @ currencies))? 47 | u.D[s "|"] 48 | ; 49 | 50 | # Singularize after "one" (as in measures). 51 | 52 | singulars = 53 | ("dollars" : "dollar") 54 | | ("cents" : "cent") 55 | | ("pounds" : "pound") 56 | | ("pence" : "penny") 57 | | ("euros" : "euro") 58 | ; 59 | 60 | singularize = 61 | CDRewrite[singulars, "[BOS]one " | "and one ", "", sigstar] 62 | ; 63 | 64 | export MONEY = Optimize[money @ singularize]; 65 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/money.tsv: -------------------------------------------------------------------------------- 1 | usd_maj dollars 2 | usd_min cents 3 | gbp_maj pounds 4 | gbp_min pence 5 | eur_maj euros 6 | eur_min cents 7 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/numbers.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # English cardinal and ordinal number names are FSTs that are trained using the 5 | # algorithm reported in: 6 | # 7 | # Kyle Gorman and Richard Sproat. "Minimally supervised models for number 8 | # normalization." Transactions of the Association for Computational Linguistics. 2016. 
9 | cardinal = LoadFst['CARDINAL_NUMBER_NAME']; 10 | 11 | ordinal = LoadFst['ORDINAL_NUMBER_NAME']; 12 | 13 | d = b.kDigit; 14 | 15 | digit = d @ cardinal; 16 | 17 | export CARDINAL = cardinal; 18 | 19 | export ORDINAL = ordinal; 20 | 21 | export DIGITS = Optimize[digit (u.I[" "] digit)*]; 22 | 23 | q = u.q; 24 | 25 | # Used to allow for different numbers of spaces coming out of the serializer. 26 | s = u.s; 27 | 28 | # Removes the markup (allowing for various spacing possibilities in the 29 | # serialization) and verbalizes the remainder. 30 | cardinal_markup = 31 | u.D["cardinal|integer:"] 32 | cardinal 33 | u.D[s "|"] 34 | ; 35 | 36 | export CARDINAL_MARKUP = Optimize[cardinal_markup]; 37 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/time.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | import 'numbers.grm' as n; 4 | 5 | q = u.q; 6 | 7 | # Used to allow for different numbers of spaces coming out of the serializer. 8 | s = u.s; 9 | 10 | d = b.kDigit; 11 | 12 | hour = (u.D["0"]? d | (d - "0") d) @ n.CARDINAL; 13 | 14 | sigstar = b.kBytes*; 15 | 16 | # Various renditions of minutes: 17 | # 18 | # 03 -> oh three 19 | # 13 -> thirteen 20 | # 00 -> o'clock 21 | # 22 | # Note that trailing 0 is removed so that 3:03 comes in as 23 | # 24 | # hours: 3 minutes: 3 25 | minute = 26 | ( (("" : "oh ") (d @ n.CARDINAL)) 27 | | (d d) @ n.CARDINAL) 28 | @ CDRewrite["oh zero" : "o'clock", "", "", sigstar]; 29 | 30 | # Removes the markup (allowing for various spacing possibilities in the 31 | # serialization) and verbalizes the remainder. 
32 | time = 33 | u.D["time"] 34 | u.D["|hours:"] 35 | hour 36 | u.D["|minutes:"] 37 | u.I[" "] 38 | minute 39 | u.D["|"] 40 | ; 41 | 42 | export TIME = Optimize[time]; 43 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/verbalize.grm: -------------------------------------------------------------------------------- 1 | import 'date.grm' as d; 2 | import 'measure.grm' as M; 3 | import 'money.grm' as m; 4 | import 'numbers.grm' as n; 5 | import 'time.grm' as t; 6 | import 'verbatim.grm' as v; 7 | 8 | # Combines all of the semiotic classes together. 9 | 10 | export ALL = Optimize[ 11 | d.DATE | M.MEASURE | m.MONEY | n.CARDINAL_MARKUP | t.TIME | v.VERBATIM]; 12 | -------------------------------------------------------------------------------- /documentation/grammars/en_toy/verbalize_serialization/verbatim.grm: -------------------------------------------------------------------------------- 1 | import '../byte.grm' as b; 2 | import '../util.grm' as u; 3 | 4 | # A verbatim grammar is needed as a backoff since if for some reason 5 | # verbalization fails, it backs off to reading the string as the literal 6 | # sequence of characters. 7 | 8 | q = u.q; 9 | # Used to allow for different numbers of spaces coming out of the serializer. 10 | s = u.s; 11 | 12 | char = b.kNotSpace u.I["_character"]; 13 | 14 | chars = char (u.I[" "] char)*; 15 | 16 | # Removes the markup (allowing for various spacing possibilities in the 17 | # serialization) and verbalizes the remainder. 18 | export VERBATIM = Optimize[u.D["verbatim|verbatim:" q?] chars u.D[q? "|"]]; 19 | -------------------------------------------------------------------------------- /documentation/grammars/sentence_boundary_exceptions.txt: -------------------------------------------------------------------------------- 1 | Mr. 2 | Dr. 3 | Mrs. 4 | St. 5 | Jan. 6 | Feb. 7 | Mar. 8 | Apr. 9 | Jun. 10 | Jul. 11 | Aug. 12 | Sep. 13 | Oct. 
14 | Nov. 15 | Dec. 16 | -------------------------------------------------------------------------------- /documentation/grammars/sparrowhawk_configuration.ascii_proto: -------------------------------------------------------------------------------- 1 | tokenizer_grammar: "tokenizer.ascii_proto" 2 | 3 | verbalizer_grammar: "verbalizer.ascii_proto" 4 | 5 | sentence_boundary_regexp: "[\\.:!\\?] " 6 | 7 | sentence_boundary_exceptions_file: "sentence_boundary_exceptions.txt" 8 | -------------------------------------------------------------------------------- /documentation/grammars/sparrowhawk_configuration_serialization.ascii_proto: -------------------------------------------------------------------------------- 1 | tokenizer_grammar: "tokenizer.ascii_proto" 2 | 3 | verbalizer_grammar: "verbalizer_serialization.ascii_proto" 4 | 5 | sentence_boundary_regexp: "[\\.:!\\?] " 6 | 7 | sentence_boundary_exceptions_file: "sentence_boundary_exceptions.txt" 8 | 9 | serialization_spec: "verbalizer_serialization_spec.ascii_proto" 10 | -------------------------------------------------------------------------------- /documentation/grammars/test.txt: -------------------------------------------------------------------------------- 1 | The train left at 3:30 from Penn Station on Jan. 3, 2010. Mr. Snookums 2 | was on the train carrying $40.25 (£30.60) of Belgian chocolate in a 3kg box that 3 | was 20cm wide. 
4 | -------------------------------------------------------------------------------- /documentation/grammars/tokenizer.ascii_proto: -------------------------------------------------------------------------------- 1 | grammar_file: "en_toy/classify/tokenize_and_classify.far" 2 | 3 | grammar_name: "TokenizerClassifier" 4 | 5 | rules { main: "TOKENIZE_AND_CLASSIFY" } 6 | -------------------------------------------------------------------------------- /documentation/grammars/verbalizer.ascii_proto: -------------------------------------------------------------------------------- 1 | grammar_file: "en_toy/verbalize/verbalize.far" 2 | 3 | grammar_name: "Verbalizer" 4 | 5 | rules { main: "ALL" redup: "REDUP" } 6 | -------------------------------------------------------------------------------- /documentation/grammars/verbalizer_serialization.ascii_proto: -------------------------------------------------------------------------------- 1 | grammar_file: "en_toy/verbalize_serialization/verbalize.far" 2 | 3 | grammar_name: "Verbalizer" 4 | 5 | rules { main: "ALL" } 6 | -------------------------------------------------------------------------------- /documentation/grammars/verbalizer_serialization_spec.ascii_proto: -------------------------------------------------------------------------------- 1 | class_spec { 2 | semiotic_class: "measure" 3 | style_spec { 4 | record_spec { 5 | field_path: "measure.decimal.integer_part" 6 | } 7 | record_spec { 8 | field_path: "measure.decimal.fractional_part" 9 | } 10 | record_spec { 11 | field_path: "measure.units" 12 | } 13 | required_fields: "measure.decimal.integer_part" 14 | } 15 | } 16 | class_spec { 17 | semiotic_class: "money" 18 | style_spec { 19 | record_spec { 20 | field_path: "money.amount.integer_part" 21 | suffix_spec { 22 | field_path: "money.currency" 23 | } 24 | } 25 | record_spec { 26 | field_path: "money.amount.fractional_part" 27 | suffix_spec { 28 | field_path: "money.currency" 29 | } 30 | } 31 | } 32 | } 33 | class_spec { 
34 | semiotic_class: "cardinal" 35 | style_spec { 36 | record_spec { 37 | field_path: "cardinal.integer" 38 | } 39 | } 40 | } 41 | class_spec { 42 | semiotic_class: "time" 43 | style_spec { 44 | record_spec { 45 | field_path: "time.hours" 46 | } 47 | record_spec { 48 | field_path: "time.minutes" 49 | } 50 | } 51 | } 52 | class_spec { 53 | semiotic_class: "date" 54 | style_spec { 55 | record_spec { 56 | field_path: "date.day" 57 | } 58 | record_spec { 59 | field_path: "date.month" 60 | } 61 | record_spec { 62 | field_path: "date.year" 63 | } 64 | } 65 | style_spec { 66 | record_spec { 67 | field_path: "date.month" 68 | } 69 | record_spec { 70 | field_path: "date.day" 71 | } 72 | record_spec { 73 | field_path: "date.year" 74 | } 75 | } 76 | } 77 | class_spec { 78 | semiotic_class: "verbatim" 79 | style_spec { 80 | record_spec { 81 | field_path: "verbatim" 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /m4/ltsugar.m4: -------------------------------------------------------------------------------- 1 | # ltsugar.m4 -- libtool m4 base layer. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004, 2005, 2007, 2008 Free Software Foundation, Inc. 4 | # Written by Gary V. Vaughan, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # serial 6 ltsugar.m4 11 | 12 | # This is to help aclocal find these macros, as it can't see m4_define. 13 | AC_DEFUN([LTSUGAR_VERSION], [m4_if([0.1])]) 14 | 15 | 16 | # lt_join(SEP, ARG1, [ARG2...]) 17 | # ----------------------------- 18 | # Produce ARG1SEPARG2...SEPARGn, omitting [] arguments and their 19 | # associated separator. 20 | # Needed until we can rely on m4_join from Autoconf 2.62, since all earlier 21 | # versions in m4sugar had bugs. 
22 | m4_define([lt_join], 23 | [m4_if([$#], [1], [], 24 | [$#], [2], [[$2]], 25 | [m4_if([$2], [], [], [[$2]_])$0([$1], m4_shift(m4_shift($@)))])]) 26 | m4_define([_lt_join], 27 | [m4_if([$#$2], [2], [], 28 | [m4_if([$2], [], [], [[$1$2]])$0([$1], m4_shift(m4_shift($@)))])]) 29 | 30 | 31 | # lt_car(LIST) 32 | # lt_cdr(LIST) 33 | # ------------ 34 | # Manipulate m4 lists. 35 | # These macros are necessary as long as will still need to support 36 | # Autoconf-2.59 which quotes differently. 37 | m4_define([lt_car], [[$1]]) 38 | m4_define([lt_cdr], 39 | [m4_if([$#], 0, [m4_fatal([$0: cannot be called without arguments])], 40 | [$#], 1, [], 41 | [m4_dquote(m4_shift($@))])]) 42 | m4_define([lt_unquote], $1) 43 | 44 | 45 | # lt_append(MACRO-NAME, STRING, [SEPARATOR]) 46 | # ------------------------------------------ 47 | # Redefine MACRO-NAME to hold its former content plus `SEPARATOR'`STRING'. 48 | # Note that neither SEPARATOR nor STRING are expanded; they are appended 49 | # to MACRO-NAME as is (leaving the expansion for when MACRO-NAME is invoked). 50 | # No SEPARATOR is output if MACRO-NAME was previously undefined (different 51 | # than defined and empty). 52 | # 53 | # This macro is needed until we can rely on Autoconf 2.62, since earlier 54 | # versions of m4sugar mistakenly expanded SEPARATOR but not STRING. 55 | m4_define([lt_append], 56 | [m4_define([$1], 57 | m4_ifdef([$1], [m4_defn([$1])[$3]])[$2])]) 58 | 59 | 60 | 61 | # lt_combine(SEP, PREFIX-LIST, INFIX, SUFFIX1, [SUFFIX2...]) 62 | # ---------------------------------------------------------- 63 | # Produce a SEP delimited list of all paired combinations of elements of 64 | # PREFIX-LIST with SUFFIX1 through SUFFIXn. Each element of the list 65 | # has the form PREFIXmINFIXSUFFIXn. 66 | # Needed until we can rely on m4_combine added in Autoconf 2.62. 
67 | m4_define([lt_combine], 68 | [m4_if(m4_eval([$# > 3]), [1], 69 | [m4_pushdef([_Lt_sep], [m4_define([_Lt_sep], m4_defn([lt_car]))])]]dnl 70 | [[m4_foreach([_Lt_prefix], [$2], 71 | [m4_foreach([_Lt_suffix], 72 | ]m4_dquote(m4_dquote(m4_shift(m4_shift(m4_shift($@)))))[, 73 | [_Lt_sep([$1])[]m4_defn([_Lt_prefix])[$3]m4_defn([_Lt_suffix])])])])]) 74 | 75 | 76 | # lt_if_append_uniq(MACRO-NAME, VARNAME, [SEPARATOR], [UNIQ], [NOT-UNIQ]) 77 | # ----------------------------------------------------------------------- 78 | # Iff MACRO-NAME does not yet contain VARNAME, then append it (delimited 79 | # by SEPARATOR if supplied) and expand UNIQ, else NOT-UNIQ. 80 | m4_define([lt_if_append_uniq], 81 | [m4_ifdef([$1], 82 | [m4_if(m4_index([$3]m4_defn([$1])[$3], [$3$2$3]), [-1], 83 | [lt_append([$1], [$2], [$3])$4], 84 | [$5])], 85 | [lt_append([$1], [$2], [$3])$4])]) 86 | 87 | 88 | # lt_dict_add(DICT, KEY, VALUE) 89 | # ----------------------------- 90 | m4_define([lt_dict_add], 91 | [m4_define([$1($2)], [$3])]) 92 | 93 | 94 | # lt_dict_add_subkey(DICT, KEY, SUBKEY, VALUE) 95 | # -------------------------------------------- 96 | m4_define([lt_dict_add_subkey], 97 | [m4_define([$1($2:$3)], [$4])]) 98 | 99 | 100 | # lt_dict_fetch(DICT, KEY, [SUBKEY]) 101 | # ---------------------------------- 102 | m4_define([lt_dict_fetch], 103 | [m4_ifval([$3], 104 | m4_ifdef([$1($2:$3)], [m4_defn([$1($2:$3)])]), 105 | m4_ifdef([$1($2)], [m4_defn([$1($2)])]))]) 106 | 107 | 108 | # lt_if_dict_fetch(DICT, KEY, [SUBKEY], VALUE, IF-TRUE, [IF-FALSE]) 109 | # ----------------------------------------------------------------- 110 | m4_define([lt_if_dict_fetch], 111 | [m4_if(lt_dict_fetch([$1], [$2], [$3]), [$4], 112 | [$5], 113 | [$6])]) 114 | 115 | 116 | # lt_dict_filter(DICT, [SUBKEY], VALUE, [SEPARATOR], KEY, [...]) 117 | # -------------------------------------------------------------- 118 | m4_define([lt_dict_filter], 119 | [m4_if([$5], [], [], 120 | [lt_join(m4_quote(m4_default([$4], [[, ]])), 
121 | lt_unquote(m4_split(m4_normalize(m4_foreach(_Lt_key, lt_car([m4_shiftn(4, $@)]), 122 | [lt_if_dict_fetch([$1], _Lt_key, [$2], [$3], [_Lt_key ])])))))])[]dnl 123 | ]) 124 | -------------------------------------------------------------------------------- /m4/ltversion.m4: -------------------------------------------------------------------------------- 1 | # ltversion.m4 -- version numbers -*- Autoconf -*- 2 | # 3 | # Copyright (C) 2004 Free Software Foundation, Inc. 4 | # Written by Scott James Remnant, 2004 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # @configure_input@ 11 | 12 | # serial 3337 ltversion.m4 13 | # This file is part of GNU Libtool 14 | 15 | m4_define([LT_PACKAGE_VERSION], [2.4.2]) 16 | m4_define([LT_PACKAGE_REVISION], [1.3337]) 17 | 18 | AC_DEFUN([LTVERSION_VERSION], 19 | [macro_version='2.4.2' 20 | macro_revision='1.3337' 21 | _LT_DECL(, macro_version, 0, [Which release of libtool.m4 was used?]) 22 | _LT_DECL(, macro_revision, 0) 23 | ]) 24 | -------------------------------------------------------------------------------- /m4/lt~obsolete.m4: -------------------------------------------------------------------------------- 1 | # lt~obsolete.m4 -- aclocal satisfying obsolete definitions. -*-Autoconf-*- 2 | # 3 | # Copyright (C) 2004, 2005, 2007, 2009 Free Software Foundation, Inc. 4 | # Written by Scott James Remnant, 2004. 5 | # 6 | # This file is free software; the Free Software Foundation gives 7 | # unlimited permission to copy and/or distribute it, with or without 8 | # modifications, as long as this notice is preserved. 9 | 10 | # serial 5 lt~obsolete.m4 11 | 12 | # These exist entirely to fool aclocal when bootstrapping libtool. 
13 | # 14 | # In the past libtool.m4 has provided macros via AC_DEFUN (or AU_DEFUN) 15 | # which have later been changed to m4_define as they aren't part of the 16 | # exported API, or moved to Autoconf or Automake where they belong. 17 | # 18 | # The trouble is, aclocal is a bit thick. It'll see the old AC_DEFUN 19 | # in /usr/share/aclocal/libtool.m4 and remember it, then when it sees us 20 | # using a macro with the same name in our local m4/libtool.m4 it'll 21 | # pull the old libtool.m4 in (it doesn't see our shiny new m4_define 22 | # and doesn't know about Autoconf macros at all.) 23 | # 24 | # So we provide this file, which has a silly filename so it's always 25 | # included after everything else. This provides aclocal with the 26 | # AC_DEFUNs it wants, but when m4 processes it, it doesn't do anything 27 | # because those macros already exist, or will be overwritten later. 28 | # We use AC_DEFUN over AU_DEFUN for compatibility with aclocal-1.6. 29 | # 30 | # Anytime we withdraw an AC_DEFUN or AU_DEFUN, remember to add it here. 31 | # Yes, that means every name once taken will need to remain here until 32 | # we give up compatibility with versions before 1.7, at which point 33 | # we need to keep only those names which we still refer to. 34 | 35 | # This is to help aclocal find these macros, as it can't see m4_define. 
36 | AC_DEFUN([LTOBSOLETE_VERSION], [m4_if([1])]) 37 | 38 | m4_ifndef([AC_LIBTOOL_LINKER_OPTION], [AC_DEFUN([AC_LIBTOOL_LINKER_OPTION])]) 39 | m4_ifndef([AC_PROG_EGREP], [AC_DEFUN([AC_PROG_EGREP])]) 40 | m4_ifndef([_LT_AC_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_AC_PROG_ECHO_BACKSLASH])]) 41 | m4_ifndef([_LT_AC_SHELL_INIT], [AC_DEFUN([_LT_AC_SHELL_INIT])]) 42 | m4_ifndef([_LT_AC_SYS_LIBPATH_AIX], [AC_DEFUN([_LT_AC_SYS_LIBPATH_AIX])]) 43 | m4_ifndef([_LT_PROG_LTMAIN], [AC_DEFUN([_LT_PROG_LTMAIN])]) 44 | m4_ifndef([_LT_AC_TAGVAR], [AC_DEFUN([_LT_AC_TAGVAR])]) 45 | m4_ifndef([AC_LTDL_ENABLE_INSTALL], [AC_DEFUN([AC_LTDL_ENABLE_INSTALL])]) 46 | m4_ifndef([AC_LTDL_PREOPEN], [AC_DEFUN([AC_LTDL_PREOPEN])]) 47 | m4_ifndef([_LT_AC_SYS_COMPILER], [AC_DEFUN([_LT_AC_SYS_COMPILER])]) 48 | m4_ifndef([_LT_AC_LOCK], [AC_DEFUN([_LT_AC_LOCK])]) 49 | m4_ifndef([AC_LIBTOOL_SYS_OLD_ARCHIVE], [AC_DEFUN([AC_LIBTOOL_SYS_OLD_ARCHIVE])]) 50 | m4_ifndef([_LT_AC_TRY_DLOPEN_SELF], [AC_DEFUN([_LT_AC_TRY_DLOPEN_SELF])]) 51 | m4_ifndef([AC_LIBTOOL_PROG_CC_C_O], [AC_DEFUN([AC_LIBTOOL_PROG_CC_C_O])]) 52 | m4_ifndef([AC_LIBTOOL_SYS_HARD_LINK_LOCKS], [AC_DEFUN([AC_LIBTOOL_SYS_HARD_LINK_LOCKS])]) 53 | m4_ifndef([AC_LIBTOOL_OBJDIR], [AC_DEFUN([AC_LIBTOOL_OBJDIR])]) 54 | m4_ifndef([AC_LTDL_OBJDIR], [AC_DEFUN([AC_LTDL_OBJDIR])]) 55 | m4_ifndef([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH], [AC_DEFUN([AC_LIBTOOL_PROG_LD_HARDCODE_LIBPATH])]) 56 | m4_ifndef([AC_LIBTOOL_SYS_LIB_STRIP], [AC_DEFUN([AC_LIBTOOL_SYS_LIB_STRIP])]) 57 | m4_ifndef([AC_PATH_MAGIC], [AC_DEFUN([AC_PATH_MAGIC])]) 58 | m4_ifndef([AC_PROG_LD_GNU], [AC_DEFUN([AC_PROG_LD_GNU])]) 59 | m4_ifndef([AC_PROG_LD_RELOAD_FLAG], [AC_DEFUN([AC_PROG_LD_RELOAD_FLAG])]) 60 | m4_ifndef([AC_DEPLIBS_CHECK_METHOD], [AC_DEFUN([AC_DEPLIBS_CHECK_METHOD])]) 61 | m4_ifndef([AC_LIBTOOL_PROG_COMPILER_NO_RTTI], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_NO_RTTI])]) 62 | m4_ifndef([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE], [AC_DEFUN([AC_LIBTOOL_SYS_GLOBAL_SYMBOL_PIPE])]) 63 | 
m4_ifndef([AC_LIBTOOL_PROG_COMPILER_PIC], [AC_DEFUN([AC_LIBTOOL_PROG_COMPILER_PIC])]) 64 | m4_ifndef([AC_LIBTOOL_PROG_LD_SHLIBS], [AC_DEFUN([AC_LIBTOOL_PROG_LD_SHLIBS])]) 65 | m4_ifndef([AC_LIBTOOL_POSTDEP_PREDEP], [AC_DEFUN([AC_LIBTOOL_POSTDEP_PREDEP])]) 66 | m4_ifndef([LT_AC_PROG_EGREP], [AC_DEFUN([LT_AC_PROG_EGREP])]) 67 | m4_ifndef([LT_AC_PROG_SED], [AC_DEFUN([LT_AC_PROG_SED])]) 68 | m4_ifndef([_LT_CC_BASENAME], [AC_DEFUN([_LT_CC_BASENAME])]) 69 | m4_ifndef([_LT_COMPILER_BOILERPLATE], [AC_DEFUN([_LT_COMPILER_BOILERPLATE])]) 70 | m4_ifndef([_LT_LINKER_BOILERPLATE], [AC_DEFUN([_LT_LINKER_BOILERPLATE])]) 71 | m4_ifndef([_AC_PROG_LIBTOOL], [AC_DEFUN([_AC_PROG_LIBTOOL])]) 72 | m4_ifndef([AC_LIBTOOL_SETUP], [AC_DEFUN([AC_LIBTOOL_SETUP])]) 73 | m4_ifndef([_LT_AC_CHECK_DLFCN], [AC_DEFUN([_LT_AC_CHECK_DLFCN])]) 74 | m4_ifndef([AC_LIBTOOL_SYS_DYNAMIC_LINKER], [AC_DEFUN([AC_LIBTOOL_SYS_DYNAMIC_LINKER])]) 75 | m4_ifndef([_LT_AC_TAGCONFIG], [AC_DEFUN([_LT_AC_TAGCONFIG])]) 76 | m4_ifndef([AC_DISABLE_FAST_INSTALL], [AC_DEFUN([AC_DISABLE_FAST_INSTALL])]) 77 | m4_ifndef([_LT_AC_LANG_CXX], [AC_DEFUN([_LT_AC_LANG_CXX])]) 78 | m4_ifndef([_LT_AC_LANG_F77], [AC_DEFUN([_LT_AC_LANG_F77])]) 79 | m4_ifndef([_LT_AC_LANG_GCJ], [AC_DEFUN([_LT_AC_LANG_GCJ])]) 80 | m4_ifndef([AC_LIBTOOL_LANG_C_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_C_CONFIG])]) 81 | m4_ifndef([_LT_AC_LANG_C_CONFIG], [AC_DEFUN([_LT_AC_LANG_C_CONFIG])]) 82 | m4_ifndef([AC_LIBTOOL_LANG_CXX_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_CXX_CONFIG])]) 83 | m4_ifndef([_LT_AC_LANG_CXX_CONFIG], [AC_DEFUN([_LT_AC_LANG_CXX_CONFIG])]) 84 | m4_ifndef([AC_LIBTOOL_LANG_F77_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_F77_CONFIG])]) 85 | m4_ifndef([_LT_AC_LANG_F77_CONFIG], [AC_DEFUN([_LT_AC_LANG_F77_CONFIG])]) 86 | m4_ifndef([AC_LIBTOOL_LANG_GCJ_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_GCJ_CONFIG])]) 87 | m4_ifndef([_LT_AC_LANG_GCJ_CONFIG], [AC_DEFUN([_LT_AC_LANG_GCJ_CONFIG])]) 88 | m4_ifndef([AC_LIBTOOL_LANG_RC_CONFIG], [AC_DEFUN([AC_LIBTOOL_LANG_RC_CONFIG])]) 89 | 
m4_ifndef([_LT_AC_LANG_RC_CONFIG], [AC_DEFUN([_LT_AC_LANG_RC_CONFIG])]) 90 | m4_ifndef([AC_LIBTOOL_CONFIG], [AC_DEFUN([AC_LIBTOOL_CONFIG])]) 91 | m4_ifndef([_LT_AC_FILE_LTDLL_C], [AC_DEFUN([_LT_AC_FILE_LTDLL_C])]) 92 | m4_ifndef([_LT_REQUIRED_DARWIN_CHECKS], [AC_DEFUN([_LT_REQUIRED_DARWIN_CHECKS])]) 93 | m4_ifndef([_LT_AC_PROG_CXXCPP], [AC_DEFUN([_LT_AC_PROG_CXXCPP])]) 94 | m4_ifndef([_LT_PREPARE_SED_QUOTE_VARS], [AC_DEFUN([_LT_PREPARE_SED_QUOTE_VARS])]) 95 | m4_ifndef([_LT_PROG_ECHO_BACKSLASH], [AC_DEFUN([_LT_PROG_ECHO_BACKSLASH])]) 96 | m4_ifndef([_LT_PROG_F77], [AC_DEFUN([_LT_PROG_F77])]) 97 | m4_ifndef([_LT_PROG_FC], [AC_DEFUN([_LT_PROG_FC])]) 98 | m4_ifndef([_LT_PROG_CXX], [AC_DEFUN([_LT_PROG_CXX])]) 99 | -------------------------------------------------------------------------------- /missing: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # Common wrapper for a few potentially missing GNU programs. 3 | 4 | scriptversion=2013-10-28.13; # UTC 5 | 6 | # Copyright (C) 1996-2013 Free Software Foundation, Inc. 7 | # Originally written by Fran,cois Pinard , 1996. 8 | 9 | # This program is free software; you can redistribute it and/or modify 10 | # it under the terms of the GNU General Public License as published by 11 | # the Free Software Foundation; either version 2, or (at your option) 12 | # any later version. 13 | 14 | # This program is distributed in the hope that it will be useful, 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | # GNU General Public License for more details. 18 | 19 | # You should have received a copy of the GNU General Public License 20 | # along with this program. If not, see . 
21 | 22 | # As a special exception to the GNU General Public License, if you 23 | # distribute this file as part of a program that contains a 24 | # configuration script generated by Autoconf, you may include it under 25 | # the same distribution terms that you use for the rest of that program. 26 | 27 | if test $# -eq 0; then 28 | echo 1>&2 "Try '$0 --help' for more information" 29 | exit 1 30 | fi 31 | 32 | case $1 in 33 | 34 | --is-lightweight) 35 | # Used by our autoconf macros to check whether the available missing 36 | # script is modern enough. 37 | exit 0 38 | ;; 39 | 40 | --run) 41 | # Back-compat with the calling convention used by older automake. 42 | shift 43 | ;; 44 | 45 | -h|--h|--he|--hel|--help) 46 | echo "\ 47 | $0 [OPTION]... PROGRAM [ARGUMENT]... 48 | 49 | Run 'PROGRAM [ARGUMENT]...', returning a proper advice when this fails due 50 | to PROGRAM being missing or too old. 51 | 52 | Options: 53 | -h, --help display this help and exit 54 | -v, --version output version information and exit 55 | 56 | Supported PROGRAM values: 57 | aclocal autoconf autoheader autom4te automake makeinfo 58 | bison yacc flex lex help2man 59 | 60 | Version suffixes to PROGRAM as well as the prefixes 'gnu-', 'gnu', and 61 | 'g' are ignored when checking the name. 62 | 63 | Send bug reports to ." 64 | exit $? 65 | ;; 66 | 67 | -v|--v|--ve|--ver|--vers|--versi|--versio|--version) 68 | echo "missing $scriptversion (GNU Automake)" 69 | exit $? 70 | ;; 71 | 72 | -*) 73 | echo 1>&2 "$0: unknown '$1' option" 74 | echo 1>&2 "Try '$0 --help' for more information" 75 | exit 1 76 | ;; 77 | 78 | esac 79 | 80 | # Run the given program, remember its exit status. 81 | "$@"; st=$? 82 | 83 | # If it succeeded, we are done. 84 | test $st -eq 0 && exit 0 85 | 86 | # Also exit now if we it failed (or wasn't found), and '--version' was 87 | # passed; such an option is passed most likely to detect whether the 88 | # program is present and works. 
89 | case $2 in --version|--help) exit $st;; esac 90 | 91 | # Exit code 63 means version mismatch. This often happens when the user 92 | # tries to use an ancient version of a tool on a file that requires a 93 | # minimum version. 94 | if test $st -eq 63; then 95 | msg="probably too old" 96 | elif test $st -eq 127; then 97 | # Program was missing. 98 | msg="missing on your system" 99 | else 100 | # Program was found and executed, but failed. Give up. 101 | exit $st 102 | fi 103 | 104 | perl_URL=http://www.perl.org/ 105 | flex_URL=http://flex.sourceforge.net/ 106 | gnu_software_URL=http://www.gnu.org/software 107 | 108 | program_details () 109 | { 110 | case $1 in 111 | aclocal|automake) 112 | echo "The '$1' program is part of the GNU Automake package:" 113 | echo "<$gnu_software_URL/automake>" 114 | echo "It also requires GNU Autoconf, GNU m4 and Perl in order to run:" 115 | echo "<$gnu_software_URL/autoconf>" 116 | echo "<$gnu_software_URL/m4/>" 117 | echo "<$perl_URL>" 118 | ;; 119 | autoconf|autom4te|autoheader) 120 | echo "The '$1' program is part of the GNU Autoconf package:" 121 | echo "<$gnu_software_URL/autoconf/>" 122 | echo "It also requires GNU m4 and Perl in order to run:" 123 | echo "<$gnu_software_URL/m4/>" 124 | echo "<$perl_URL>" 125 | ;; 126 | esac 127 | } 128 | 129 | give_advice () 130 | { 131 | # Normalize program name to check for. 132 | normalized_program=`echo "$1" | sed ' 133 | s/^gnu-//; t 134 | s/^gnu//; t 135 | s/^g//; t'` 136 | 137 | printf '%s\n' "'$1' is $msg." 138 | 139 | configure_deps="'configure.ac' or m4 files included by 'configure.ac'" 140 | case $normalized_program in 141 | autoconf*) 142 | echo "You should only need it if you modified 'configure.ac'," 143 | echo "or m4 files included by it." 144 | program_details 'autoconf' 145 | ;; 146 | autoheader*) 147 | echo "You should only need it if you modified 'acconfig.h' or" 148 | echo "$configure_deps." 
149 | program_details 'autoheader' 150 | ;; 151 | automake*) 152 | echo "You should only need it if you modified 'Makefile.am' or" 153 | echo "$configure_deps." 154 | program_details 'automake' 155 | ;; 156 | aclocal*) 157 | echo "You should only need it if you modified 'acinclude.m4' or" 158 | echo "$configure_deps." 159 | program_details 'aclocal' 160 | ;; 161 | autom4te*) 162 | echo "You might have modified some maintainer files that require" 163 | echo "the 'autom4te' program to be rebuilt." 164 | program_details 'autom4te' 165 | ;; 166 | bison*|yacc*) 167 | echo "You should only need it if you modified a '.y' file." 168 | echo "You may want to install the GNU Bison package:" 169 | echo "<$gnu_software_URL/bison/>" 170 | ;; 171 | lex*|flex*) 172 | echo "You should only need it if you modified a '.l' file." 173 | echo "You may want to install the Fast Lexical Analyzer package:" 174 | echo "<$flex_URL>" 175 | ;; 176 | help2man*) 177 | echo "You should only need it if you modified a dependency" \ 178 | "of a man page." 179 | echo "You may want to install the GNU Help2man package:" 180 | echo "<$gnu_software_URL/help2man/>" 181 | ;; 182 | makeinfo*) 183 | echo "You should only need it if you modified a '.texi' file, or" 184 | echo "any other file indirectly affecting the aspect of the manual." 185 | echo "You might want to install the Texinfo package:" 186 | echo "<$gnu_software_URL/texinfo/>" 187 | echo "The spurious makeinfo call might also be the consequence of" 188 | echo "using a buggy 'make' (AIX, DU, IRIX), in which case you might" 189 | echo "want to install GNU make:" 190 | echo "<$gnu_software_URL/make/>" 191 | ;; 192 | *) 193 | echo "You might have modified some files without having the proper" 194 | echo "tools for further handling them. Check the 'README' file, it" 195 | echo "often tells you about the needed prerequisites for installing" 196 | echo "this package. 
You may also peek at any GNU archive site, in" 197 | echo "case some other package contains this missing '$1' program." 198 | ;; 199 | esac 200 | } 201 | 202 | give_advice "$1" | sed -e '1s/^/WARNING: /' \ 203 | -e '2,$s/^/ /' >&2 204 | 205 | # Propagate the correct exit status (expected to be 127 for a program 206 | # not found, 63 for a program that failed due to version mismatch). 207 | exit $st 208 | 209 | # Local variables: 210 | # eval: (add-hook 'write-file-hooks 'time-stamp) 211 | # time-stamp-start: "scriptversion=" 212 | # time-stamp-format: "%:y-%02m-%02d.%02H" 213 | # time-stamp-time-zone: "UTC" 214 | # time-stamp-end: "; # UTC" 215 | # End: 216 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | SUBDIRS = proto include lib bin 2 | 3 | 4 | -------------------------------------------------------------------------------- /src/bin/Makefile.am: -------------------------------------------------------------------------------- 1 | if HAVE_BIN 2 | bin_PROGRAMS = normalizer_main 3 | 4 | AM_CPPFLAGS = -I$(srcdir)/../include 5 | 6 | LDADD= ../lib/libsparrowhawk.la -L/usr/local/lib/fst -lthrax -lfstfar -lfst -lm -ldl -lprotobuf -lre2 7 | 8 | normalizer_main_SOURCES = normalizer_main.cc 9 | endif 10 | -------------------------------------------------------------------------------- /src/bin/normalizer_main.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 
3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Very simple stand-alone binary to run Sparrowhawk normalizer on a line of 15 | // text. 16 | // 17 | // It runs the sentence boundary detector on the input, and then normalizes each 18 | // sentence. 19 | // 20 | // As an example of use, build the test data here, and put them somewhere, such 21 | // as tmp/sparrowhawk_test 22 | // 23 | // Then copy the relevant fars and protos there, edit the protos and then run: 24 | // 25 | // blaze-bin/speech/tts/open_source/sparrowhawk/normalizer_main \ 26 | // --config tmp/sparrowhawk_test/sparrowhawk_configuration_af.ascii_proto 27 | // 28 | // Then input a few sentences on one line such as: 29 | // 30 | // Kameelperde het 'n kenmerkende voorkoms, met hul lang nekke en relatief \ 31 | // kort lywe. Hulle word 4,3 - 5,7m lank. Die bulle is effens langer as die \ 32 | // koeie. 
33 | 34 | #include 35 | #include 36 | #include 37 | using std::string; 38 | #include 39 | using std::vector; 40 | 41 | #include 42 | 43 | DEFINE_bool(multi_line_text, false, "Text is spread across multiple lines."); 44 | DEFINE_string(config, "", "Path to the configuration proto."); 45 | DEFINE_string(path_prefix, "./", "Optional path prefix if not relative."); 46 | 47 | void NormalizeInput(const string& input, 48 | speech::sparrowhawk::Normalizer *normalizer) { 49 | const std::vector sentences = normalizer->SentenceSplitter(input); 50 | for (const auto& sentence : sentences) { 51 | string output; 52 | normalizer->Normalize(sentence, &output); 53 | std::cout << output << std::endl; 54 | } 55 | } 56 | 57 | int main(int argc, char** argv) { 58 | using speech::sparrowhawk::Normalizer; 59 | std::set_new_handler(FailedNewHandler); 60 | SET_FLAGS(argv[0], &argc, &argv, true); 61 | std::unique_ptr normalizer; 62 | normalizer.reset(new Normalizer()); 63 | CHECK(normalizer->Setup(FLAGS_config, FLAGS_path_prefix)); 64 | string input; 65 | if (FLAGS_multi_line_text) { 66 | string line; 67 | while (std::getline(std::cin, line)) { 68 | if (!input.empty()) input += " "; 69 | input += line; 70 | } 71 | NormalizeInput(input, normalizer.get()); 72 | } else { 73 | while (std::getline(std::cin, input)) { 74 | NormalizeInput(input, normalizer.get()); 75 | } 76 | } 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /src/include/Makefile.am: -------------------------------------------------------------------------------- 1 | BUILT_SOURCES = $(srcdir)/sparrowhawk/items.pb.h $(srcdir)/sparrowhawk/links.pb.h \ 2 | $(srcdir)/sparrowhawk/rule_order.pb.h \ 3 | $(srcdir)/sparrowhawk/semiotic_classes.pb.h \ 4 | $(srcdir)/sparrowhawk/sparrowhawk_configuration.pb.h 5 | 6 | nobase_include_HEADERS = sparrowhawk/field_path.h \ 7 | sparrowhawk/io_utils.h \ 8 | sparrowhawk/logger.h \ 9 | sparrowhawk/normalizer.h \ 10 | sparrowhawk/numbers.h \ 11 
| sparrowhawk/protobuf_parser.h \ 12 | sparrowhawk/protobuf_serializer.h \ 13 | sparrowhawk/record_serializer.h \ 14 | sparrowhawk/regexp.h \ 15 | sparrowhawk/rule_system.h \ 16 | sparrowhawk/sentence_boundary.h \ 17 | sparrowhawk/spec_serializer.h \ 18 | sparrowhawk/string_utils.h \ 19 | sparrowhawk/style_serializer.h \ 20 | $(BUILT_SOURCES) 21 | 22 | sparrowhawk/items.pb.h: 23 | $(MAKE) -C $(srcdir)/../proto/ items.pb.h 24 | 25 | sparrowhawk/links.pb.h: 26 | $(MAKE) -C $(srcdir)/../proto/ links.pb.h 27 | 28 | sparrowhawk/rule_order.pb.h: 29 | $(MAKE) -C $(srcdir)/../proto/ rule_order.pb.h 30 | 31 | sparrowhawk/semiotic_classes.pb.h: 32 | $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.h 33 | 34 | sparrowhawk/serialization_spec.pb.h: 35 | $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.h 36 | 37 | sparrowhawk/sparrowhawk_configuration.pb.h: 38 | $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.h 39 | 40 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/field_path.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Utility to access specific subfields within a protocol buffer. FieldPath 15 | // objects make subfields available via Follow(). 
16 | // 17 | 18 | #ifndef SPARROWHAWK_FIELD_PATH_H_ 19 | #define SPARROWHAWK_FIELD_PATH_H_ 20 | 21 | #include 22 | #include 23 | using std::string; 24 | #include 25 | using std::vector; 26 | 27 | #include 28 | #include 29 | #include 30 | 31 | namespace speech { 32 | namespace sparrowhawk { 33 | 34 | class FieldPath { 35 | public: 36 | // Creates and returns a FieldPath using a descriptor for the type of 37 | // messages we intend to Follow(). 38 | // Returns a null value if the input pointer is null. 39 | static std::unique_ptr Create(const google::protobuf::Descriptor *root_type); 40 | 41 | // Replaces this field_path with input path_string of type: 42 | // (message_name.)*scalar_field_name 43 | // Returns false if an error occurs with either the format of the string or 44 | // with mismatches of type (e.g. a subfield of an integer) or label (i.e. an 45 | // index is supplied when the field is not repeated.) 46 | bool Parse(const string& path_string); 47 | 48 | // Clear all fields from path. 49 | void Clear(); 50 | 51 | inline const google::protobuf::Descriptor *GetRootType() const { return root_type_; } 52 | 53 | // Number of fields on this path. Does not count the root as a field. 54 | inline int GetLength() const { return path_.size(); } 55 | 56 | // True if GetLength() == 0. 57 | inline bool IsEmpty() const { return GetLength() == 0; } 58 | 59 | // Follows the path starting from the given base message. *parent is filled 60 | // in with the immediate parent of the field at the end of the path and *field 61 | // is filled in with the terminal field's descriptor. 62 | // You can then use reflection to query the field value. 63 | // 64 | // Returns false only if the base message is incorrect (the only error that 65 | // can't be detected at parsing time); in this case *parent and *field are 66 | // unchanged. 
67 | bool Follow(const google::protobuf::Message& base, const google::protobuf::Message **parent, 68 | const google::protobuf::FieldDescriptor **field) const; 69 | 70 | private: 71 | // Only used by the factory function Create. 72 | explicit FieldPath(const google::protobuf::Descriptor *root_type) 73 | : root_type_(root_type) {} 74 | 75 | // Parse intermediate message fields from input path. The parent is initially 76 | // root_type_ and is finally set to the penultimate field's descriptor. 77 | bool TraverseIntermediateFields(std::vector path, 78 | const google::protobuf::Descriptor **parent); 79 | 80 | // Parse terminal field "field" with given parent descriptor into path_. 81 | bool ParseTerminalField(const string &terminal_field_name, 82 | const google::protobuf::Descriptor *parent); 83 | 84 | std::vector path_; 85 | const google::protobuf::Descriptor *root_type_; 86 | }; 87 | 88 | } // namespace sparrowhawk 89 | } // namespace speech 90 | 91 | #endif // SPARROWHAWK_FIELD_PATH_H_ 92 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/io_utils.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google functionality for I/O. 
15 | #ifndef SPARROWHAWK_IO_UTILS_H_ 16 | #define SPARROWHAWK_IO_UTILS_H_ 17 | 18 | #include 19 | using std::string; 20 | 21 | #include 22 | namespace speech { 23 | namespace sparrowhawk { 24 | 25 | class IOStream { 26 | public: 27 | static string LoadFileToString(const string &filename); 28 | }; 29 | 30 | } // namespace sparrowhawk 31 | } // namespace speech 32 | 33 | #endif // SPARROWHAWK_IO_UTILS_H_ 34 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/links.pb.h: -------------------------------------------------------------------------------- 1 | // Generated by the protocol buffer compiler. DO NOT EDIT! 2 | // source: links.proto 3 | 4 | #ifndef PROTOBUF_links_2eproto__INCLUDED 5 | #define PROTOBUF_links_2eproto__INCLUDED 6 | 7 | #include 8 | 9 | #include 10 | 11 | #if GOOGLE_PROTOBUF_VERSION < 2005000 12 | #error This file was generated by a newer version of protoc which is 13 | #error incompatible with your Protocol Buffer headers. Please update 14 | #error your headers. 15 | #endif 16 | #if 2005000 < GOOGLE_PROTOBUF_MIN_PROTOC_VERSION 17 | #error This file was generated by an older version of protoc which is 18 | #error incompatible with your Protocol Buffer headers. Please 19 | #error regenerate this file with a newer version of protoc. 20 | #endif 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | // @@protoc_insertion_point(includes) 28 | 29 | namespace speech { 30 | namespace sparrowhawk { 31 | 32 | // Internal implementation detail -- do not call these. 
33 | void protobuf_AddDesc_links_2eproto(); 34 | void protobuf_AssignDesc_links_2eproto(); 35 | void protobuf_ShutdownFile_links_2eproto(); 36 | 37 | class Links; 38 | 39 | // =================================================================== 40 | 41 | class Links : public ::google::protobuf::Message { 42 | public: 43 | Links(); 44 | virtual ~Links(); 45 | 46 | Links(const Links& from); 47 | 48 | inline Links& operator=(const Links& from) { 49 | CopyFrom(from); 50 | return *this; 51 | } 52 | 53 | inline const ::google::protobuf::UnknownFieldSet& unknown_fields() const { 54 | return _unknown_fields_; 55 | } 56 | 57 | inline ::google::protobuf::UnknownFieldSet* mutable_unknown_fields() { 58 | return &_unknown_fields_; 59 | } 60 | 61 | static const ::google::protobuf::Descriptor* descriptor(); 62 | static const Links& default_instance(); 63 | 64 | void Swap(Links* other); 65 | 66 | // implements Message ---------------------------------------------- 67 | 68 | Links* New() const; 69 | void CopyFrom(const ::google::protobuf::Message& from); 70 | void MergeFrom(const ::google::protobuf::Message& from); 71 | void CopyFrom(const Links& from); 72 | void MergeFrom(const Links& from); 73 | void Clear(); 74 | bool IsInitialized() const; 75 | 76 | int ByteSize() const; 77 | bool MergePartialFromCodedStream( 78 | ::google::protobuf::io::CodedInputStream* input); 79 | void SerializeWithCachedSizes( 80 | ::google::protobuf::io::CodedOutputStream* output) const; 81 | ::google::protobuf::uint8* SerializeWithCachedSizesToArray(::google::protobuf::uint8* output) const; 82 | int GetCachedSize() const { return _cached_size_; } 83 | private: 84 | void SharedCtor(); 85 | void SharedDtor(); 86 | void SetCachedSize(int size) const; 87 | public: 88 | 89 | ::google::protobuf::Metadata GetMetadata() const; 90 | 91 | // nested types ---------------------------------------------------- 92 | 93 | // accessors ------------------------------------------------------- 94 | 95 | // optional int32 
own_index = 1; 96 | inline bool has_own_index() const; 97 | inline void clear_own_index(); 98 | static const int kOwnIndexFieldNumber = 1; 99 | inline ::google::protobuf::int32 own_index() const; 100 | inline void set_own_index(::google::protobuf::int32 value); 101 | 102 | // optional int32 parent = 2; 103 | inline bool has_parent() const; 104 | inline void clear_parent(); 105 | static const int kParentFieldNumber = 2; 106 | inline ::google::protobuf::int32 parent() const; 107 | inline void set_parent(::google::protobuf::int32 value); 108 | 109 | // optional int32 first_child = 3; 110 | inline bool has_first_child() const; 111 | inline void clear_first_child(); 112 | static const int kFirstChildFieldNumber = 3; 113 | inline ::google::protobuf::int32 first_child() const; 114 | inline void set_first_child(::google::protobuf::int32 value); 115 | 116 | // optional int32 last_child = 4; 117 | inline bool has_last_child() const; 118 | inline void clear_last_child(); 119 | static const int kLastChildFieldNumber = 4; 120 | inline ::google::protobuf::int32 last_child() const; 121 | inline void set_last_child(::google::protobuf::int32 value); 122 | 123 | // @@protoc_insertion_point(class_scope:speech.sparrowhawk.Links) 124 | private: 125 | inline void set_has_own_index(); 126 | inline void clear_has_own_index(); 127 | inline void set_has_parent(); 128 | inline void clear_has_parent(); 129 | inline void set_has_first_child(); 130 | inline void clear_has_first_child(); 131 | inline void set_has_last_child(); 132 | inline void clear_has_last_child(); 133 | 134 | ::google::protobuf::UnknownFieldSet _unknown_fields_; 135 | 136 | ::google::protobuf::int32 own_index_; 137 | ::google::protobuf::int32 parent_; 138 | ::google::protobuf::int32 first_child_; 139 | ::google::protobuf::int32 last_child_; 140 | 141 | mutable int _cached_size_; 142 | ::google::protobuf::uint32 _has_bits_[(4 + 31) / 32]; 143 | 144 | friend void protobuf_AddDesc_links_2eproto(); 145 | friend void 
protobuf_AssignDesc_links_2eproto(); 146 | friend void protobuf_ShutdownFile_links_2eproto(); 147 | 148 | void InitAsDefaultInstance(); 149 | static Links* default_instance_; 150 | }; 151 | // =================================================================== 152 | 153 | 154 | // =================================================================== 155 | 156 | // Links 157 | 158 | // optional int32 own_index = 1; 159 | inline bool Links::has_own_index() const { 160 | return (_has_bits_[0] & 0x00000001u) != 0; 161 | } 162 | inline void Links::set_has_own_index() { 163 | _has_bits_[0] |= 0x00000001u; 164 | } 165 | inline void Links::clear_has_own_index() { 166 | _has_bits_[0] &= ~0x00000001u; 167 | } 168 | inline void Links::clear_own_index() { 169 | own_index_ = 0; 170 | clear_has_own_index(); 171 | } 172 | inline ::google::protobuf::int32 Links::own_index() const { 173 | return own_index_; 174 | } 175 | inline void Links::set_own_index(::google::protobuf::int32 value) { 176 | set_has_own_index(); 177 | own_index_ = value; 178 | } 179 | 180 | // optional int32 parent = 2; 181 | inline bool Links::has_parent() const { 182 | return (_has_bits_[0] & 0x00000002u) != 0; 183 | } 184 | inline void Links::set_has_parent() { 185 | _has_bits_[0] |= 0x00000002u; 186 | } 187 | inline void Links::clear_has_parent() { 188 | _has_bits_[0] &= ~0x00000002u; 189 | } 190 | inline void Links::clear_parent() { 191 | parent_ = 0; 192 | clear_has_parent(); 193 | } 194 | inline ::google::protobuf::int32 Links::parent() const { 195 | return parent_; 196 | } 197 | inline void Links::set_parent(::google::protobuf::int32 value) { 198 | set_has_parent(); 199 | parent_ = value; 200 | } 201 | 202 | // optional int32 first_child = 3; 203 | inline bool Links::has_first_child() const { 204 | return (_has_bits_[0] & 0x00000004u) != 0; 205 | } 206 | inline void Links::set_has_first_child() { 207 | _has_bits_[0] |= 0x00000004u; 208 | } 209 | inline void Links::clear_has_first_child() { 210 | 
_has_bits_[0] &= ~0x00000004u; 211 | } 212 | inline void Links::clear_first_child() { 213 | first_child_ = 0; 214 | clear_has_first_child(); 215 | } 216 | inline ::google::protobuf::int32 Links::first_child() const { 217 | return first_child_; 218 | } 219 | inline void Links::set_first_child(::google::protobuf::int32 value) { 220 | set_has_first_child(); 221 | first_child_ = value; 222 | } 223 | 224 | // optional int32 last_child = 4; 225 | inline bool Links::has_last_child() const { 226 | return (_has_bits_[0] & 0x00000008u) != 0; 227 | } 228 | inline void Links::set_has_last_child() { 229 | _has_bits_[0] |= 0x00000008u; 230 | } 231 | inline void Links::clear_has_last_child() { 232 | _has_bits_[0] &= ~0x00000008u; 233 | } 234 | inline void Links::clear_last_child() { 235 | last_child_ = 0; 236 | clear_has_last_child(); 237 | } 238 | inline ::google::protobuf::int32 Links::last_child() const { 239 | return last_child_; 240 | } 241 | inline void Links::set_last_child(::google::protobuf::int32 value) { 242 | set_has_last_child(); 243 | last_child_ = value; 244 | } 245 | 246 | 247 | // @@protoc_insertion_point(namespace_scope) 248 | 249 | } // namespace sparrowhawk 250 | } // namespace speech 251 | 252 | #ifndef SWIG 253 | namespace google { 254 | namespace protobuf { 255 | 256 | 257 | } // namespace google 258 | } // namespace protobuf 259 | #endif // SWIG 260 | 261 | // @@protoc_insertion_point(global_scope) 262 | 263 | #endif // PROTOBUF_links_2eproto__INCLUDED 264 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/logger.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 
3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google functionality for logging. 15 | #ifndef SPARROWHAWK_LOGGER_H_ 16 | #define SPARROWHAWK_LOGGER_H_ 17 | 18 | // TODO(rws): Write a more respectable logging system or link to some 19 | // open-source substitute. 20 | 21 | #include 22 | namespace speech { 23 | namespace sparrowhawk { 24 | 25 | 26 | } // namespace sparrowhawk 27 | } // namespace speech 28 | 29 | 30 | #define LoggerFormat(format) \ 31 | string(string("[%s:%s:%d] ") + format).c_str() 32 | 33 | #define LoggerMessage(type, format, ...) \ 34 | fprintf(stderr, \ 35 | LoggerFormat(format), \ 36 | type, \ 37 | __FILE__, \ 38 | __LINE__, \ 39 | ##__VA_ARGS__) 40 | 41 | #define LoggerDebug(format, ...) LoggerMessage("DEBUG", format, ##__VA_ARGS__) 42 | 43 | #define LoggerError(format, ...) LoggerMessage("ERROR", format, ##__VA_ARGS__) 44 | 45 | #define LoggerFatal(format, ...) { \ 46 | LoggerMessage("FATAL", format, ##__VA_ARGS__); \ 47 | exit(1); } \ 48 | 49 | #define LoggerInfo(format, ...) LoggerMessage("INFO", format, ##__VA_ARGS__) 50 | 51 | #define LoggerWarn(format, ...) 
LoggerMessage("WARNING", format, ##__VA_ARGS__) 52 | 53 | 54 | #endif // SPARROWHAWK_LOGGER_H_ 55 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/normalizer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // The normalizer is the main part of Sparrowhawk. Loosely speaking it follows 15 | // the discussion of the (Google-internal) Kestrel system as described in 16 | // 17 | // Ebden, Peter and Sproat, Richard. 2015. The Kestrel TTS text normalization 18 | // system. Natural Language Engineering, Issue 03, pp 333-353. 19 | // 20 | // After sentence segmentation (sentence_boundary.h), the individual sentences 21 | // are first tokenized with each token being classified, and then passed to the 22 | // normalizer. The system can output as an unannotated string of words, and 23 | // richer annotation with links between input tokens, their input string 24 | // positions, and the output words is also available. 
25 | 26 | #ifndef SPARROWHAWK_NORMALIZER_H_ 27 | #define SPARROWHAWK_NORMALIZER_H_ 28 | 29 | #include 30 | using std::string; 31 | #include 32 | using std::vector; 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | namespace speech { 42 | namespace sparrowhawk { 43 | 44 | class Normalizer { 45 | public: 46 | Normalizer(); 47 | 48 | ~Normalizer(); 49 | 50 | // The functions definitions have been split across two files, normalizer.cc 51 | // and normalizer_utils.cc, just to keep things a littler tidier. Below we 52 | // indicate where each function is found. 53 | 54 | // normalizer.cc 55 | // Method to load and set data for each derived method 56 | bool Setup(const string &configuration_proto, const string &pathname_prefix); 57 | 58 | // normalizer.cc 59 | // Interface to the normalization system for callers that want to be agnostic 60 | // about utterances. 61 | bool Normalize(const string &input, string *output) const; 62 | 63 | // normalizer.cc 64 | // Interface to the normalization system for callers that want to be agnostic 65 | // about utterances. Shows the token/word alignment. 66 | bool NormalizeAndShowLinks(const string &input, string *output) const; 67 | 68 | // normalizer_utils.cc 69 | // Helper for linearizing words from an utterance into a string 70 | string LinearizeWords(Utterance *utt) const; 71 | 72 | // normalizer_utils.cc 73 | // Helper for showing the indices of all tokens, words and their alignment 74 | // links. 75 | string ShowLinks(Utterance *utt) const; 76 | 77 | // normalizer.cc 78 | // Preprocessor to use the sentence splitter to break up text into 79 | // sentences. An application would normally call this first, and then 80 | // normalize each of the resulting sentences. 81 | std::vector SentenceSplitter(const string &input) const; 82 | 83 | private: 84 | // normalizer.cc 85 | // Internal interface to normalization. 
86 | bool Normalize(Utterance *utt, const string &input) const; 87 | 88 | // normalizer_utils.cc 89 | // As in Kestrel, adds a phrase and silence. 90 | // TODO(rws): Possibly remove this since it is actually not being used. 91 | void AddPhraseToUtt(Utterance *utt) const { AddPhraseToUtt(utt, false); } 92 | 93 | // normalizer_utils.cc 94 | void AddPhraseToUtt(Utterance *utt, bool addword) const; 95 | 96 | // normalizer_utils.cc 97 | // Adds a single word to the end of the Word stream 98 | Word* AddWord(Utterance *utt, Token *token, 99 | const string &spelling) const; 100 | 101 | // normalizer_utils.cc 102 | // Function to add the words in the string 'name' onto the 103 | // end of the Word stream. 104 | Word* AddWords(Utterance *utt, Token *token, 105 | const string &name) const; 106 | 107 | // Finds the index of the provided token. 108 | int TokenIndex(Utterance *utt, Token *token) const; 109 | // normalizer_utils.cc 110 | // As with Peter's comment in 111 | // speech/patts2/modules/kestrel/verbalize_general.cc, clear out all the mucky 112 | // fields that we don't want verbalization to see. 113 | void CleanFields(Token *markup) const; 114 | 115 | // normalizer_utils.cc 116 | // Returns the substring of the input between left and right 117 | string InputSubstring(int left, int right) const; 118 | 119 | // normalizer.cc 120 | // Performs tokenization and classification on the input utterance, the first 121 | // step of normalization 122 | bool TokenizeAndClassifyUtt(Utterance *utt, const string &input) const; 123 | 124 | // normalizer_utils.cc 125 | // Serializes the contents of a Token to a string 126 | string ToString(const Token &markup) const; 127 | 128 | // normalizer.cc 129 | // Verbalizes semiotic classes, defaulting to verbatim verbalization for 130 | // something that is marked as a semiotic class but for which the 131 | // verbalization grammar fails. 
132 | bool VerbalizeSemioticClass(const Token &markup, string *words) const; 133 | 134 | // normalizer.cc 135 | // Performs verbalization on the input utterance, the second step of 136 | // normalization 137 | bool VerbalizeUtt(Utterance *utt) const; 138 | 139 | string input_; 140 | std::unique_ptr tokenizer_classifier_rules_; 141 | std::unique_ptr verbalizer_rules_; 142 | std::unique_ptr sentence_boundary_; 143 | std::unique_ptr spec_serializer_; 144 | std::set sentence_boundary_exceptions_; 145 | 146 | DISALLOW_COPY_AND_ASSIGN(Normalizer); 147 | }; 148 | 149 | } // namespace sparrowhawk 150 | } // namespace speech 151 | 152 | #endif // SPARROWHAWK_NORMALIZER_H_ 153 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/numbers.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | // Various utilities to replace Google functionality for safe_strtoX 15 | #ifndef SPARROWHAWK_NUMBERS_H_ 16 | #define SPARROWHAWK_NUMBERS_H_ 17 | 18 | #include 19 | #include 20 | using std::string; 21 | 22 | #include 23 | namespace speech { 24 | namespace sparrowhawk { 25 | 26 | typedef int32_t int32; 27 | typedef int64_t int64; 28 | 29 | bool safe_strtof(const string &value, float *output); 30 | 31 | bool safe_strtod(const string &value, double *output); 32 | 33 | bool safe_strto32(const string &value, int32 *output); 34 | 35 | bool safe_strto64(const string &value, int64 *output); 36 | 37 | } // namespace sparrowhawk 38 | } // namespace speech 39 | 40 | #endif // SPARROWHAWK_NUMBERS_H_ 41 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/protobuf_parser.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // This is a basic parser for reading protobufs directly from 15 | // FSTs. The main advantage this offers for the moment is the 16 | // ability to track token start/end points, but later can be 17 | // extended to other types and may ultimately be portable to 18 | // Android. 19 | // 20 | // This class is not thread safe since it needs to store internal 21 | // parse state. 
The expectation is to create temporary local instances 22 | // of it rather than persisting a single shared instance. 23 | 24 | #ifndef SPARROWHAWK_PROTOBUF_PARSER_H_ 25 | #define SPARROWHAWK_PROTOBUF_PARSER_H_ 26 | 27 | #include 28 | using std::string; 29 | #include 30 | using std::vector; 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | namespace speech { 38 | namespace sparrowhawk { 39 | 40 | using thrax::GrmManager; 41 | class Utterance; 42 | class Token; 43 | 44 | class ProtobufParser { 45 | public: 46 | typedef GrmManager::Transducer Transducer; 47 | 48 | explicit ProtobufParser(const Transducer *fst); 49 | ~ProtobufParser(); 50 | 51 | // Parses tokens from the member FST into the Token stream of the 52 | // utterance. Note that, as the name suggests, it *cannot* parse 53 | // other streams such as Word, Specification, etc. 54 | // This assumes that the FST has a unique path through it 55 | // (ie. has been created via ShortestPath()) 56 | bool ParseTokensFromFST(Utterance *utt, 57 | bool set_semiotic_class = true, 58 | bool fix_lookahead = false); 59 | 60 | // Parses the given message from the member FST. 61 | // Message must have been registered with ProtobufField for this 62 | // to succeed. 63 | // This assumes that the FST has a unique path through it 64 | // (ie. has been created via ShortestPath()) 65 | bool ParseMessageFromFST(google::protobuf::Message *message); 66 | 67 | protected: 68 | typedef fst::StateIterator StateIterator; 69 | typedef fst::ArcIterator ArcIterator; 70 | typedef Transducer::Arc::Label Label; 71 | typedef Transducer::StateId StateId; 72 | 73 | // Parses a single message from the FST. The message name and opening brace 74 | // have already been consumed; this goes until the closing brace. 75 | // If eof_allowed is true then it's not a failure to reach the end of the FST 76 | // before finding a closing brace. 
77 | bool ParseMessage(bool eof_allowed, google::protobuf::Message *message); 78 | 79 | // Parses a single field value from the FST. The field name has already been 80 | // consumed, this just stores the value in the given string. 81 | bool ParseFieldValue(string *value); 82 | 83 | // As above, but deals with a quoted field which is rather trickier due to 84 | // escaping and suchforth. The first quote has already been consumed. 85 | bool ParseQuotedFieldValue(bool ignore_backslashes, string *value); 86 | 87 | // Consumes a single token label from the FST, ie. a message or field name. 88 | // Returns true if label found, false if not. 89 | bool ConsumeLabel(string *label); 90 | 91 | // Consumes any output whitespace from the FST. 92 | void ConsumeWhitespace(); 93 | 94 | // Moves to the next state in the FST. Returns true if one was found, false 95 | // if the end has been reached. 96 | bool NextState(); 97 | 98 | // Backs up to the previous state. Can only back up once, so should only be 99 | // called once between each call to NextState(). 100 | void PrevState(); 101 | 102 | // Updates start/end indices on a token that we've just parsed. 103 | void UpdateTokenIndices(Token *token, 104 | bool set_semiotic_class, 105 | bool fix_lookahead); 106 | 107 | // Logs an error message on parsing fail. 108 | void LogError(); 109 | 110 | // Records the field orders if there is a preserve_order field and it's true 111 | bool RecordFieldOrder(google::protobuf::Message *message, 112 | const std::vector &field_order); 113 | 114 | // Applies fixes to the token names caused by lookahead FSTs. 115 | void FixLookahead(Utterance *utt); 116 | 117 | // Sets the content of a field. 118 | void SetField(google::protobuf::Message *message, 119 | const google::protobuf::Reflection *reflection, 120 | const google::protobuf::FieldDescriptor *descriptor, 121 | const string &value) const; 122 | 123 | // FST we're parsing from. 124 | const Transducer *fst_; 125 | // Current state that we're up to. 
126 | StateId state_; 127 | // The previous state 128 | StateId last_state_; 129 | // Input/output labels from the last arc. 130 | Label ilabel_; 131 | Label olabel_; 132 | // Start index of the current token. 133 | int token_start_; 134 | // End index of the immediately preceding token. 135 | int last_token_end_; 136 | // Name of the current token (ie. its input text). 137 | string token_name_; 138 | // Name (input labels) of the immediately preceding token. 139 | string last_token_name_; 140 | }; 141 | 142 | } // namespace sparrowhawk 143 | } // namespace speech 144 | 145 | #endif // SPARROWHAWK_PROTOBUF_PARSER_H_ 146 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/protobuf_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // This is a class to serialize protocol buffers directly 15 | // into a FST in preparation for them to be verbalized. 16 | // The main advantage of this for us is that we produce a FST with multiple 17 | // orderings which the verbalizer can consume however it wants; this 18 | // removes the necessity for the prior reordering hacks etc. 
19 | // 20 | // As with ProtobufParser, this class is not threadsafe as it stores 21 | // internal state; the expectation is to create temporary local instances 22 | // of it rather than persisting a single shared instance. 23 | 24 | #ifndef SPARROWHAWK_PROTOBUF_SERIALIZER_H_ 25 | #define SPARROWHAWK_PROTOBUF_SERIALIZER_H_ 26 | 27 | #include 28 | using std::vector; 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | namespace speech { 37 | namespace sparrowhawk { 38 | 39 | using thrax::GrmManager; 40 | class Utterance; 41 | 42 | class ProtobufSerializer { 43 | public: 44 | typedef GrmManager::MutableTransducer MutableTransducer; 45 | typedef MutableTransducer::Arc::StateId StateId; 46 | 47 | // Serializes message into given fst. 48 | ProtobufSerializer(const google::protobuf::Message *message, 49 | MutableTransducer *fst); 50 | ~ProtobufSerializer(); 51 | 52 | // Serializes the message into the FST. 53 | void SerializeToFst(); 54 | 55 | // Serializes the message into a string 56 | string SerializeToString() const; 57 | 58 | protected: 59 | typedef google::protobuf::FieldDescriptor FieldDescriptor; 60 | typedef std::vector FieldDescriptorVector; 61 | typedef fst::StateIterator StateIterator; 62 | typedef fst::ArcIterator ArcIterator; 63 | typedef MutableTransducer::Arc Arc; 64 | typedef Arc::Label Label; 65 | 66 | // Internal constructor that allows selecting the state to begin from. 67 | ProtobufSerializer(const google::protobuf::Message *message, 68 | MutableTransducer *fst, 69 | StateId state); 70 | 71 | // Serializes the entire message into the FST, and returns the final state id. 72 | StateId SerializeToFstInternal(); 73 | 74 | // Serializes a single permutation into the FST. 75 | void SerializePermutation(const FieldDescriptorVector &fields); 76 | 77 | // Serializes a single field into the FST. 
78 | StateId SerializeField(const FieldDescriptor *field, 79 | int index, 80 | StateId state); 81 | 82 | // Serializes a string into the FST. 83 | StateId SerializeString(const string &str, StateId state); 84 | 85 | // As above, allowing control of whether quotes are optional or not. 86 | StateId SerializeString(const string &str, 87 | StateId state, 88 | bool optional_quotes); 89 | 90 | // Serializes a single character into the FST. 91 | StateId SerializeChar(char c, StateId state); 92 | 93 | // Links the last arc that has a non-space output symbol to the new final 94 | // state by adding an epsilon arc from this arc's destination state to the new 95 | // final state, cutting out unnecessary whitespace and connecting multiple 96 | // permutations to a common destination. 97 | void StripTrailingSpace(StateId new_final_state); 98 | 99 | const google::protobuf::Message *message_; 100 | const google::protobuf::Reflection *reflection_; 101 | MutableTransducer *fst_; 102 | const StateId initial_state_; 103 | static const RE2 kReTrailingZeroes; 104 | static const int kReNumMatchGroups; 105 | 106 | private: 107 | DISALLOW_COPY_AND_ASSIGN(ProtobufSerializer); 108 | }; 109 | 110 | } // namespace sparrowhawk 111 | } // namespace speech 112 | 113 | #endif // SPARROWHAWK_PROTOBUF_SERIALIZER_H_ 114 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/record_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Recursively serializes a single record in the spec and concatenates onto a 15 | // transducer. 16 | // 17 | // Typically the serialized field content looks like 18 | // :| 19 | // Note that nothing is serialized if the field corresponding to the record_spec 20 | // field_path is missing in the token. 21 | // 22 | // This is used by the StyleSerializer for serializing all the records in a 23 | // given style. It constructs the RecordSerializer for each record in the 24 | // style_spec. Given a token it sequentially invokes the Serialize function of 25 | // the records in the style being serialized. 26 | 27 | #ifndef SPARROWHAWK_RECORD_SERIALIZER_H_ 28 | #define SPARROWHAWK_RECORD_SERIALIZER_H_ 29 | 30 | #include 31 | #include 32 | using std::vector; 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | namespace speech { 42 | namespace sparrowhawk { 43 | 44 | class RecordSerializer { 45 | public: 46 | typedef fst::StdVectorFst MutableTransducer; 47 | 48 | // Creates and returns a RecordSerializer from the record_spec by noting the 49 | // field path and path label the record and recursively building 50 | // record_serializers for prefix and suffix specs. 51 | // Returns a null value if the spec is not well-formed. 52 | static std::unique_ptr Create( 53 | const RecordSpec &record_spec); 54 | 55 | // Serializes a token using the record spec, returns true only if the token 56 | // serializes correctly as per the record spec. For the input token, it 57 | // recursively traverses field_paths noted in the record_serializer and its 58 | // affix_serializers and concatenates serialized field content onto the 59 | // input fst. 
60 | bool Serialize(const Token &token, MutableTransducer *fst) const; 61 | 62 | private: 63 | typedef MutableTransducer::Arc Arc; 64 | typedef Arc::StateId StateId; 65 | typedef Arc::Weight Weight; 66 | typedef fst::StringCompiler StringCompiler; 67 | 68 | // Only used by the factory function Create. 69 | RecordSerializer(); 70 | 71 | // Serializers for prefix specs in the specification. 72 | std::vector> prefix_serializers_; 73 | 74 | // Serializers for suffix specs in the specification. 75 | std::vector> suffix_serializers_; 76 | 77 | // Field path for the record_spec field. 78 | std::unique_ptr field_path_; 79 | 80 | // String denoting the terminating field's name for the record spec. 81 | string field_name_; 82 | 83 | // Default value to be emitted when field is not set. 84 | string default_value_; 85 | 86 | // Pattern to be escaped - record_separator or escape_character. 87 | RE2 escape_re_; 88 | 89 | // Replacement string for escape pattern - prepended escape_character. 90 | string escape_replacement_; 91 | 92 | // String Compiler for making fsts from strings. 93 | StringCompiler string_compiler_; 94 | 95 | // Serializes a record, escaping record_separator and escape_character. 96 | // Also serializes various factorizations as parallel arcs into the FST. 97 | void SerializeRecord(string *value, 98 | MutableTransducer *fst) const; 99 | 100 | // Assumes that the (non-repeated) field is set for the parent, and checks 101 | // that it corresponds to a scalar value. Also, in this case, adds an arc to 102 | // fst between states start and end, optionally adding a new state for end if 103 | // a sentinel is passed for end. It is an error to invoke this with a 104 | // repeated field. 
105 | bool SerializeToFst(const google::protobuf::Message &parent, 106 | const google::protobuf::FieldDescriptor &field, 107 | MutableTransducer *fst) const; 108 | 109 | // Assumes that the (repeated) field is set for the parent, and checks that it 110 | // corresponds to a scalar value. Also, in this case, adds an arc to 111 | // fst between states start and end, optionally adding a new state for end if 112 | // a sentinel is passed for end. It is an error to invoke this with a 113 | // non-repeated field. 114 | bool SerializeToFstRepeated(const google::protobuf::Message &parent, 115 | const google::protobuf::FieldDescriptor &field, 116 | const int index, 117 | MutableTransducer *fst) const; 118 | 119 | // Recursively serializes prefix and suffix records into respective 120 | // transducers using appropriate record serializers. 121 | bool SerializeAffixes(const Token &token, 122 | MutableTransducer *prefix_fst, 123 | MutableTransducer *suffix_fst) const; 124 | 125 | DISALLOW_COPY_AND_ASSIGN(RecordSerializer); 126 | }; 127 | 128 | } // namespace sparrowhawk 129 | } // namespace speech 130 | 131 | #endif // SPARROWHAWK_RECORD_SERIALIZER_H_ 132 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/regexp.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 
12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google wrapper for re2. 15 | // Wrapper class regex library, it's very basic, and wraps the re2 16 | // regexp library. 17 | 18 | #ifndef SPARROWHAWK_REGEXP_H_ 19 | #define SPARROWHAWK_REGEXP_H_ 20 | 21 | #include 22 | using std::string; 23 | #include 24 | using std::vector; 25 | 26 | #include 27 | #include 28 | 29 | namespace speech { 30 | namespace sparrowhawk { 31 | 32 | // A regmatch is one match result - there may be one or more per string. 33 | struct RegMatch { 34 | int start_char; 35 | int end_char; 36 | string full_str; 37 | // number of sub-expressions 38 | int n_sub; 39 | int len; 40 | // if the regexp contained subexpressions 41 | std::vector sub_str; 42 | std::vector sub_start; 43 | std::vector sub_end; 44 | }; 45 | 46 | class Regexp { 47 | public: 48 | Regexp(); 49 | ~Regexp(); 50 | 51 | // Compiles a regexp. Returns true if compile successful. 52 | bool Compile(const string &pattern); 53 | 54 | // The number of sub expressions for this regexp. 55 | int nsubexp() const; 56 | 57 | // Checks for any match at all. Returns true if match. 58 | bool CheckFullMatch(const string &input) const; 59 | 60 | // Checks for any match at all. Returns true if match. 61 | bool CheckMatch(const string &input) const; 62 | 63 | // Checks for any match at all. Returns true if match. 64 | static bool CheckMatch(const string &input, const string &pattern); 65 | 66 | // Gets vector of start and end chars for all matching string parts 67 | // returns number of matches. Fills the matches vector with RegMatch objects. 68 | int GetAllMatches(const string &input, 69 | std::vector *matches) const; 70 | 71 | // Accessor for boolean whether this has been successfully compiled 72 | bool ok() const; 73 | 74 | // Deletes and resets internal data. 
75 | void Clear(); 76 | 77 | private: 78 | // The underlying compiled regexp object internal structure 79 | RE2 *re_; 80 | 81 | int32 nsubexp_; 82 | 83 | DISALLOW_COPY_AND_ASSIGN(Regexp); 84 | }; 85 | 86 | } // namespace sparrowhawk 87 | } // namespace speech 88 | 89 | #endif // SPARROWHAWK_REGEXP_H_ 90 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/rule_system.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // A rule system consists of a cascaded set of grammar targets defined by 15 | // Thrax. See rule_order.proto for a description of what each rule complex can 16 | // contain. 
17 | #ifndef SPARROWHAWK_RULE_SYSTEM_H_ 18 | #define SPARROWHAWK_RULE_SYSTEM_H_ 19 | 20 | #include 21 | #include 22 | #include 23 | using std::string; 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | namespace speech { 31 | namespace sparrowhawk { 32 | 33 | using fst::LabelLookAheadRelabeler; 34 | using fst::StdArcLookAheadFst; 35 | using fst::StdILabelLookAheadFst; 36 | using fst::StdOLabelLookAheadFst; 37 | using thrax::GrmManager; 38 | 39 | typedef fst::Fst Transducer; 40 | typedef fst::VectorFst MutableTransducer; 41 | 42 | typedef StdILabelLookAheadFst LookaheadFst; 43 | 44 | class RuleSystem { 45 | public: 46 | RuleSystem() { } 47 | ~RuleSystem(); 48 | 49 | // Loads a protobuf containing the filename of the grammar far 50 | // and the rule specifications as defined in rule_order.proto. 51 | bool LoadGrammar(const string& filename, const string& prefix); 52 | 53 | // This one returns the epsilon-free output projection of all 54 | // paths. use_lookahead constructs a lookahead FST for the composition. 55 | bool ApplyRules(const Transducer& input, 56 | MutableTransducer* output, 57 | bool use_lookahead) const; 58 | 59 | // These two return the string of the shortest path. 60 | bool ApplyRules(const string& input, 61 | string* output, 62 | bool use_lookahead) const; 63 | 64 | bool ApplyRules(const Transducer& input, 65 | string* output, 66 | bool use_lookahead) const; 67 | 68 | // Find the named transducer or NULL if nonexistent. 
69 | const Transducer* FindRule(const string& name) const; 70 | 71 | const string& grammar_name() const { return grammar_name_; } 72 | 73 | private: 74 | Grammar grammar_; 75 | string grammar_name_; 76 | std::unique_ptr grm_; 77 | // Precomputed lookahead transducers 78 | mutable std::map lookaheads_; 79 | }; 80 | 81 | } // namespace sparrowhawk 82 | } // namespace speech 83 | 84 | #endif // SPARROWHAWK_RULE_SYSTEM_H_ 85 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/sentence_boundary.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Simple interface for splitting text into sentences. Uses a regular expression 15 | // to define plausible end-of-sentence markers, and allows for a list of 16 | // exceptions --- e.g. abbreviations that end in periods that would not normally 17 | // signal a sentence boundary. 
18 | #ifndef SPARROWHAWK_SENTENCE_BOUNDARY_H_ 19 | #define SPARROWHAWK_SENTENCE_BOUNDARY_H_ 20 | 21 | #include 22 | #include 23 | using std::string; 24 | #include 25 | using std::vector; 26 | 27 | #include 28 | #include 29 | 30 | namespace speech { 31 | namespace sparrowhawk { 32 | 33 | class SentenceBoundary { 34 | public: 35 | explicit SentenceBoundary(const string ®exp); 36 | 37 | // Loads exceptions, such as abbreviations that end in periods, things like 38 | // "Y!", or whatever. Note that these are all case sensitive, so one must 39 | // provide alternate forms if one expects that the form may be cased 40 | // differently. 41 | bool LoadSentenceBoundaryExceptions(const string &filename); 42 | 43 | std::vector ExtractSentences(const string &input_text) const; 44 | 45 | // If true, then prefixes each exception in the exception list with a space, 46 | // so that it when matching against a potential end-of-sentence position, it 47 | // will force the match to occur only when there is a preceding space, or at 48 | // the beginning of the string. 49 | void set_pad_exceptions_with_space_prefix(bool 50 | pad_exceptions_with_space_prefix) { 51 | pad_exceptions_with_space_prefix_ = pad_exceptions_with_space_prefix; 52 | } 53 | 54 | private: 55 | // Returns true if the candidate position is a plausible sentence 56 | // boundary. Currently uses the regexp and the sentence boundary exceptions 57 | // list, but could be replaced with something learned. 
58 | bool EvaluateCandidate(const string &input_text, const string &marker) const; 59 | 60 | std::unique_ptr regexp_; 61 | std::vector sentence_boundary_exceptions_; 62 | bool pad_exceptions_with_space_prefix_; 63 | DISALLOW_COPY_AND_ASSIGN(SentenceBoundary); 64 | }; 65 | 66 | } // namespace sparrowhawk 67 | } // namespace speech 68 | 69 | #endif // SPARROWHAWK_SENTENCE_BOUNDARY_H_ 70 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/spec_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Serializes a token based on a given spec for simple, fast verbalization. 15 | // Iteratively serializes the styles in a class_spec which are concatenated as 16 | // parallel arcs onto a transducer, which is returned as output. 
17 | 18 | #ifndef SPARROWHAWK_SPEC_SERIALIZER_H_ 19 | #define SPARROWHAWK_SPEC_SERIALIZER_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | using std::vector; 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | namespace speech { 34 | namespace sparrowhawk { 35 | 36 | class Serializer { 37 | public: 38 | typedef fst::StdVectorFst MutableTransducer; 39 | 40 | // Creates and returns a Serializer from the serialize_spec by creating 41 | // style_serializers for all its style_specs and storing the name of the 42 | // semiotic class. 43 | // Returns a null value if the spec is not well-formed. 44 | static std::unique_ptr Create( 45 | const SerializeSpec &serialize_spec); 46 | 47 | // Serializes a token using the serialization spec, i.e. builds an fst 48 | // corresponding to the serialization of the token. Appends a label for the 49 | // semiotic class name at the front and then adds parallel arcs for the 50 | // different valid style_specs. 51 | MutableTransducer Serialize(const Token &token) const; 52 | 53 | private: 54 | typedef MutableTransducer::Arc Arc; 55 | typedef fst::StringCompiler StringCompiler; 56 | 57 | // Only used by the factory function Create. 58 | Serializer() : string_compiler_(fst::StringTokenType::BYTE) {} 59 | 60 | // String Compiler for making fsts from strings. 61 | StringCompiler string_compiler_; 62 | 63 | // Map to store the serialization indexed by field descriptors. 
64 | std::map>> serializers_; 66 | 67 | DISALLOW_COPY_AND_ASSIGN(Serializer); 68 | }; 69 | 70 | } // namespace sparrowhawk 71 | } // namespace speech 72 | 73 | #endif // SPARROWHAWK_SPEC_SERIALIZER_H_ 74 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/string_utils.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Various utilities to replace Google functionality for strings. 15 | #ifndef SPARROWHAWK_STRING_UTILS_H_ 16 | #define SPARROWHAWK_STRING_UTILS_H_ 17 | 18 | #include 19 | using std::string; 20 | #include 21 | using std::vector; 22 | 23 | #include 24 | namespace speech { 25 | namespace sparrowhawk { 26 | 27 | // Splits string s by sep and returns a vector of strings. 28 | std::vector SplitString(const string &s, const string &delims); 29 | 30 | // Splits string s by sep and returns a vector of strings, skipping empties. 
31 | std::vector SplitString(const string &s, 32 | const string &delims, 33 | bool skip_empty); 34 | 35 | // Strips whitespace off the beginning and end 36 | string StripWhitespace(const string &s); 37 | 38 | } // namespace sparrowhawk 39 | } // namespace speech 40 | 41 | #endif // SPARROWHAWK_STRING_UTILS_H_ 42 | -------------------------------------------------------------------------------- /src/include/sparrowhawk/style_serializer.h: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // Iteratively serializes the records in a style_spec which are serially 15 | // concatenated onto a transducer. 16 | // 17 | // Typically the serialized field content looks like 18 | // (:|)* 19 | // where each unit is the serialization of a record. 20 | // 21 | // This is used by the Serializer for serializing all the styles in a given 22 | // semiotic class. It constructs the StyleSerializer for each style in the 23 | // class_spec permitted by the prohibited/requested values. Given a token it 24 | // sequentially invokes the Serialize function of the styles in the class being 25 | // serialized. 
26 | 27 | #ifndef SPARROWHAWK_STYLE_SERIALIZER_H_ 28 | #define SPARROWHAWK_STYLE_SERIALIZER_H_ 29 | 30 | #include 31 | #include 32 | using std::vector; 33 | 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | namespace speech { 43 | namespace sparrowhawk { 44 | 45 | class StyleSerializer { 46 | public: 47 | typedef fst::StdVectorFst MutableTransducer; 48 | 49 | // Creates and returns a StyleSerializer from the style_spec by creating 50 | // record_serializers for all its record_specs and storing field_paths of 51 | // required and prohibited fields. 52 | // Returns a null value if the spec is not well-formed. 53 | static std::unique_ptr Create(const StyleSpec &style_spec); 54 | 55 | // Serializes a token using the style spec, returns true only for valid 56 | // styles satisfying required/prohibited field constraints. If so, all the 57 | // records in the style are serialized onto the input fst. 58 | bool Serialize(const Token &token, MutableTransducer *serialization) const; 59 | 60 | private: 61 | // Only used by the factory function Create. 62 | StyleSerializer() {} 63 | 64 | // Populates record_serializers_ using style_spec. 65 | static bool CreateRecordSerializers(const StyleSpec &style_spec, 66 | const std::unique_ptr &style_serializer); 67 | 68 | // Populates required_fields_ using style_spec. 69 | static bool SetRequiredFieldPaths(const StyleSpec &style_spec, 70 | const std::unique_ptr &style_serializer); 71 | 72 | // Populates prohibited_fields_ using style_spec. 73 | static bool SetProhibitedFieldPaths(const StyleSpec &style_spec, 74 | const std::unique_ptr &style_serializer); 75 | 76 | // Checks required_fields_ in token. 77 | bool CheckRequiredFields(const Token &token) const; 78 | 79 | // Checks prohibited_fields_ in token. 80 | bool CheckProhibitedFields(const Token &token) const; 81 | 82 | // FieldPaths to required fields in the specification. 
83 | std::vector> required_fields_; 84 | 85 | // FieldPaths to prohibited fields in the specification. 86 | std::vector prohibited_fields_; 87 | 88 | // Record serializers for the record specs in the style. 89 | std::vector> record_serializers_; 90 | 91 | // Takes as input a message and a target field path ending in a scalar field 92 | // to within the input message and returns true if the field at the end of the 93 | // path is set. It further assumes that all the intermediate messages are 94 | // non-repeated, although the terminating field itself may be repeated. 95 | bool IsFieldSet(const google::protobuf::Message &root, 96 | const FieldPath &field_path) const; 97 | 98 | DISALLOW_COPY_AND_ASSIGN(StyleSerializer); 99 | }; 100 | 101 | } // namespace sparrowhawk 102 | } // namespace speech 103 | 104 | #endif // SPARROWHAWK_STYLE_SERIALIZER_H_ 105 | -------------------------------------------------------------------------------- /src/lib/Makefile.am: -------------------------------------------------------------------------------- 1 | # Need both because of the output of protoc 2 | AM_CPPFLAGS = -I$(srcdir)/../include -I$(srcdir)/../include/sparrowhawk 3 | 4 | lib_LTLIBRARIES = libsparrowhawk.la 5 | proto_sources = items.pb.cc \ 6 | links.pb.cc \ 7 | rule_order.pb.cc \ 8 | semiotic_classes.pb.cc \ 9 | serialization_spec.pb.cc \ 10 | sparrowhawk_configuration.pb.cc 11 | 12 | libsparrowhawk_la_SOURCES = field_path.cc \ 13 | io_utils.cc \ 14 | normalizer.cc \ 15 | normalizer_utils.cc \ 16 | numbers.cc \ 17 | protobuf_parser.cc \ 18 | protobuf_serializer.cc \ 19 | record_serializer.cc \ 20 | regexp.cc \ 21 | rule_system.cc \ 22 | sentence_boundary.cc \ 23 | spec_serializer.cc \ 24 | string_utils.cc \ 25 | style_serializer.cc \ 26 | $(proto_sources) 27 | 28 | libsparrowhawk_la_LDFLAGS = -version-info 0:0:0 29 | 30 | items.pb.cc: 31 | $(MAKE) -C $(srcdir)/../proto/ items.pb.cc 32 | 33 | links.pb.cc: 34 | $(MAKE) -C $(srcdir)/../proto/ links.pb.cc 35 | 36 | 
rule_order.pb.cc: 37 | $(MAKE) -C $(srcdir)/../proto/ rule_order.pb.cc 38 | 39 | semiotic_classes.pb.cc: 40 | $(MAKE) -C $(srcdir)/../proto/ semiotic_classes.pb.cc 41 | 42 | serialization_spec.pb.cc: 43 | $(MAKE) -C $(srcdir)/../proto/ serialization_spec.pb.cc 44 | 45 | sparrowhawk_configuration.pb.cc: 46 | $(MAKE) -C $(srcdir)/../proto/ sparrowhawk_configuration.pb.cc 47 | 48 | -------------------------------------------------------------------------------- /src/lib/field_path.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | #include 20 | using std::vector; 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | namespace speech { 27 | namespace sparrowhawk { 28 | 29 | using google::protobuf::Descriptor; 30 | using google::protobuf::FieldDescriptor; 31 | using google::protobuf::Message; 32 | 33 | std::unique_ptr FieldPath::Create( 34 | const Descriptor *root_type) { 35 | if (root_type == nullptr) { 36 | return nullptr; 37 | } else { 38 | std::unique_ptr field_path(new FieldPath(root_type)); 39 | return field_path; 40 | } 41 | } 42 | 43 | void FieldPath::Clear() { 44 | path_.clear(); 45 | } 46 | 47 | bool FieldPath::Follow(const Message &base, const Message **parent, 48 | const FieldDescriptor **field) const { 49 | if (base.GetDescriptor() != root_type_) { 50 | LOG(ERROR) << "Input Message to Follow is of type " 51 | << base.GetDescriptor()->name() 52 | << " while the field_path root type is " << root_type_->name(); 53 | return false; 54 | } 55 | const Message *inner_message = &base; 56 | int size = path_.size(); 57 | for (int i = 0; i < size - 1; ++i) { 58 | // Iterating over singular messages. 59 | inner_message = &inner_message->GetReflection()->GetMessage(*inner_message, 60 | path_[i]); 61 | } 62 | *parent = inner_message; 63 | *field = path_[size - 1]; 64 | return true; 65 | } 66 | 67 | // Helper function to go through the intermediate message fields. 
68 | bool FieldPath::TraverseIntermediateFields( 69 | std::vector path, 70 | const google::protobuf::Descriptor **parent) { 71 | for (int i = 0; i < path.size() - 1; ++i) { 72 | string &field_name = path[i]; 73 | const FieldDescriptor *field = (*parent)->FindFieldByName(field_name); 74 | if (field == nullptr) { 75 | LOG(ERROR) << (*parent)->full_name() 76 | << " does not contain a field named '" 77 | << field_name << "'."; 78 | return false; 79 | } 80 | if (field->type() != FieldDescriptor::TYPE_MESSAGE) { 81 | LOG(ERROR) << "Non-terminal field " << field->full_name() 82 | << " is not a message."; 83 | return false; 84 | } 85 | path_.push_back(field); 86 | *parent = field->message_type(); 87 | } 88 | return true; 89 | } 90 | 91 | // Helper function to parse the terminal scalar field. 92 | bool FieldPath::ParseTerminalField(const string &terminal_field_name, 93 | const Descriptor *parent) { 94 | const FieldDescriptor *terminal_field = 95 | parent->FindFieldByName(terminal_field_name); 96 | if (terminal_field == nullptr) { 97 | LOG(ERROR) << parent->full_name() << " does not contain a field named '" 98 | << terminal_field_name << "'."; 99 | return false; 100 | } else if (terminal_field->type() == FieldDescriptor::TYPE_MESSAGE) { 101 | LOG(ERROR) << "Terminal field " << terminal_field->full_name() 102 | << " is a message."; 103 | return false; 104 | } else { 105 | path_.push_back(terminal_field); 106 | } 107 | return true; 108 | } 109 | 110 | bool FieldPath::Parse(const string &path_string) { 111 | // Overwriting without clearing the field_path is illegal. 112 | if (!IsEmpty()) { 113 | LOG(ERROR) << "Cannot overwrite field_path. 
Use Clear() to reset."; 114 | return false; 115 | } 116 | std::vector path = SplitString(path_string, "."); 117 | const Descriptor *parent = root_type_; 118 | if (TraverseIntermediateFields(path, &parent) && 119 | ParseTerminalField(path.back(), parent)) { 120 | return true; 121 | } 122 | Clear(); 123 | return false; 124 | } 125 | 126 | } // namespace sparrowhawk 127 | } // namespace speech 128 | -------------------------------------------------------------------------------- /src/lib/io_utils.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | #include 15 | 16 | #include 17 | #include 18 | using std::ifstream; 19 | #include 20 | 21 | #include 22 | 23 | namespace speech { 24 | namespace sparrowhawk { 25 | 26 | string IOStream::LoadFileToString(const string &filename) { 27 | std::ifstream strm(filename.c_str(), std::ios_base::in); 28 | if (!strm) { 29 | LoggerFatal("Error opening file %s", filename.c_str()); 30 | } 31 | strm.seekg(0, strm.end); 32 | int length = strm.tellg(); 33 | strm.seekg(0, strm.beg); 34 | std::unique_ptr data(new char[length + 1], 35 | std::default_delete()); 36 | strm.read(data.get(), length); 37 | if (strm.fail()) { 38 | LoggerFatal("Error loading from file %s", filename.c_str()); 39 | } 40 | data.get()[length] = 0; 41 | return string(data.get(), length); 42 | } 43 | 44 | } // namespace sparrowhawk 45 | } // namespace speech 46 | -------------------------------------------------------------------------------- /src/lib/normalizer.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
// NOTE(review): This region is a line-wrapped render of src/lib/normalizer.cc.
// The "NN |" markers are rendering artifacts, and the renderer also ate every
// #include target and all template arguments (e.g. std::unique_ptr<...>,
// fst::StringCompiler<...>); restore those from the original source -- do not
// guess them from this dump.
//
// Normalizer::Setup: reads the SparrowhawkConfiguration text proto from
// pathname_prefix/configuration_proto, builds the tokenizer-classifier and
// verbalizer RuleSystems (returning false if either grammar fails to load;
// a missing grammar field is only logged here), installs the sentence
// boundary regexp (config value, else kDefaultSentenceBoundaryRegexp), and
// optionally loads boundary exceptions (failure logged, not fatal).
14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace speech { 33 | namespace sparrowhawk { 34 | 35 | // TODO(rws): We actually need to do something with this. 36 | const char kDefaultSentenceBoundaryRegexp[] = "[\\.:!\\?] "; 37 | 38 | Normalizer::Normalizer() { } 39 | 40 | Normalizer::~Normalizer() { } 41 | 42 | bool Normalizer::Setup(const string &configuration_proto, 43 | const string &pathname_prefix) { 44 | SparrowhawkConfiguration configuration; 45 | string proto_string = IOStream::LoadFileToString(pathname_prefix + 46 | "/" + configuration_proto); 47 | if (!google::protobuf::TextFormat::ParseFromString(proto_string, &configuration)) 48 | return false; 49 | if (!(configuration.has_tokenizer_grammar())) 50 | LoggerError("Configuration does not define a tokenizer-classifier grammar"); 51 | if (!(configuration.has_verbalizer_grammar())) 52 | LoggerError("Configuration does not define a verbalizer grammar"); 53 | tokenizer_classifier_rules_.reset(new RuleSystem); 54 | if (!tokenizer_classifier_rules_->LoadGrammar( 55 | configuration.tokenizer_grammar(), 56 | pathname_prefix)) 57 | return false; 58 | verbalizer_rules_.reset(new RuleSystem); 59 | if (!verbalizer_rules_->LoadGrammar(configuration.verbalizer_grammar(), 60 | pathname_prefix)) 61 | return false; 62 | string sentence_boundary_regexp; 63 | if (configuration.has_sentence_boundary_regexp()) { 64 | sentence_boundary_regexp = configuration.sentence_boundary_regexp(); 65 | } else { 66 | sentence_boundary_regexp = kDefaultSentenceBoundaryRegexp; 67 | } 68 | sentence_boundary_.reset(new SentenceBoundary(sentence_boundary_regexp)); 69 | if (configuration.has_sentence_boundary_exceptions_file()) { 70 | if (!sentence_boundary_->LoadSentenceBoundaryExceptions( 71 | configuration.sentence_boundary_exceptions_file())) { 72 |
// Setup continues: a serialization spec, when configured, must load and parse
// or Setup fails. Normalize(string, string*): full pipeline into a scratch
// Utterance, then linearizes the word stream. NormalizeAndShowLinks: same but
// dumps token/word links. TokenizeAndClassifyUtt: compiles the input as a
// byte-mode string FST and applies tokenizer rules WITH lookahead.
LoggerError("Cannot load sentence boundary exceptions file: %s", 73 | configuration.sentence_boundary_exceptions_file().c_str()); 74 | } 75 | } 76 | if (configuration.has_serialization_spec()) { 77 | string spec_string = IOStream::LoadFileToString( 78 | pathname_prefix + "/" + configuration.serialization_spec()); 79 | SerializeSpec spec; 80 | if (spec_string.empty() || 81 | !google::protobuf::TextFormat::ParseFromString(spec_string, &spec) || 82 | (spec_serializer_ = Serializer::Create(spec)) == nullptr) { 83 | LoggerError("Failed to load a valid serialization spec from file: %s", 84 | configuration.serialization_spec().c_str()); 85 | return false; 86 | } 87 | } 88 | return true; 89 | } 90 | 91 | bool Normalizer::Normalize(const string &input, string *output) const { 92 | std::unique_ptr utt; 93 | utt.reset(new Utterance); 94 | if (!Normalize(utt.get(), input)) return false; 95 | *output = LinearizeWords(utt.get()); 96 | return true; 97 | } 98 | 99 | bool Normalizer::Normalize(Utterance *utt, const string &input) const { 100 | return TokenizeAndClassifyUtt(utt, input) && VerbalizeUtt(utt); 101 | } 102 | 103 | bool Normalizer::NormalizeAndShowLinks( 104 | const string &input, string *output) const { 105 | std::unique_ptr utt; 106 | utt.reset(new Utterance); 107 | if (!Normalize(utt.get(), input)) return false; 108 | *output = ShowLinks(utt.get()); 109 | return true; 110 | } 111 | 112 | bool Normalizer::TokenizeAndClassifyUtt(Utterance *utt, 113 | const string &input) const { 114 | typedef fst::StringCompiler Compiler; 115 | Compiler compiler(fst::StringTokenType::BYTE); 116 | MutableTransducer input_fst, output; 117 | if (!compiler(input, &input_fst)) { 118 | LoggerError("Failed to compile input string \"%s\"", input.c_str()); 119 | return false; 120 | } 121 | if (!tokenizer_classifier_rules_->ApplyRules(input_fst, 122 | &output, 123 | true /* use_lookahead */)) { 124 | LoggerError("Failed to tokenize \"%s\"", input.c_str()); 125 | return false; 126 | } 127 |
// TokenizeAndClassifyUtt continues: takes the shortest path of the rule
// output and parses Token protos out of the FST.
MutableTransducer shortest_path; 128 | fst::ShortestPath(output, &shortest_path); 129 | ProtobufParser parser(&shortest_path); 130 | if (!parser.ParseTokensFromFST(utt, true /* set SEMIOTIC_CLASS */)) { 131 | LoggerError("Failed to parse tokens from FST for \"%s\"", input.c_str()); 132 | return false; 133 | } 134 | return true; 135 | } 136 | 137 | // As in Kestrel's Run(), this processes each token in turn and creates the Word 138 | // stream, adding words each with a unique wordid. Takes a different action on 139 | // the type: 140 | // 141 | // PUNCT: do nothing 142 | // SEMIOTIC_CLASS: call verbalizer FSTs 143 | // WORD: add to word stream 144 | bool Normalizer::VerbalizeUtt(Utterance *utt) const { 145 | for (int i = 0; i < utt->linguistic().tokens_size(); ++i) { 146 | Token *token = utt->mutable_linguistic()->mutable_tokens(i); 147 | string token_form = ToString(*token); 148 | token->set_first_daughter(-1); // Sets to default unset. 149 | token->set_last_daughter(-1); // Sets to default unset. 150 | // Add a single silence for punctuation that forms phrase breaks. This is 151 | // set via the grammar, though ultimately we'd like a proper phrasing 152 | // module.
// VerbalizeUtt body: PUNCT phrase breaks add at most one "sil" word (the
// words_size()/id() check prevents doubled silences); SEMIOTIC_CLASS tokens
// are verbalized, falling back to a verbatim reading of token->name() when
// the first pass fails; WORD tokens require a wordid. VerbalizeUtt always
// returns true -- per-token failures are only logged.
153 | if (token->type() == Token::PUNCT) { 154 | if (token->phrase_break() && 155 | (utt->linguistic().words_size() == 0 || 156 | utt->linguistic().words( 157 | utt->linguistic().words_size() - 1).id() != "sil")) { 158 | AddWord(utt, token, "sil"); 159 | } 160 | } else if (token->type() == Token::SEMIOTIC_CLASS) { 161 | if (!token->skip()) { 162 | LoggerDebug("Verbalizing: [%s]\n", token_form.c_str()); 163 | string words; 164 | if (VerbalizeSemioticClass(*token, &words)) { 165 | AddWords(utt, token, words); 166 | } else { 167 | LoggerWarn("First-pass verbalization FAILED for [%s]", 168 | token_form.c_str()); 169 | // Back off to verbatim reading 170 | string original_token = token->name(); 171 | token->Clear(); 172 | token->set_name(original_token); 173 | token->set_verbatim(original_token); 174 | if (VerbalizeSemioticClass(*token, &words)) { 175 | LoggerWarn("Reversion to verbatim succeeded for [%s]", 176 | original_token.c_str()); 177 | AddWords(utt, token, words); 178 | } else { 179 | // If we've done our checks right, we should never get here 180 | LoggerError("Verbalization FAILED for [%s]", token_form.c_str()); 181 | } 182 | } 183 | } 184 | } else if (token->type() == Token::WORD) { 185 | if (token->has_wordid()) { 186 | AddWord(utt, token, token->wordid()); 187 | } else { 188 | LoggerError("Token [%s] has type WORD but there is no word id", 189 | token_form.c_str()); 190 | } 191 | } else { 192 | LoggerError("No type found for [%s]", token_form.c_str()); 193 | } 194 | } 195 | LoggerDebug("Verbalize output: Words\n%s\n\n", LinearizeWords(utt).c_str()); 196 | return true; 197 | } 198 | 199 | bool Normalizer::VerbalizeSemioticClass(const Token &markup, 200 | string *words) const { 201 | Token local(markup); 202 | CleanFields(&local); 203 | MutableTransducer input_fst; 204 | if (spec_serializer_ == nullptr) { 205 | ProtobufSerializer serializer(&local, &input_fst); 206 | serializer.SerializeToFst(); 207 | } else { 208 | input_fst =
// VerbalizeSemioticClass continues: applies verbalizer rules WITHOUT
// lookahead on the serialized token copy. The tail of this region is the
// license/header of src/lib/normalizer_utils.cc (its #include targets were
// also lost in rendering).
spec_serializer_->Serialize(local); 209 | } 210 | if (!verbalizer_rules_->ApplyRules(input_fst, 211 | words, 212 | false /* use_lookahead */)) { 213 | LoggerError("Failed to verbalize \"%s\"", ToString(local).c_str()); 214 | return false; 215 | } 216 | return true; 217 | } 218 | 219 | std::vector Normalizer::SentenceSplitter(const string &input) const { 220 | return sentence_boundary_->ExtractSentences(input); 221 | } 222 | 223 | } // namespace sparrowhawk 224 | } // namespace speech 225 | -------------------------------------------------------------------------------- /src/lib/normalizer_utils.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | // TODO(rws): This is small enough now that maybe we don't really need this 15 | // separate file. 16 | // 17 | // More definitions for the Normalizer class, put here because they are icky 18 | // low-level hanky panky. 19 | 20 | // utt->AppendToken() 21 | // utt->AppendWord() 22 | 23 | #include 24 | #include 25 | using std::string; 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | namespace speech { 33 | namespace sparrowhawk { 34 | 35 | // Same as in Kestrel: add a phrase boundary at the beginning and ending of the 36 | // utterance.
// NOTE(review): wrapped render of src/lib/normalizer_utils.cc; template
// arguments (e.g. std::vector<string> in AddWords) were eaten by the
// renderer.
//
// AddPhraseToUtt: appends a PUNCT phrase-break token with empty name,
// optionally adding a "sil" word under it. TokenIndex: linear scan for the
// token's index in the utterance; -1 if not found. AddWord: appends a Word,
// maintains the owning token's first/last daughter indices, and sets both
// spelling and id to `spelling`.
37 | 38 | void Normalizer::AddPhraseToUtt(Utterance* utt, bool addword) const { 39 | Token* token = utt->mutable_linguistic()->add_tokens(); 40 | token->set_type(Token::PUNCT); 41 | token->set_name(""); 42 | token->set_phrase_break(true); 43 | if (addword) AddWord(utt, token, "sil"); 44 | } 45 | 46 | int Normalizer::TokenIndex(Utterance* utt, Token *token) const { 47 | for (int i = 0; i < utt->linguistic().tokens_size(); ++i) { 48 | const class Token *t = &(utt->linguistic().tokens(i)); 49 | if (t == token) { 50 | return i; 51 | } 52 | } 53 | return -1; 54 | } 55 | 56 | Word* Normalizer::AddWord(Utterance* utt, 57 | Token* token, 58 | const string& spelling) const { 59 | Word* word = utt->mutable_linguistic()->add_words(); 60 | int word_index = utt->linguistic().words_size() - 1; 61 | if (!token->has_first_daughter() || token->first_daughter() == -1) { 62 | token->set_first_daughter(word_index); 63 | } 64 | token->set_last_daughter(word_index); 65 | word->set_parent(TokenIndex(utt, token)); 66 | word->set_spelling(spelling); 67 | word->set_id(spelling); 68 | return word; 69 | } 70 | 71 | // Similar to Kestrel, but without the lexicon().ContainsWordId(spelling) logic, 72 | // which we want to shunt to later processing. 73 | // We assume that if someone puts a "," in the verbalization grammar, they mean 74 | // for this to represent a phrase boundary, so we add in logic here fore that. 75 | 76 | Word* Normalizer::AddWords(Utterance* utt, Token* token, 77 | const string& words) const { 78 | std::vector word_names = SplitString(words, " \t\n"); 79 | Word* word = NULL; 80 | 81 | for (int i = 0; i < word_names.size(); ++i) { 82 | if (word_names[i] == ",") 83 | word = AddWord(utt, token, "sil"); 84 | else 85 | word = AddWord(utt, token, word_names[i]); 86 | } 87 | return word; // return last word added.
// AddWords returns NULL when `words` splits to nothing. CleanFields: clears
// bookkeeping fields from a token copy before serialization. InputSubstring:
// inclusive [left, right] slice of input_, "" when out of range.
// LinearizeWords: space-joined word spellings. ShowLinks: tab-separated
// debug dump of tokens (name, start/end index, daughter span) and words
// (spelling, parent token index).
88 | } 89 | 90 | void Normalizer::CleanFields(Token* markup) const { 91 | markup->clear_first_daughter(); 92 | markup->clear_last_daughter(); 93 | markup->clear_type(); 94 | markup->clear_skip(); 95 | markup->clear_next_space(); 96 | markup->clear_phrase_break(); 97 | markup->clear_start_index(); 98 | markup->clear_end_index(); 99 | markup->clear_name(); 100 | } 101 | 102 | string Normalizer::InputSubstring(int left, int right) const { 103 | if (left < 0 || right >= input_.size() || left > right) return ""; 104 | return input_.substr(left, right - left + 1); 105 | } 106 | 107 | string Normalizer::LinearizeWords(Utterance* utt) const { 108 | string output; 109 | for (int i = 0; i < utt->linguistic().words_size(); ++i) { 110 | if (i) output.append(" "); 111 | output.append(utt->linguistic().words(i).spelling()); 112 | } 113 | return output; 114 | } 115 | 116 | string Normalizer::ShowLinks(Utterance *utt) const { 117 | string output; 118 | for (int i = 0; i < utt->linguistic().tokens_size(); ++i) { 119 | output.append("Token:\t" + std::to_string(i) + "\t"); 120 | output.append(utt->linguistic().tokens(i).name() + "\t"); 121 | // Start and end positions in the input string. 122 | output.append(std::to_string(utt->linguistic().tokens(i).start_index())); 123 | output.append(","); 124 | output.append(std::to_string(utt->linguistic().tokens(i).end_index())); 125 | output.append("\t"); 126 | // First and last word daughters.
// ToString: token rendered through ProtobufSerializer::SerializeToString.
// The tail of this region is the license header of src/lib/numbers.cc.
127 | output.append(std::to_string(utt->linguistic().tokens(i).first_daughter())); 128 | output.append(","); 129 | output.append(std::to_string(utt->linguistic().tokens(i).last_daughter())); 130 | output.append("\n"); 131 | } 132 | for (int i = 0; i < utt->linguistic().words_size(); ++i) { 133 | output.append("Word:\t" + std::to_string(i) + "\t"); 134 | output.append(utt->linguistic().words(i).spelling()); 135 | output.append("\t" + std::to_string(utt->linguistic().words(i).parent())); 136 | output.append("\n"); 137 | } 138 | return output; 139 | } 140 | 141 | string Normalizer::ToString(const Token& markup) const { 142 | ProtobufSerializer serializer(&markup, NULL); 143 | return serializer.SerializeToString(); 144 | } 145 | 146 | } // namespace sparrowhawk 147 | } // namespace speech 148 | -------------------------------------------------------------------------------- /src/lib/numbers.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc.
14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | using std::string; 20 | 21 | namespace speech { 22 | namespace sparrowhawk { 23 | 24 | #define CONVERT(value, output) \ 25 | char *endptr; \ 26 | *output = strtof(value.c_str(), &endptr); \ 27 | if (errno == ERANGE) return false; \ 28 | if (endptr < value.c_str() + value.size()) return false; \ 29 | return true; 30 | 31 | bool safe_strtof(const string &value, float *output) { 32 | CONVERT(value, output); 33 | } 34 | 35 | bool safe_strtod(const string &value, double *output) { 36 | CONVERT(value, output); 37 | } 38 | 39 | bool safe_strto32(const string &value, int32 *output) { 40 | CONVERT(value, output); 41 | } 42 | 43 | bool safe_strto64(const string &value, int64 *output) { 44 | CONVERT(value, output); 45 | } 46 | 47 | #undef CONVERT 48 | 49 | } // namespace sparrowhawk 50 | } // namespace speech 51 | -------------------------------------------------------------------------------- /src/lib/regexp.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
14 | #include 15 | 16 | #include 17 | #include 18 | 19 | namespace speech { 20 | namespace sparrowhawk { 21 | 22 | Regexp::Regexp() { 23 | re_ = nullptr; 24 | nsubexp_ = -1; 25 | } 26 | 27 | Regexp::~Regexp() { 28 | Clear(); 29 | } 30 | 31 | void Regexp::Clear() { 32 | delete re_; 33 | re_ = nullptr; 34 | nsubexp_ = -1; 35 | } 36 | 37 | int Regexp::nsubexp() const { 38 | return nsubexp_; 39 | } 40 | 41 | bool Regexp::ok() const { 42 | return re_ != nullptr && re_->ok(); 43 | } 44 | 45 | bool Regexp::Compile(const string &pattern) { 46 | Clear(); 47 | RE2::Options options; 48 | options.set_longest_match(true); 49 | options.set_log_errors(false); 50 | re_ = new RE2(pattern, options); 51 | 52 | if (re_ == nullptr) { 53 | LoggerError("Error in allocating regexp \"%s\"", pattern.c_str()); 54 | return false; 55 | } 56 | if (!re_->ok()) { 57 | LoggerError("Error in allocating regexp \"%s\": %s", 58 | pattern.c_str(), 59 | re_->error().c_str()); 60 | Clear(); 61 | return false; 62 | } 63 | nsubexp_ = re_->NumberOfCapturingGroups(); 64 | return true; 65 | } 66 | 67 | bool Regexp::CheckMatch(const string &input) const { 68 | if (ok()) { 69 | return RE2::PartialMatch(input, *re_); 70 | } else { 71 | return false; 72 | } 73 | } 74 | 75 | bool Regexp::CheckFullMatch(const string &input) const { 76 | if (ok()) { 77 | return RE2::FullMatch(input, *re_); 78 | } else { 79 | return false; 80 | } 81 | } 82 | 83 | bool Regexp::CheckMatch(const string &input, const string &pattern) { 84 | return RE2::PartialMatch(input, pattern); 85 | } 86 | 87 | int Regexp::GetAllMatches(const string &input, 88 | std::vector *matches) const { 89 | if (!ok()) { 90 | return 0; 91 | } 92 | int nmatches = 0; 93 | int offset = 0; 94 | int end_pos = input.size(); 95 | matches->clear(); 96 | re2::StringPiece input_piece(input); 97 | 98 | std::unique_ptr matched_pieces(new re2::StringPiece[1 + nsubexp_]); 99 | bool result = re_->Match(input_piece, 100 | offset, 101 | end_pos, 102 | RE2::UNANCHORED, 103 | 
matched_pieces.get(), 104 | 1 + nsubexp_); 105 | RegMatch re_info; 106 | while (result) { 107 | nmatches++; 108 | re_info.sub_str.clear(); 109 | re_info.sub_start.clear(); 110 | re_info.sub_end.clear(); 111 | re_info.full_str = ""; 112 | re_info.n_sub = nsubexp_; 113 | int match_offset = matched_pieces[0].data() - input.c_str(); 114 | int match_length = matched_pieces[0].length(); 115 | 116 | re_info.start_char = match_offset; 117 | re_info.end_char = match_offset + match_length; 118 | re_info.len = match_length; 119 | re_info.full_str = matched_pieces[0].as_string(); 120 | 121 | for (int i = 1; i <= nsubexp_; ++i) { 122 | re_info.sub_str.push_back(matched_pieces[i].as_string()); 123 | int sub_match_start = matched_pieces[i].data() - input.c_str(); 124 | re_info.sub_start.push_back(sub_match_start); 125 | re_info.sub_end.push_back(sub_match_start + matched_pieces[i].length()); 126 | } 127 | 128 | matches->push_back(re_info); 129 | offset = re_info.end_char; 130 | result = re_->Match(input_piece, 131 | offset, 132 | end_pos, 133 | RE2::UNANCHORED, 134 | matched_pieces.get(), 135 | 1 + nsubexp_); 136 | } 137 | return nmatches; 138 | } 139 | 140 | } // namespace sparrowhawk 141 | } // namespace speech 142 | -------------------------------------------------------------------------------- /src/lib/rule_system.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// NOTE(review): wrapped render of src/lib/rule_system.cc; #include targets
// and template arguments (std::map<string, LookaheadFst*>,
// fst::ComposeFst<...>, fst::StringCompiler<...>, fst::StringPrinter<...>)
// were eaten by the renderer -- restore from the original source.
//
// ~RuleSystem: deletes the lazily-built lookahead FST cache. LoadGrammar:
// parses the Grammar text proto at prefix+filename, loads the FST archive,
// then verifies every rule named in the ordering exists.
10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | namespace speech { 21 | namespace sparrowhawk { 22 | 23 | using fst::LabelLookAheadRelabeler; 24 | using fst::StdArc; 25 | 26 | RuleSystem::~RuleSystem() { 27 | std::map::iterator iter; 28 | for (iter = lookaheads_.begin(); iter != lookaheads_.end(); iter++) { 29 | delete iter->second; 30 | } 31 | } 32 | 33 | bool RuleSystem::LoadGrammar(const string& filename, const string& prefix) { 34 | // This is the contents of filename. 35 | string proto_string = IOStream::LoadFileToString(prefix + filename); 36 | if (!google::protobuf::TextFormat::ParseFromString(proto_string, &grammar_)) 37 | return false; 38 | string grm_file = prefix + grammar_.grammar_file(); 39 | grammar_name_ = grammar_.grammar_name(); 40 | grm_.reset(new GrmManager); 41 | if (!grm_->LoadArchive(grm_file)) { 42 | LoggerError("Error loading archive \"%s\" from \"%s\"", 43 | grammar_name_.c_str(), grm_file.c_str()); 44 | return false; 45 | } 46 | // Verifies that the rules named in the rule ordering all exist in the 47 | // grammar.
// LoadGrammar validation covers each rule's main(), parens(), and redup()
// FSTs. ApplyRules(Transducer, MutableTransducer*, bool): applies each rule
// in grammar order; a redup rule is best-effort (its Rewrite failure is
// ignored) and, when it succeeds, a doubled copy is unioned into the input.
48 | for (int i = 0; i < grammar_.rules_size(); ++i) { 49 | Rule rule = grammar_.rules(i); 50 | if (grm_->GetFst(rule.main()) == NULL) { 51 | LoggerError("Rule \"%s\" not found in \"%s\"", 52 | rule.main().c_str(), grammar_name_.c_str()); 53 | return false; 54 | } 55 | if (rule.has_parens() && grm_->GetFst(rule.parens()) == NULL) { 56 | LoggerError("Rule \"%s\" not found in \"%s\"", 57 | rule.parens().c_str(), grammar_name_.c_str()); 58 | return false; 59 | } 60 | if (rule.has_redup() && grm_->GetFst(rule.redup()) == NULL) { 61 | LoggerError("Rule \"%s\" not found in \"%s\"", 62 | rule.redup().c_str(), grammar_name_.c_str()); 63 | return false; 64 | } 65 | } 66 | return true; 67 | } 68 | 69 | bool RuleSystem::ApplyRules(const Transducer& input, 70 | MutableTransducer* output, 71 | bool use_lookahead) const { 72 | MutableTransducer mutable_input(input); 73 | for (int i = 0; i < grammar_.rules_size(); ++i) { 74 | Rule rule = grammar_.rules(i); 75 | if (rule.has_redup()) { 76 | const string& redup_rule = rule.redup(); 77 | MutableTransducer redup1; 78 | // Not an error if it fails. 79 | if (grm_->Rewrite(redup_rule, mutable_input, &redup1, "")) { 80 | MutableTransducer redup2(redup1); 81 | fst::Concat(redup1, &redup2); 82 | fst::Union(&mutable_input, redup2); 83 | fst::RmEpsilon(&mutable_input); 84 | } 85 | } 86 | const string& rule_name = rule.main(); 87 | string parens_rule = rule.has_parens() ?
// Lookahead composition is used only for rules without parens (i.e. not
// (M)PDTs); the lookahead FST is built on first use and cached in
// lookaheads_. NOTE(review): lookaheads_ is mutated inside a const method --
// presumably declared mutable in the header; this is not thread-safe --
// confirm. An empty result (NumStates() == 0) is treated as rule failure,
// and each rule's output feeds the next rule's input.
rule.parens() : ""; 88 | // Only use lookahead on non (M)PDT's 89 | bool success = true; 90 | if (parens_rule.empty() 91 | && use_lookahead) { 92 | std::map::iterator iter = 93 | lookaheads_.find(rule_name); 94 | LookaheadFst *lookahead_rule_fst; 95 | if (iter == lookaheads_.end()) { 96 | const Transducer *rule_fst = grm_->GetFst(rule_name); 97 | lookahead_rule_fst = new LookaheadFst(*rule_fst); 98 | lookaheads_[rule_name] = lookahead_rule_fst; 99 | } else { 100 | lookahead_rule_fst = iter->second; 101 | } 102 | LabelLookAheadRelabeler::Relabel(&mutable_input, 103 | *lookahead_rule_fst, 104 | false); 105 | fst::ComposeFst tmp_output(mutable_input, 106 | *lookahead_rule_fst); 107 | *output = tmp_output; 108 | if (output->NumStates() == 0) { 109 | success = false; 110 | } 111 | // Otherwise we just use the regular rewrite mechanism 112 | } else if (!grm_->Rewrite(rule_name, 113 | mutable_input, 114 | output, 115 | parens_rule 116 | ) 117 | || output->NumStates() == 0) { 118 | success = false; 119 | } 120 | if (!success) { 121 | LoggerError("Application of rule \"%s\" failed", rule_name.c_str()); 122 | return false; 123 | } 124 | mutable_input = *output; 125 | } 126 | // NB: We do NOT want to Project in this case because this will be the input 127 | // to the ProtobufParser, which needs the input-side epsilons in order to keep 128 | // track of positions in the input.
// String-facing overloads: compile the input as a byte-mode FST, apply the
// rules, then shortest-path / project-to-output / remove epsilons / print.
// FindRule: direct lookup of a rule FST by name in the grammar archive.
129 | fst::RmEpsilon(output); 130 | return true; 131 | } 132 | 133 | typedef fst::StringCompiler Compiler; 134 | typedef fst::StringPrinter Printer; 135 | 136 | bool RuleSystem::ApplyRules(const string& input, 137 | string* output, 138 | bool use_lookahead) const { 139 | Compiler compiler(fst::StringTokenType::BYTE); 140 | MutableTransducer input_fst, output_fst; 141 | if (!compiler.operator()(input, &input_fst)) { 142 | LoggerError("Failed to compile input string \"%s\"", input.c_str()); 143 | return false; 144 | } 145 | if (!ApplyRules(input_fst, &output_fst, use_lookahead)) return false; 146 | MutableTransducer shortest_path; 147 | fst::ShortestPath(output_fst, &shortest_path); 148 | fst::Project(&shortest_path, fst::PROJECT_OUTPUT); 149 | fst::RmEpsilon(&shortest_path); 150 | Printer printer(fst::StringTokenType::BYTE); 151 | if (!printer.operator()(shortest_path, output)) { 152 | LoggerError("Failed to print output string"); 153 | return false; 154 | } 155 | return true; 156 | } 157 | 158 | bool RuleSystem::ApplyRules(const Transducer& input, 159 | string* output, 160 | bool use_lookahead) const { 161 | MutableTransducer output_fst; 162 | if (!ApplyRules(input, &output_fst, use_lookahead)) return false; 163 | MutableTransducer shortest_path; 164 | fst::ShortestPath(output_fst, &shortest_path); 165 | fst::Project(&shortest_path, fst::PROJECT_OUTPUT); 166 | fst::RmEpsilon(&shortest_path); 167 | Printer printer(fst::StringTokenType::BYTE); 168 | if (!printer.operator()(shortest_path, output)) { 169 | LoggerError("Failed to print to output string"); 170 | return false; 171 | } 172 | return true; 173 | } 174 | 175 | const Transducer* RuleSystem::FindRule(const string& name) const { 176 | return grm_->GetFst(name); 177 | } 178 | 179 | 180 | } // namespace sparrowhawk 181 | } // namespace speech 182 | -------------------------------------------------------------------------------- /src/lib/sentence_boundary.cc:
-------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | #include 20 | using std::vector; 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | namespace speech { 28 | namespace sparrowhawk { 29 | 30 | SentenceBoundary::SentenceBoundary(const string ®exp) : 31 | pad_exceptions_with_space_prefix_(true) { 32 | regexp_.reset(new Regexp); 33 | if (!regexp_->Compile(regexp)) { 34 | LoggerFatal("SentenceBoundary failed with bad regexp: %s", regexp.c_str()); 35 | } 36 | } 37 | 38 | bool SentenceBoundary::LoadSentenceBoundaryExceptions(const string &filename) { 39 | string raw = IOStream::LoadFileToString(filename); 40 | std::vector tokens = SplitString(raw, "\n", true /* skip_empty */); 41 | for (auto token : tokens) { 42 | token = StripWhitespace(token); 43 | // Having it as an unordered list is of course not very efficient for 44 | // search, but we do not expect these lists to be very long. 45 | // We pad with a space before it since most scripts that use end-of-sentence 46 | // markers ambiguously to denote abbreviations also use spaces to delimit 47 | // words. 48 | // TODO(rws): We should extend this to regexps to handle things like German 49 | // ordinals. 
50 | if (pad_exceptions_with_space_prefix_) 51 | sentence_boundary_exceptions_.push_back(" " + token); 52 | } 53 | return true; 54 | } 55 | 56 | std::vector SentenceBoundary::ExtractSentences( 57 | const string &input_text) const { 58 | std::vector potentials; 59 | regexp_->GetAllMatches(input_text, &potentials); 60 | std::vector cutpoints; 61 | int last = 0, i; 62 | for (i = 0; i < potentials.size(); ++i) { 63 | const int start = potentials[i].start_char; 64 | const int end = potentials[i].end_char; 65 | const string text_before = input_text.substr(last, start - last); 66 | const string marker = input_text.substr(start, end - start); 67 | const string text_after = input_text.substr(end); 68 | if (EvaluateCandidate(text_before, marker)) { 69 | cutpoints.push_back(end); 70 | last = end; 71 | } 72 | } 73 | std::vector result; 74 | last = 0; 75 | string sentence; 76 | for (int i = 0; i < cutpoints.size(); ++i) { 77 | sentence = StripWhitespace(input_text.substr(last, cutpoints[i] - last)); 78 | if (!sentence.empty()) result.push_back(sentence); 79 | last = cutpoints[i]; 80 | } 81 | sentence = StripWhitespace(input_text.substr(last)); 82 | if (!sentence.empty()) result.push_back(sentence); 83 | return result; 84 | } 85 | 86 | bool SentenceBoundary::EvaluateCandidate(const string &input_text, 87 | const string &marker) const { 88 | // Gets the previous sentence and the marker, minus any trailing whitespace. 89 | string previous = StripWhitespace(input_text + marker); 90 | int previous_length = previous.size(); 91 | for (const auto &exception : sentence_boundary_exceptions_) { 92 | int length = exception.size(); 93 | if (length <= previous_length && 94 | previous.substr(previous_length - length, length) == exception) 95 | return false; 96 | // If the exception starts with a space because we have added one, then also 97 | // check to see if this was the first token --- i.e. matches the entire 98 | // previous "sentence". 
99 | if (pad_exceptions_with_space_prefix_) { 100 | string stripped_exception = StripWhitespace(exception); 101 | if (previous == stripped_exception) return false; 102 | } 103 | } 104 | return true; 105 | } 106 | 107 | } // namespace sparrowhawk 108 | } // namespace speech 109 | -------------------------------------------------------------------------------- /src/lib/spec_serializer.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 
// NOTE(review): wrapped render of src/lib/spec_serializer.cc; #include
// targets and template arguments (std::unique_ptr<Serializer>,
// std::vector<std::unique_ptr<StyleSerializer>>) were eaten by the renderer.
//
// Serializer::Create: for each ClassSpec, resolves semiotic_class against
// the Token descriptor's field names (nullptr on an unknown field) and
// builds one StyleSerializer per StyleSpec (nullptr if any fails);
// serializers_ is keyed by the resolved FieldDescriptor.
14 | #include 15 | 16 | #include 17 | #include 18 | using std::vector; 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | namespace speech { 27 | namespace sparrowhawk { 28 | 29 | using google::protobuf::Descriptor; 30 | using google::protobuf::FieldDescriptor; 31 | using google::protobuf::Reflection; 32 | 33 | namespace { 34 | 35 | typedef Serializer::MutableTransducer MutableTransducer; 36 | const char kClassSeparator[] = "|"; 37 | 38 | } // namespace 39 | 40 | std::unique_ptr Serializer::Create( 41 | const SerializeSpec &serialize_spec) { 42 | std::unique_ptr serializer(new Serializer()); 43 | const Descriptor *token_descriptor = Token::descriptor(); 44 | for (const ClassSpec &class_spec : serialize_spec.class_spec()) { 45 | const FieldDescriptor *class_descriptor = 46 | token_descriptor->FindFieldByName(class_spec.semiotic_class()); 47 | if (class_descriptor == nullptr) { 48 | LOG(ERROR) << "Cannot find " << class_spec.semiotic_class() 49 | << " field in Token proto"; 50 | return nullptr; 51 | } 52 | std::vector> &styles = 53 | serializer->serializers_[class_descriptor]; 54 | for (const StyleSpec &style_spec : class_spec.style_spec()) { 55 | auto style_serializer = StyleSerializer::Create(style_spec); 56 | if (style_serializer) { 57 | styles.push_back(std::move(style_serializer)); 58 | } else { 59 | return nullptr; 60 | } 61 | } 62 | } 63 | return serializer; 64 | } 65 | 66 | MutableTransducer Serializer::Serialize(const Token &token) const { 67 | MutableTransducer fst; 68 | const Reflection *reflection = token.GetReflection(); 69 | for (const auto &candidate_class : serializers_) { 70 | if (reflection->HasField(token, candidate_class.first)) { 71 | string_compiler_(candidate_class.first->name() + kClassSeparator, 72 | &fst); 73 | MutableTransducer fst_styles; 74 | for (const auto &candidate_style : candidate_class.second) { 75 | MutableTransducer fst_style; 76 | fst_style.SetStart(fst_style.AddState()); 77 |
// Serialize: for each class field set on the token, compiles
// "<field-name>|" into the result FST, then concatenates the union of all
// successful style serializations (a style whose Serialize returns false is
// silently skipped). The tail of this region is the license header of
// src/lib/string_utils.cc.
fst_style.SetFinal(0, 1); 78 | if (candidate_style->Serialize(token, &fst_style)) { 79 | Union(&fst_styles, fst_style); 80 | } 81 | } 82 | Concat(&fst, fst_styles); 83 | } 84 | } 85 | return fst; 86 | } 87 | 88 | } // namespace sparrowhawk 89 | } // namespace speech 90 | -------------------------------------------------------------------------------- /src/lib/string_utils.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc.
14 | #include 15 | 16 | #include 17 | using std::string; 18 | #include 19 | using std::vector; 20 | 21 | namespace speech { 22 | namespace sparrowhawk { 23 | 24 | std::vector SplitString(const string &s, const string &delims) { 25 | return SplitString(s, delims, false); 26 | } 27 | 28 | std::vector SplitString(const string &s, 29 | const string &delims, 30 | bool skip_empty) { 31 | std::vector out; 32 | if (s.empty()) { 33 | return out; 34 | } 35 | 36 | string::size_type len = s.length(), i = 0, pos = 0; 37 | do { 38 | if ((i = s.find_first_of(delims, pos)) == string::npos) { 39 | string substring = s.substr(pos); 40 | if (skip_empty && substring.empty()) continue; 41 | out.push_back(substring); 42 | } else { 43 | if (pos != i) { 44 | string substring = s.substr(pos, i - pos); 45 | if (skip_empty && substring.empty()) continue; 46 | out.push_back(substring); 47 | } 48 | pos = i + 1; 49 | } 50 | } while (i != string::npos && pos < len); 51 | return out; 52 | } 53 | 54 | string StripWhitespace(const string &s) { 55 | int start = s.find_first_not_of(" \t\n"); 56 | if (start == string::npos) return ""; 57 | int end = s.find_last_not_of(" \t\n"); 58 | return s.substr(start, end - start + 1); 59 | } 60 | 61 | } // namespace sparrowhawk 62 | } // namespace speech 63 | -------------------------------------------------------------------------------- /src/lib/style_serializer.cc: -------------------------------------------------------------------------------- 1 | // Licensed under the Apache License, Version 2.0 (the "License"); 2 | // you may not use this file except in compliance with the License. 3 | // You may obtain a copy of the License at 4 | // 5 | // http://www.apache.org/licenses/LICENSE-2.0 6 | // 7 | // Unless required by applicable law or agreed to in writing, software 8 | // distributed under the License is distributed on an "AS IS" BASIS, 9 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
10 | // See the License for the specific language governing permissions and 11 | // limitations under the License. 12 | // 13 | // Copyright 2015 and onwards Google, Inc. 14 | #include 15 | 16 | #include 17 | #include 18 | using std::string; 19 | #include 20 | using std::vector; 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace speech { 32 | namespace sparrowhawk { 33 | 34 | using google::protobuf::Descriptor; 35 | using google::protobuf::FieldDescriptor; 36 | using google::protobuf::Reflection; 37 | using google::protobuf::TextFormat; 38 | using google::protobuf::Message; 39 | 40 | bool StyleSerializer::CreateRecordSerializers( 41 | const StyleSpec &style_spec, 42 | const std::unique_ptr &style_serializer) { 43 | for (const RecordSpec &record_spec : style_spec.record_spec()) { 44 | auto record_serializer = RecordSerializer::Create(record_spec); 45 | if (record_serializer) { 46 | style_serializer->record_serializers_.push_back( 47 | std::move(record_serializer)); 48 | } else { 49 | return false; 50 | } 51 | } 52 | return true; 53 | } 54 | 55 | bool StyleSerializer::SetRequiredFieldPaths( 56 | const StyleSpec &style_spec, 57 | const std::unique_ptr &style_serializer) { 58 | const Descriptor *token_descriptor = Token::descriptor(); 59 | for (const string &required_fields : style_spec.required_fields()) { 60 | std::vector any_of; 61 | for (const auto &required_field : 62 | SplitString(required_fields, "|")) { 63 | std::unique_ptr field_path = 64 | FieldPath::Create(token_descriptor); 65 | any_of.push_back(*field_path); 66 | if (!any_of.back().Parse(required_field)) { 67 | LOG(ERROR) << "FieldPath failed to parse for required field: " 68 | << required_field; 69 | return false; 70 | } 71 | } 72 | style_serializer->required_fields_.push_back(std::move(any_of)); 73 | } 74 | return true; 75 | } 76 | 77 | bool StyleSerializer::SetProhibitedFieldPaths( 78 | const StyleSpec &style_spec, 79 
| const std::unique_ptr &style_serializer) { 80 | const Descriptor *token_descriptor = Token::descriptor(); 81 | for (const string &prohibited_field : style_spec.prohibited_fields()) { 82 | std::vector &prohibited_fields = 83 | style_serializer->prohibited_fields_; 84 | std::unique_ptr field_path = 85 | FieldPath::Create(token_descriptor); 86 | prohibited_fields.push_back(*field_path); 87 | if (!prohibited_fields.back().Parse(prohibited_field)) { 88 | LOG(ERROR) << "FieldPath failed to parse for prohibited field: " 89 | << prohibited_field; 90 | return false; 91 | } 92 | } 93 | return true; 94 | } 95 | 96 | std::unique_ptr StyleSerializer::Create( 97 | const StyleSpec &style_spec) { 98 | std::unique_ptr style_serializer(new StyleSerializer()); 99 | if (!CreateRecordSerializers(style_spec, style_serializer) || 100 | !SetRequiredFieldPaths(style_spec, style_serializer) || 101 | !SetProhibitedFieldPaths(style_spec, style_serializer)) { 102 | return nullptr; 103 | } 104 | return style_serializer; 105 | } 106 | 107 | bool StyleSerializer::IsFieldSet(const Message &root, 108 | const FieldPath &field_path) const { 109 | const Message *parent; 110 | const FieldDescriptor *field; 111 | if (!field_path.Follow(root, &parent, &field)) { 112 | LOG(ERROR) << "FieldPath traversal failed for input Message " 113 | << root.DebugString(); 114 | return false; 115 | } 116 | const Reflection *parent_reflection = parent->GetReflection(); 117 | if (field->label() == FieldDescriptor::LABEL_REPEATED) { 118 | // The field is assumed to be a scalar here. 
119 | if (parent_reflection->FieldSize(*parent, field) == 0) { 120 | return false; 121 | } 122 | } else if (!parent_reflection->HasField(*parent, field)) { 123 | return false; 124 | } 125 | return true; 126 | } 127 | 128 | bool StyleSerializer::CheckRequiredFields(const Token &token) const { 129 | for (const std::vector &field_paths : required_fields_) { 130 | bool found = false; 131 | for (const FieldPath &field_path : field_paths) { 132 | if (IsFieldSet(token, field_path)) { 133 | found = true; 134 | break; 135 | } 136 | } 137 | if (!found) { 138 | return false; 139 | } 140 | } 141 | return true; 142 | } 143 | 144 | bool StyleSerializer::CheckProhibitedFields(const Token &token) const { 145 | for (const FieldPath &field_path : prohibited_fields_) { 146 | if (IsFieldSet(token, field_path)) { 147 | return false; 148 | } 149 | } 150 | return true; 151 | } 152 | 153 | bool StyleSerializer::Serialize(const Token &token, 154 | MutableTransducer *serialization) const { 155 | if (!CheckRequiredFields(token) || !CheckProhibitedFields(token)) { 156 | return false; 157 | } 158 | for (const auto &record_serializer : record_serializers_) { 159 | if (!record_serializer->Serialize(token, serialization)) { 160 | LOG(ERROR) << "Record serialization failure for token " + token.name(); 161 | return false; 162 | } 163 | } 164 | return true; 165 | } 166 | 167 | 168 | } // namespace sparrowhawk 169 | } // namespace speech 170 | -------------------------------------------------------------------------------- /src/proto/Makefile.am: -------------------------------------------------------------------------------- 1 | dist_noinst_DATA = items.proto \ 2 | links.proto \ 3 | rule_order.proto \ 4 | semiotic_classes.proto \ 5 | serialization_spec.proto \ 6 | sparrowhawk_configuration.proto 7 | 8 | CC_OUT = $(srcdir)/../lib 9 | H_OUT = $(srcdir)/../include/sparrowhawk 10 | 11 | %.pb.cc %.pb.h: %.proto 12 | $(PROTOC) --proto_path=$(srcdir) --cpp_out=$(srcdir) $^ 13 | cp $*.pb.h $(H_OUT) 14 | cp 
	cp $*.pb.cc $(CC_OUT)
# NOTE(review): the leading "cp" of the recipe line above arrived on the
# previous physical line of this dump; re-joined here so the rule stays valid.

# Generated protobuf sources/headers; removed by "make mostlyclean".
MOSTLYCLEANFILES = items.pb.h items.pb.cc \
                   links.pb.h links.pb.cc \
                   rule_order.pb.h rule_order.pb.cc \
                   semiotic_classes.pb.h semiotic_classes.pb.cc \
                   serialization_spec.pb.h serialization_spec.pb.cc \
                   sparrowhawk_configuration.pb.h sparrowhawk_configuration.pb.cc

all: $(MOSTLYCLEANFILES)

--------------------------------------------------------------------------------
/src/proto/items.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
syntax = "proto2";

// TODO(rws): Probably phase out links since we are not using it.
import "links.proto";
import "semiotic_classes.proto";

package speech.sparrowhawk;

// Message containing the contents for a single token as determined by the
// tokenizer. Roughly speaking, a token corresponds to a single verbalizable
// entity, such as a single word, or single semiotic object such as "$15.60".
message Token {
  // Describes the kind of entity this token represents.
  enum Type {
    // A known word which is present in the lexicon.
    WORD = 1;

    // A semiotic class.
    SEMIOTIC_CLASS = 2;

    // Punctuation which is not expected to be pronounced.
    PUNCT = 3;

    // A word, but requires some further verbalization work.
    // For example, Thai words with a trailing repetition character.
    WORD_NEEDS_VERBALIZATION = 4;
  }

  // General pause duration lengths.
  enum PauseLength {
    PAUSE_NONE = 0;    // No pause.
    PAUSE_SHORT = 1;   // Brief pause, eg. for brackets or quotes.
    PAUSE_MEDIUM = 2;  // Longer pause, for a comma or similar.
    PAUSE_LONG = 3;    // Longest pause, for a fullstop or phrase break.
  }

  // Structural relationships. The children are words.
  // TODO(rws): Probably phase out links since we are not using it.
  optional Links links = 1;

  // Optional information about where this token came from in the
  // original input.
  // Indices are given in Unicode codepoints (*not* byte indices).
  optional uint32 start_index = 2;
  optional uint32 end_index = 3;

  // The name of the token, which is generally the original unnormalized text
  // the token was generated from.
  //
  // Voice Building Note: This field appears on ScriptLine protos that serve as
  // input to voice building.
  optional string name = 4;

  // Basic type of the token (see enum comments).
  //
  // Voice Building Note: This field appears on ScriptLine protos that serve as
  // input to voice building.
  optional Type type = 5;

  // The wordid of the token, when a single one is known.
  // Set when type == WORD
  optional string wordid = 6;

  // If the token is a word, this represents the regular lower-cased spelling of
  // that word.
  optional string spelling = 7;

  // If true, this token is a phrase break break.
  optional bool phrase_break = 8;

  // Indicates a pause of given length, in seconds. Used when pause given from
  // markup.
  // Currently unused.
  optional float pause_duration = 9;

  // If set, indicates a general length of pause that should be introduced
  // for synthesis. For example, a fullstop would generally generate
  // a longer pause than a comma.
  // Currently unused.
  optional PauseLength pause_length = 10 [default = PAUSE_NONE];

  // This is used to store spelling with stress mark produced
  // by stress assigner or provided in input text.
  // Currently unused.
  optional string spelling_with_stress = 11;

  // If true, don't verbalize this token. Used to skip tokens that are part of a
  // multi-token semiotic class, or bypass homograph resolution when explicit
  // wordids are provided.
  optional bool skip = 12;

  // Is true if a space follows this token. E.g., after tokenization in
  // Chinese/Japanese.
  // Currently unused.
  optional bool next_space = 13;

  // All the following (fields in the range [14-27]) are used when
  // the token represents a semiotic class. In such a case, one of these
  // is filled by the output from the classifier/parser stage.
  // Alternatively, if part of the input was given as markup, it will
  // be copied from the input to these fields.
  optional Cardinal cardinal = 14;
  optional Ordinal ordinal = 15;
  optional string digit = 16;
  optional Decimal decimal = 17;
  optional Fraction fraction = 18;
  optional Time time = 19;
  optional Measure measure = 20;
  optional Decimal percent = 21;
  optional Date date = 22;
  optional Telephone telephone = 23;
  optional Money money = 24;
  optional Electronic electronic = 25;
  optional string verbatim = 26;
  optional string letters = 27;

  // Tokens defined by things they connect, for example "-" in "Mon-Fri",
  // ":" in "1:1", etc.
  optional Connector connector = 28;

  // Abbreviations, intended for languages where they may inflect depending
  // on case etc.
  optional Abbreviation abbreviation = 29;

  // Indices of the first and last words.
  optional int32 first_daughter = 30;

  optional int32 last_daughter = 31;

  extensions 1000 to max;
}

// A single word
message Word {
  // Structural relationships. The parent items are tokens.
  // TODO(rws): Probably phase out links since we are not using it.
  optional Links links = 1;

  // The id of the word, predominantly used as a key into the lexicon.
  optional string id = 2;

  // The conventional spelling of the word.
  // There can be several spellings matching one id in the lexicon
  // (e.g. colour, color correspond to the same wordid) and vice versa
  // (spelling "project" maps to ids "project_nou" and "project_vrb").
  optional string spelling = 3;

  // If set, indicates the length of pause that should be generated for
  // this word, in seconds. Only applies to the special word "sil".
  // Currently unused.
  optional float pause_length = 4;

  // True when the prosodic_features have specified that there should (value
  // true) or should not (value false) be a pause just after this word, either
  // because contains_pause was specified in an utterance in which this was the
  // penultimate word, or because precedes_pause was specified in an utterance
  // in which this was the last word.
  // Currently unused.
  optional bool precedes_pause = 5;

  // Parent token
  optional int32 parent = 6;

  extensions 1000 to max;

}

// A single utterance's linguistic structure
message LinguisticStructure {
  // ID uniquely identifying this utterance. If used in asynchronous mode
  // the utterance IDs can be used to match multiply emitted utterances
  // generated from a single source. 64-bit integer is used to
  // accommodate utterance ID as a timestamp.
  optional int64 id = 1;
  optional string input = 2;  // The original sentence.
  repeated Token tokens = 3;
  repeated Word words = 4;

  extensions 1000 to max;
}

// An utterance
message Utterance {
  // An arbitrary identifier used to identify the utterance for debugging
  // purposes. The controller assigns this id internally when it creates an
  // utterance, and the id is unique (with high probability) within the process.
  // Currently unused.
  optional uint64 id = 1;

  // If loaded from file, the filename (usually without a path). Mainly intended
  // as a human-readable identifier for debugging purposes.
  // Currently unused.
  optional string filename = 2;

  // This field can be mutated by various text pre-processing streams, such as
  // character segmenters and text filters.
  // Currently unused.
  optional string sentence = 3;

  // Copy of the original sentence that is guaranteed not to be changed by the
  // pipeline.
  // Currently unused.
  optional string original_sentence = 4;

  // If segmentation was applied on the original sentences, the following field
  // will contain the results of the segmentation. Each string corresponds to
  // an individual sentence.
  // Currently unused.
  repeated string segmenter_output = 5;

  // Linguistic streams, words, tokens etc.
  optional LinguisticStructure linguistic = 6;

  extensions 1000 to max;
}

--------------------------------------------------------------------------------
/src/proto/links.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// Defines the relationship between items in the linguistic structure and other
// parts of the utterance, maintaining hierarchies such as syllables being
// parents of phonemes, words being parents of syllables etc.
//
// Unless otherwise noted, we use global 0-based indices within an utterance.
// For example, the 20th phoneme in the utterance will have index 19, even
// though it may be the 1st phoneme in its syllable.

syntax = "proto2";

package speech.sparrowhawk;

message Links {
  // The index of this entity; mainly useful for debugging purposes.
  optional int32 own_index = 1;

  // The index of the parent entity of the current entity.
  optional int32 parent = 2;

  // The index of the first child of the current entity.
  optional int32 first_child = 3;

  // The index of the last child of the current entity.
  optional int32 last_child = 4;
}

--------------------------------------------------------------------------------
/src/proto/rule_order.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// Definition of ordering of rules to be applied. Each rule, and an optional
// pushdown-transducer parenthesis set, is applied (composed) to the input in
// the order specified.
//
// If a reduplication rule (also a Thrax target) is supplied, then that will
// optionally copy any matching input. This is most useful in cases where it is
// desirable to copy the entire to-be-verbalized token. For example, with money
// where one can copy an entire money token, and then read the major currency
// off the first copy and the minor currency off the second. See
//
// Ebden, Peter and Sproat, Richard. 2015. The Kestrel TTS text normalization
// system. Natural Language Engineering, Issue 03, pp 333-353.
//
// for further discussion.
//
// See the Thrax documentation at
// http://www.openfst.org/twiki/bin/view/GRM/ThraxQuickTour for discussion of
// PDTs and (coming soon) MPTDs.

syntax = "proto2";

package speech.sparrowhawk;

// A single normalization rule: the Thrax FST to compose with the input,
// plus optional PDT/MPDT companions and an optional reduplication rule
// (see the file-header comment above for how redup is used).
message Rule {
  required string main = 1;         // Main normalization rule.
  optional string parens = 2;       // Optional PDT parens.
  optional string assignments = 3;  // Optional MPDT assignments.
  optional string redup = 4;        // Optional reduplication rule.
};

// A grammar archive plus the ordered list of rules to apply from it.
message Grammar {
  required string grammar_file = 1;
  required string grammar_name = 2;  // Name for this grammar.
  repeated Rule rules = 3;
};

--------------------------------------------------------------------------------
/src/proto/serialization_spec.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// Proto messages describing specifications for serializing semiotic classes.
// These serializations determine the input to the verbalization grammars.
// TODO(drasha) consider changing the name to serialize_spec.proto for
// consistency.

syntax = "proto2";

package speech.sparrowhawk;

// Specification for serializing a sub-part of a semiotic class. RecordSpecs may
// be simple, such as a single field, or recursively combine additional
// RecordSpecs to specify more elaborate formats.
// For a repeated scalar field, we simply serialize all the values in the
// token for this field in an identical fashion, respecting the original
// order.
// NB. Assumes there are no repeated embedded messages in semiotic_classes.proto
message RecordSpec {
  // The serialization for these RecordSpecs will be emitted prior to every
  // instance of the main field for this spec.
  repeated RecordSpec prefix_spec = 1;

  // The serialization for these RecordSpecs will be emitted after every
  // instance of the main field for this spec.
  repeated RecordSpec suffix_spec = 2;

  // Field serialization specification: the fields below are used to include a
  // value from the input proto in the serialization. This record will only be
  // included in the output serialization if this field is present in the input,
  // a default value is supplied, or a one_of field is given.

  // The path (from the top-level token, in proto_path.h format) to this field.
  // If the label field is not set, the terminal portion of this will be used as
  // the label in the serialized output.
  optional string field_path = 3;

  // Defines the record label in the serialization. This should be set only to
  // override the use of the terminal field name from the field path as the
  // default label.
  optional string label = 4;

  // String defining the value to be used for the field in case it is not set.
  // Note that prefix and suffix records with default values will not be
  // serialized if the parent record is missing. The default value is
  // well-defined only for singular fields and is ignored otherwise.
  optional string default_value = 5;
}

// Specification for serializing a semiotic class in a particular style.
// StyleSpecs provide required and prohibited fields to help determine the style
// to be used for verbalization.
message StyleSpec {
  // Gives the specification for how tokens should be serialized in this style.
  // The serialization components for this style will be emitted in the same
  // order as the record specs in this field.
  repeated RecordSpec record_spec = 1;

  // When more than one serialization style is used for a semiotic class, it may
  // be possible to infer that a serialization is inappropriate due to the
  // presence or absence of a particular field. The following fields provide a
  // mechanism to do this.

  // This serialization will not be emitted unless all of the fields referred to
  // here are present. A single instance can have multiple fields (separated by
  // "|") from which at least one field is required for serialization.
  repeated string required_fields = 2;

  // This serialization will not be emitted if any of the fields referred to
  // here are present.
  repeated string prohibited_fields = 3;
}

// Specification of a serialization format for a particular semiotic class.
message ClassSpec {
  // Indicates the type of token that may be serialized by this spec: those with
  // this field present, e.g. "cardinal" or "measure".
  optional /* required */ string semiotic_class = 1;

  // Denotes the style within the semiotic class. StyleSpecs augment ClassSpec
  // by enabling multiple ways of verbalizing the same semiotic class.
  repeated StyleSpec style_spec = 2;
}

// Collection of all serialization specs for a language. A single semiotic class
// may have more than one specification, and all matching serializations for
// that class will be included as paths in the output.
message SerializeSpec {
  repeated ClassSpec class_spec = 1;
}

--------------------------------------------------------------------------------
/src/proto/sparrowhawk_configuration.proto:
--------------------------------------------------------------------------------
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2015 and onwards Google, Inc.
// The SparrowhawkConfiguration stores information about the grammars, and rules
// to run.

syntax = "proto2";

package speech.sparrowhawk;

message SparrowhawkConfiguration {
  optional string tokenizer_grammar = 1;  // Tokenizer-classifier.

  optional string verbalizer_grammar = 2;

  // Regular expression for sentence boundary detector. This is a set
  // of possible end-of-sentence markers.
  optional string sentence_boundary_regexp = 3;

  // Optional file specifying tokens that end in a possible end-of-sentence
  // marker that should *not* usually induce an end-of-sentence decision
  // e.g. “Mr.”
  optional string sentence_boundary_exceptions_file = 4;

  // Optional file with SerializeSpec for verbalizer as a text proto. If the
  // field is not set, we resort to the protobuf serializer.
  optional string serialization_spec = 5;
}