├── src
    ├── build
    │   └── .gitignore
    ├── yatts_util.h
    ├── 3rdparty
    │   └── utf8
    │   │   ├── utf8.h
    │   │   ├── Makefile
    │   │   ├── unchecked.h
    │   │   ├── core.h
    │   │   └── checked.h
    ├── Makefile
    ├── Utf8Transducer.h
    ├── utf8ext.h
    ├── yatts_util.cpp
    ├── transduce.cpp
    └── Utf8Transducer.cpp
├── AUTHORS
├── COPYING
├── misc
    ├── README
    └── parser.yy.patch
├── grammars
    ├── make.sh
    ├── tester.sh
    ├── transduce.sh
    ├── README
    ├── g2p.grm
    ├── crossword.grm
    ├── definitions.grm
    ├── palatalization.grm
    ├── diphthongs.grm
    ├── syllabification.grm
    ├── vowels.grm
    ├── alphabets.grm
    ├── inflections.grm
    └── consonants.grm
├── test
    ├── README
    ├── rus_sentences.txt
    └── rus_sentences.txt.g2p
└── README.md


/src/build/.gitignore:
--------------------------------------------------------------------------------
1 | # Build by-products. gitignore reads one pattern per line and does not split
2 | # on spaces, so each file name needs its own line.
3 | Utf8Transducer.d
4 | transduce.d
5 | yatts_util.d
6 | Utf8Transducer.o
7 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Principal Contacts:
2 | 
3 | Alexis Wilpert 	<wilpert@alumni.ethz.ch>
4 | Schamai Safra
5 | 


--------------------------------------------------------------------------------
/COPYING:
--------------------------------------------------------------------------------
 1 | Licensed under the Apache License, Version 2.0 (the "License");
 2 | you may not use these files except in compliance with the License.
 3 | You may obtain a copy of the License at
 4 | 
 5 |     http://www.apache.org/licenses/LICENSE-2.0
 6 | 
 7 | Unless required by applicable law or agreed to in writing, software
 8 | distributed under the License is distributed on an "AS IS" BASIS,
 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | See the License for the specific language governing permissions and
11 | limitations under the License.
12 | 
13 | Copyright 2014 Yandex LLC
14 | 


--------------------------------------------------------------------------------
/misc/README:
--------------------------------------------------------------------------------
 1 | Before compiling the Thrax package (http://openfst.cs.nyu.edu/twiki/bin/view/GRM/Thrax),
 2 | you need to apply the patch file parser.yy.patch to the following file in the Thrax
 3 | original source code distribution:
 4 | 
 5 | /thrax-1.0.2/src/lib/main/parser.yy
 6 | 
 7 | Like this:
 8 | 
 9 | patch parser.yy parser.yy.patch
10 | 
11 | The patch is required to interpret the strings in the grammar files as UTF-8 encoded
12 | (default is otherwise BYTE).
13 | 
14 | Depending on the compiler's version or OS you are using, you might get some issues when
15 | trying to compile the program thraxcompiler. In my experience, this can be solved by
16 | changing the order in which the libraries are passed to the compiler.
17 | 


--------------------------------------------------------------------------------
/misc/parser.yy.patch:
--------------------------------------------------------------------------------
 1 | *** parser.yy.old	2012-06-23 21:00:50.000000000 +0200
 2 | --- parser.yy	2014-06-23 11:12:36.601662500 +0200
 3 | ***************
 4 | *** 376,382 ****
 5 |   
 6 |   string_fst:
 7 |     quoted_fst_string
 8 | !     { StringFstNode* node = new StringFstNode(StringFstNode::BYTE);
 9 |         node->AddArgument($1);
10 |         node->SetLine($1->getline());  // Get the line from the actual text line.
11 |         $$ = node; }
12 | --- 376,383 ----
13 |   
14 |   string_fst:
15 |     quoted_fst_string
16 | !     //{ StringFstNode* node = new StringFstNode(StringFstNode::BYTE);
17 | !     { StringFstNode* node = new StringFstNode(StringFstNode::UTF8);
18 |         node->AddArgument($1);
19 |         node->SetLine($1->getline());  // Get the line from the actual text line.
20 |         $$ = node; }
21 | 


--------------------------------------------------------------------------------
/grammars/make.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Copyright 2014 Yandex LLC
16 | # All Rights Reserved.
17 | #
18 | # Author : Alexis Wilpert
19 | 
20 | 
21 | 
22 | # Start from a clean slate: run the clean target of the previous Makefile,
23 | # then delete that Makefile, old .far archives and .stackdump files.
24 | make clean
25 | rm -f Makefile *.far *.stackdump
26 | 
27 | # Regenerate the Makefile for g2p.grm, build it, then unpack the archive.
28 | thraxmakedep g2p.grm
29 | make
30 | farextract g2p.far
31 | 


--------------------------------------------------------------------------------
/grammars/tester.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Copyright 2014 Yandex LLC
16 | # All Rights Reserved.
17 | #
18 | # Author : Alexis Wilpert
19 | 
20 | 
21 | 
22 | # Rewrite rules handed to the interactive tester, in this order.
23 | # NOTE(review): SYLL precedes CROS here, whereas transduce.sh and G2P2 in
24 | # g2p.grm apply CROS before SYLL -- confirm which order is intended.
25 | RULES="READ,INFL,PALT,DIPH,VOWL,CONS,SYLL,CROS,WRIT"
26 | 
27 | thraxrewrite-tester --input_mode=utf8 --far=g2p.far --rules="$RULES"
28 | 


--------------------------------------------------------------------------------
/grammars/transduce.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 | # Copyright 2014 Yandex LLC
16 | # All Rights Reserved.
17 | #
18 | # Author : Alexis Wilpert
19 | 
20 | 
21 | 
22 | # Run INPUT_FILE through the staged g2p FSTs; the result is written next to
23 | # the input as INPUT_FILE.g2p.
24 | if [ -z "$1" ]
25 | then
26 | 	# printf instead of "echo -e": a plain /bin/sh need not support -e.
27 | 	printf '\nUsage: transduce.sh INPUT_FILE\n\n' >&2
28 | 	exit 1
29 | fi
30 | 
31 | IN=$1
32 | OUT=$IN.g2p
33 | 
34 | rm -f transduce.stackdump
35 | # Unpack the individual FSTs from the g2p.far archive built by make.sh.
36 | farextract g2p.far
37 | 
38 | # Same stage order as G2P1 followed by G2P2 in g2p.grm. The file names are
39 | # quoted so that paths containing spaces survive word splitting.
40 | transduce -fst="\
41 | READ,\
42 | INFL,\
43 | PALT,\
44 | DIPH,\
45 | VOWL,\
46 | CONS,\
47 | CROS,\
48 | SYLL,\
49 | WRIT\
50 | " "$IN" > "$OUT"
51 | 


--------------------------------------------------------------------------------
/src/yatts_util.h:
--------------------------------------------------------------------------------
 1 | /* Licensed under the Apache License, Version 2.0 (the "License");
 2 |  * you may not use this file except in compliance with the License.
 3 |  * You may obtain a copy of the License at
 4 |  *
 5 |  *     http://www.apache.org/licenses/LICENSE-2.0
 6 |  *
 7 |  * Unless required by applicable law or agreed to in writing, software
 8 |  * distributed under the License is distributed on an "AS IS" BASIS,
 9 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 |  * See the License for the specific language governing permissions and
11 |  * limitations under the License.
12 |  *
13 |  * Copyright 2014 Yandex LLC
14 |  * All Rights Reserved.
15 |  *
16 |  * Author : Schamai Safra
17 |  *
18 |  *
19 |  * yatts_util.h
20 |  */
21 | 
22 | 
23 | 
24 | #include <utf8/utf8.h>
25 | #include <vector>
26 | #include <string>
27 | 
28 | #ifndef YATTS_UTIL_H_
29 | #define YATTS_UTIL_H_
30 | 
31 | int to_uint(char const *s);
32 | 
33 | std::vector<std::string> tokenize_utf8_string( std::string* utf8_string, std::string* delimiter, int limit = 0 );
34 | 
35 | // Joins the elements of [begin, end) into one T, placing a copy of t
36 | // between consecutive elements; an empty range yields a default T.
37 | template <class T, class A>
38 | T join(const A &begin, const A &end, const T &t)
39 | {
40 |   T joined;
41 |   bool first = true;
42 |   for (A cur = begin; cur != end; ++cur, first = false) {
43 |     if (!first) joined.append(t);
44 |     joined.append(*cur);
45 |   }
46 |   return joined;
47 | }
48 | 
49 | 
50 | #endif /* YATTS_UTIL_H_ */
51 | 


--------------------------------------------------------------------------------
/grammars/README:
--------------------------------------------------------------------------------
 1 | To compile the rules and generate the final FSTs, just run ./make.sh. The
 2 | exported FSTs are defined in the grammar file g2p.grm. The two FST files
 3 | G2P1 and G2P2 are defined for being used with the Python transcriber
 4 | script. They are split in two parts to minimize size on disk, but you may
 5 | just export a single FST file, if you wish.
 6 | 
 7 | After successful compilation, you can test the rules using either:
 8 | 
 9 | - tester.sh: a very simple interactive loop. Just write in or paste the
10 |   words/sentences you want to test.
11 | 
12 | - transduce.sh: which will transcribe a file with words or sentences
13 |   given as input.
14 | 
15 | Please take the following into account:
16 | 
17 | - you will only be able to test words or sentences that have previously
18 |   been normalized. "Normalization" means in this case that the set of
19 |   characters used in the input string must be contained in the input
20 |   alphabet defined in the first FST (in_feeder in alphabets.grm). This
21 |   is done already when you use scripts/tts_transcriber.py
22 | 
23 | - second, the strings should contain a stress marker (the "+" char) for
24 |   optimal accuracy. This information comes either from the exceptions
25 |   lexicon or from the stress prediction model.
26 | 
27 | Thanks:
28 | 
29 | The implementation of the rules would not have been possible without the
30 | Russian language advice from my colleague at Yandex Anastasiya Polkanova.
31 | 
32 | Sources:
33 | 
34 | - Chew, Peter A. (2003): A Computational Phonology of Russian. Dissertation.com
35 | - Jones, Daniel & Ward, Dennis (1969): The Phonetics of Russian. Cambridge University Press
36 | 


--------------------------------------------------------------------------------
/src/3rdparty/utf8/utf8.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2006 Nemanja Trifunovic
 2 | 
 3 | /*
 4 | Permission is hereby granted, free of charge, to any person or organization
 5 | obtaining a copy of the software and accompanying documentation covered by
 6 | this license (the "Software") to use, reproduce, display, distribute,
 7 | execute, and transmit the Software, and to prepare derivative works of the
 8 | Software, and to permit third-parties to whom the Software is furnished to
 9 | do so, all subject to the following:
10 | 
11 | The copyright notices in the Software and this entire statement, including
12 | the above license grant, this restriction and the following disclaimer,
13 | must be included in all copies of the Software, in whole or in part, and
14 | all derivative works of the Software, unless such copies or derivative
15 | works are solely in the form of machine-executable object code generated by
16 | a source language processor.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 | DEALINGS IN THE SOFTWARE.
25 | */
26 | 
27 | 
28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
30 | 
31 | #include "utf8/checked.h"
32 | #include "utf8/unchecked.h"
33 | 
34 | #endif // header guard
35 | 


--------------------------------------------------------------------------------
/src/3rdparty/utf8/Makefile:
--------------------------------------------------------------------------------
 1 | ### Makefile --- 
 2 | 
 3 | ## Author:
 4 | ## Keywords: 
 5 | ## X-URL: 
 6 | # NOTE(review): this targets a JSON library, yet no json/ sources sit beside this file; it looks like a leftover -- confirm.
 7 | TARGETS=../libs/libjson.a
 8 | OBJS=json/json.o
 9 | 
10 | 
11 | ########################################################################
12 | # Macro definitions for "standard" C and C++ compilations
13 | ## 
14 | #CPPFLAGS=-g3 -O0 -fprofile-arcs -ftest-coverage -Ijson -fPIC
15 | CPPFLAGS=-g3 -O0   -Ijson -fPIC
16 | #
17 | CFLAGS=-g
18 | # 
19 | #  What is the name of the program you want to create?  (See below for notes
20 | #     on using this makefile to generate multiple programs.)
21 | #
22 | LINK=g++ $(CPPFLAGS)
23 | #LINK=gcc $(CFLAGS)
24 | # 
25 | #  Define special linkage flags.  Usually, these are used to include
26 | #  special libraries of code, e.g., -lm to add the library of mathematical
27 | #  routines such as sqrt, sin, cos, etc.
28 | LFLAGS=-lm
29 | #
30 | #
31 | #
32 | #  In most cases, you should not change anything below this line.
33 | #
34 | #  The following is "boilerplate" to set up the standard compilation
35 | #  commands:
36 | #
37 | .SUFFIXES:
38 | .SUFFIXES: .d .o .h .c .cc .C .cpp
39 | .c.o: ; $(CC) $(CFLAGS) -MMD -c $*.c
40 | .cc.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cc 
41 | .C.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.C
42 | .cpp.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cpp -o $@
43 | 
44 | CC=gcc
45 | CPP=g++
46 | 
47 | %.d: %.c
48 | 	touch $@
49 | %.d: %.cc
50 | 	touch $@
51 | %.d: %.C
52 | 	touch $@
53 | %.d: %.cpp
54 | 	touch $@
55 | 
56 | DEPENDENCIES = $(OBJS:.o=.d)
57 | 
58 | # 
59 | # Targets:
60 | # 
61 | all: $(TARGETS)
62 | 
63 | clean:
64 | 	-rm -f $(TARGETS) $(DEPENDENCIES) $(OBJS) make.dep
65 | 
66 | ../libs/libjson.a: json/json.o
67 | 	ar rcs $@ $^
68 | 
69 | make.dep: $(DEPENDENCIES)
70 | 	-cat $(DEPENDENCIES) > make.dep
71 | 
72 | include make.dep
73 | 
74 | ### Makefile ends here
75 | 


--------------------------------------------------------------------------------
/grammars/g2p.grm:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | #
13 | # Copyright 2014 Yandex LLC
14 | # All Rights Reserved.
15 | #
16 | # Author : Alexis Wilpert
17 | 
18 | 
19 | 
20 | import 'alphabets.grm' as alphabets;
21 | 
22 | import 'palatalization.grm' as palatal;
23 | import 'inflections.grm' as infl;
24 | import 'diphthongs.grm' as diphthongs;
25 | import 'vowels.grm' as vowels;
26 | import 'consonants.grm' as consonants;
27 | import 'syllabification.grm' as syll;
28 | import 'crossword.grm' as cross;
29 | 
30 | # Individual stages, exported one by one; tester.sh and transduce.sh refer to them by these names.
31 | export READ = Optimize[alphabets.in_feeder];
32 | export INFL = Optimize[infl.inflections];
33 | export PALT = Optimize[palatal.palatalization];
34 | export DIPH = Optimize[diphthongs.diphthongs];
35 | export VOWL = Optimize[vowels.reduced];
36 | export CONS = Optimize[consonants.consonant_rules];
37 | export CROS = Optimize[cross.crossword];
38 | export SYLL = Optimize[syll.syllabified];
39 | export WRIT = Optimize[alphabets.out_feeder];
40 | # First half of the pipeline composed into a single FST (in_feeder through vowel reduction, then out_feeder).
41 | export G2P1 =  Optimize[alphabets.in_feeder        @
42 |                         infl.inflections           @
43 |                         palatal.palatalization     @
44 |                         diphthongs.diphthongs      @
45 |                         vowels.reduced             @
46 |                         alphabets.out_feeder
47 |                        ];
48 | # Second half of the pipeline; it starts again at in_feeder and ends at out_feeder.
49 | export G2P2 =  Optimize[alphabets.in_feeder        @
50 |                         consonants.consonant_rules @
51 |                         cross.crossword            @
52 |                         syll.syllabified           @
53 |                         alphabets.out_feeder
54 |                        ];
55 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | #
13 | # Copyright 2014 Yandex LLC
14 | # All Rights Reserved.
15 | #
16 | # Author : Schamai Safra
17 | #
18 | #
19 | # Makefile
20 | 
21 | 
22 | 
23 | TARGETS=transduce
24 | 
25 | ### transduce: stand-alone transducer tool (UTF8 characters as input labels)
26 | OBJS_transduce=transduce.o yatts_util.o Utf8Transducer.o
27 | LIB_transduce = fst dl m rt
28 | 
29 | LIBS=../libs  /usr/local/lib
30 | 
31 | TARGET_LIBS = ${LIB_${1}:%=-l%}
32 | TARGET_OBJS = ${OBJS_${1}:%=%}
33 | OBJS=${foreach target,${TARGETS:%=OBJS_%},${${target}}}
34 | 
35 | ########################################################################
36 | # Macro definitions for "standard" C and C++ compilations
37 | #
38 | CPPFLAGS= -g3 -O0 -std=c++11 -I../include -I./3rdparty -I/usr/include -fPIC
39 | #
40 | CFLAGS=-g
41 | #
42 | LINK=g++ $(CPPFLAGS)
43 | 
44 | LFLAGS=$(LIBS:%=-L%)
45 | #
46 | .SUFFIXES:
47 | .SUFFIXES: .d .o .h .c .cc .C .cpp
48 | .c.o: ; $(CC) $(CFLAGS) -MMD -c $*.c
49 | .cc.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cc
50 | .C.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.C
51 | .cpp.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cpp
52 | 
53 | CC=gcc
54 | CPP=g++
55 | 
56 | %.d: %.c
57 | 	touch $@
58 | %.d: %.cc
59 | 	touch $@
60 | %.d: %.C
61 | 	touch $@
62 | %.d: %.cpp
63 | 	touch $@
64 | 
65 | DEPENDENCIES = $(OBJS:.o=.d)
66 | 
67 | #
68 | # Targets:
69 | #
70 | all: $(TARGETS:%=./build/%)
71 | 
72 | .SECONDEXPANSION:
73 | ./build/%: $$(OBJS_%)
74 | 	echo dependencies $^
75 | 	$(LINK) $(FLAGS) -o $@ $^ $(LFLAGS) $(call TARGET_LIBS,$*)
76 | # Delete the linked tools (the 'all' target puts them in ./build/) and their object files.
77 | cleantargets:
78 | 	-rm -f $(TARGETS:%=./build/%) $(OBJS)
79 | 
80 | clean: cleantargets
81 | 	-rm -f $(DEPENDENCIES)
82 | 
83 | ### Makefile ends here
84 | 
85 | 


--------------------------------------------------------------------------------
/src/Utf8Transducer.h:
--------------------------------------------------------------------------------
 1 | /* Licensed under the Apache License, Version 2.0 (the "License");
 2 |  * you may not use this file except in compliance with the License.
 3 |  * You may obtain a copy of the License at
 4 |  *
 5 |  *     http://www.apache.org/licenses/LICENSE-2.0
 6 |  *
 7 |  * Unless required by applicable law or agreed to in writing, software
 8 |  * distributed under the License is distributed on an "AS IS" BASIS,
 9 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 |  * See the License for the specific language governing permissions and
11 |  * limitations under the License.
12 |  *
13 |  * Copyright 2014 Yandex LLC
14 |  * All Rights Reserved.
15 |  *
16 |  * Author : Schamai Safra
17 |  *
18 |  *
19 |  * Utf8Transducer.h
20 |  */
21 | 
22 | 
23 | 
24 | #ifndef UTF8TRANSDUCER_H_
25 | #define UTF8TRANSDUCER_H_
26 | #include <utf8/utf8.h>
27 | #include <fst/fstlib.h>
28 | 
29 | namespace yatts {
30 | 
31 | using namespace fst;
32 | 
33 | static const int maxMsgLength = 1000;
34 | // Keeps a list of StdArc transducers with matching ids and one symbol table; the method bodies are not in this header.
35 | class Utf8Transducer {
36 | public:
37 |     enum Status {  // result code returned by the public operations below
38 |         OK,
39 |         WARN,
40 |         ERROR,
41 |     };
42 |     Utf8Transducer();
43 |     virtual ~Utf8Transducer();
44 |     Status appendFst(VectorFst<StdArc>*& transducer, string id = string(""));  // takes an already-built FST; the pointer is passed by reference
45 |     Status appendFst(const string& file_name, string id = string(""));  // NOTE(review): presumably reads the FST from file_name -- confirm in the .cpp
46 |     Status transduceText(string text, string& result);  // output is delivered through `result`
47 |     Status readOrNewSymtab(string file, string name);
48 |     const char* getMessage() const;  // NOTE(review): presumably exposes the `message` buffer -- confirm in the .cpp
49 | 
50 | protected:
51 |     template <class Arc>
52 |     VectorFst<Arc> * MakeInputFST(vector<typename Arc::Label> input);  // defined further down in this header
53 |     vector<VectorFst<StdArc>*> transducers;
54 |     SymbolTable * symbolTable;
55 |     vector<string> transducer_ids;  // NOTE(review): looks parallel to `transducers` -- confirm
56 |     char message[maxMsgLength];  // fixed-size buffer of maxMsgLength chars
57 | };
58 | 
59 | // Builds a linear FST that spells out `input`: a chain of states with one
60 | // arc per element, carrying that label on both sides with weight One; the
61 | // last state is final. The result is allocated with new and not freed here.
62 | template <class Arc>
63 | VectorFst<Arc> * Utf8Transducer::MakeInputFST(vector<typename Arc::Label> input) {
64 |     typedef typename Arc::Weight Weight;
65 |     VectorFst<Arc> * chain = new VectorFst<Arc>();
66 |     chain->DeleteStates();
67 |     typename Arc::StateId tail = chain->AddState();
68 |     chain->SetStart(tail);
69 |     for (const typename Arc::Label& label : input) {
70 |         const typename Arc::StateId head = chain->AddState();
71 |         chain->AddArc(tail, Arc(label, label, Weight::One(), head));
72 |         tail = head;
73 |     }
74 |     chain->SetFinal(tail, Weight::One());
75 |     return chain;
76 | }
77 | 
78 | 
79 | } /* namespace yatts */
80 | 
81 | 
82 | #endif /* UTF8TRANSDUCER_H_ */
83 | 


--------------------------------------------------------------------------------
/test/README:
--------------------------------------------------------------------------------
 1 | This directory contains the following files:
 2 | 
 3 | - rus_sentences.txt: 173 Russian sentences to test the transcriber output.
 4 | - rus_sentences.txt.g2p: the phonetic transcription of all 173 sentences.
 5 | - rus_sentences.txt.log: transcription log for every sentence, for debugging purposes.
 6 | 
 7 | The log file contains the following information (in this order):
 8 | 
 9 |     [SNUM]: sentence number. It should correlate with the line number of previous file.
10 |             Useful to quickly locate any sentence.
11 |     [SENT]: the original sentence.
12 |     [YOWR]: a word in the sentence was reconstructed as having a yo letter
13 |     [NORM]: the normalized sentence (currently only case normalization and separation
14 |             of punctuation symbols).
15 |     [WORD]: the word being processed.
16 |     [POSP]: POS prediction output for the word if available.
17 | 
18 |     One of the following:
19 | 
20 |         [DISA]: if the word was found in the homographs list. Values here can be:
21 |             * morpho-syntactic tags that were used for the disambiguation.
22 |             * "SINGLETON": the word is not really a homograph (only one unique
23 |               transcription found).
24 |             * "LEX1": no disambiguation possible, entry marked by LEX1 was chosen.
25 |             * "FREQ": no LEX information, the most frequent variant was chosen
26 |               (or the first one, if all were equally frequent).
27 |         [INFO]: word was not found in the homograph list. Possible values:
28 |             * "entry found in user lexicon".
29 |             * "entry found in lexicon".
30 |             * "stress predicted".
31 |         [STRS]: predicted string with stress information, if the word was not found in
32 |                 any dictionary.
33 | 
34 |     [SPHO]: phonetic transcription for the whole sentence after applying cross-word
35 |             assimilations.
36 | 
37 | Notes/disclaimer:
38 | 
39 | - The output files (g2p and log) are kept here for information purposes only. More
40 |   specifically, you should not expect to get the same results if you run the
41 |   transcriber on the input sentences. The final output will heavily depend on the
42 |   actual stress prediction model that you use. It can also depend on the version of
43 |   the software packages on which the transcription process depends.
44 | 
45 | - The transcriptions in rus_sentences.txt.g2p are not guaranteed to be correct.
46 |   Actually, the contrary is the case: since the version of the transcriber uploaded
47 |   to GitHub does not contain any POS prediction software, many of the words whose
48 |   pronunciation depends on their function in the sentence will not be predicted
49 |   correctly.
50 | 
51 | Thanks:
52 | 
53 | The test sentences were carefully selected by my colleague at Yandex Anastasiya
54 | Polkanova. They were used to assess the quality of the TTS transcriber.
55 | 


--------------------------------------------------------------------------------
/grammars/crossword.grm:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | #
13 | # Copyright 2014 Yandex LLC
14 | # All Rights Reserved.
15 | #
16 | # Author : Alexis Wilpert
17 | 
18 | 
19 | 
20 | import 'alphabets.grm' as alphabets;
21 | import 'definitions.grm' as defs;
22 | import 'consonants.grm' as cons;
23 | 
24 | # set/class definitions
25 | 
26 | LSEP = defs.LSEP;
27 | RSEP = defs.RSEP;
28 | EOS = "[EOS]" | "[SIL]";
29 | vowel = defs.vowel;
30 | cons_letter_hyphen = defs.cons_letter_hyphen;
31 | stress_minus_1 = defs.stress_minus_1;
32 | before_stress = defs.before_stress;
33 | all = defs.all;
34 | soft_cons = defs.soft_cons;
35 | hard_cons = defs.hard_cons;
36 | 
37 | devoicing_pairs = cons.devoicing_pairs;
38 | voicing_pairs = cons.voicing_pairs;
39 | voicing_context = cons.voicing_context;
40 | 
41 | #----------------------------------------------------------------------------
42 | 
43 | # devoicing across word boundaries
44 | # devoicing_context: what may open the next word -- a voiceless consonant, one of the listed n/m/l/r/j/v symbols, or a nucleus optionally preceded by "+"
45 | devoicing_context = alphabets.voiceless_consonants |
46 |                     "n" | "[nJ]" | "m" | "[mJ]"    |
47 |                     "l" | "[lJ]" | "r" | "[rJ]"    |
48 |                     "j" | "v" | "[vJ]" |
49 |                     ("+"? alphabets.nuclei)
50 | ;
51 | # apply devoicing_pairs when the right context is EOS, or a word separator followed by devoicing_context; the left context is unrestricted
52 | cross_devoicing = CDRewrite[devoicing_pairs,
53 |                             "",
54 |                             EOS |
55 |                             (alphabets.word_sep
56 |                              devoicing_context),
57 |                             alphabets.sigma_star
58 |                            ];
59 | 
60 | #----------------------------------------------------------------------------
61 | 
62 | # voicing across word boundaries
63 | # apply voicing_pairs when a word separator followed by voicing_context comes next; both sets are taken from consonants.grm
64 | cross_voicing = CDRewrite[voicing_pairs,
65 |                           "",
66 |                           (alphabets.word_sep
67 |                           voicing_context),
68 |                           alphabets.sigma_star
69 |                          ];
70 | 
71 | #----------------------------------------------------------------------------
72 | 
73 | # remove the [WUD] trigger (which marks certain function words in the lexicon)
74 | 
75 | clean_wud = CDRewrite[("[WUD]":""),
76 |                       "",
77 |                       "",
78 |                       alphabets.sigma_star
79 |                      ];
80 | 
81 | #----------------------------------------------------------------------------
82 | 
83 | export crossword = Optimize[cross_devoicing         @
84 |                             cross_voicing           @
85 |                             cons.voice_assimilation @
86 |                             clean_wud
87 |                            ];
88 | 


--------------------------------------------------------------------------------
/grammars/definitions.grm:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #     http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | #
13 | # Copyright 2014 Yandex LLC
14 | # All Rights Reserved.
15 | #
16 | # Author : Alexis Wilpert
17 | 
18 | 
19 | 
20 | import 'alphabets.grm' as alphabets;
21 | 
22 | # set/class definitions
23 | 
24 | export LSEP = (alphabets.word_sep | "-");
25 | export RSEP = (alphabets.word_sep);
26 | 
27 | # produced by the earlier palatalization stage (PALT, see palatalization.grm)
28 | soft_cons_phonetic = ("[bJ]" | "[vJ]" | "[gJ]" |
29 |                       "[dJ]" | "[zJ]" | "[kJ]" |
30 |                       "[lJ]" | "[mJ]" | "[nJ]" |
31 |                       "[pJ]" | "[rJ]" | "[sJ]" |
32 |                       "[tJ]" | "[fJ]" | "[xJ]"
33 |                      );
34 | 
35 | # stressed vowels are already phonetic
36 | export vowel = Optimize[(alphabets.vow_letter | alphabets.nuclei)];
37 | 
38 | # [j] could appear after the approximant insertion
39 | export cons_letter_hyphen = alphabets.cons_letter |
40 |                             soft_cons_phonetic    |
41 |                             "j"                   |
42 |                             "-"
43 | ;
44 | 
45 | export stress_minus_1 = Optimize[cons_letter_hyphen* "+"];
46 | 
47 | export before_stress = Optimize[(cons_letter_hyphen | vowel)* "+"];
48 | 
49 | export all = Optimize[cons_letter_hyphen | vowel];
50 | 
51 | # Vi   = <и, е, ё, я, ю> - vowels that trigger palatalization
52 | # Cvar = <б, п, в, ф, з, с, г, к, д, х, т, л, м, н, р>
53 | #
54 | # <Cvar> / ¬_ <Vi> --> hard consonant
55 | # <Cvar> /  _ <Vi> --> soft consonant
56 | # <Cvar> /  _ <ь>  --> soft consonant
57 | # <Cvar> /  _ <ъ>  --> hard consonant
58 | #
59 | # always hard letters:
60 | # <ж, ш, ц>
61 | #
62 | # always soft letters:
63 | # <ч, щ>
64 | #
65 | # "Vi-palatalization" does not exist at the words junction.
66 | 
# the rest of the contexts are not needed, because they are represented
# by the phonetic palatal consonants that were generated previously

# Letters treated as always soft (including the soft sign), each optionally
# followed by a hyphen.
export always_soft_cons = ("ч" | "щ" | "ь") "-"?;

# Context-dependent soft consonants: a phonetic soft consonant or "j",
# optionally followed by a hyphen.
cd_soft_cons = (soft_cons_phonetic | "j") "-"?;

# Letters treated as always hard (including the hard sign), each optionally
# followed by a hyphen.
export always_hard_cons = ("ж" | "ш" | "ц" | "ъ") "-"?;

# no other palatalization will take place, so we can safely assume that
# consonants that are not phonetically soft, are hard consonants
# not a context-dependent rule, but we leave it as it is in case of changes
# in the future
cd_hard_cons = ("б" | "п" | "в"  |
                "ф" | "з" | "с"  |
                "г" | "к" | "д"  |
                "х" | "т" | "л"  |
                "м" | "н" | "р"
               ) "-"?
;

# Union of the always-soft and the context-dependent soft consonants.
export soft_cons = Optimize[(always_soft_cons | cd_soft_cons)];

# Union of the always-hard and the context-dependent hard consonants.
export hard_cons = Optimize[(always_hard_cons | cd_hard_cons)];
90 | 
91 | 


--------------------------------------------------------------------------------
/src/utf8ext.h:
--------------------------------------------------------------------------------
  1 | /* Licensed under the Apache License, Version 2.0 (the "License");
  2 |  * you may not use this file except in compliance with the License.
  3 |  * You may obtain a copy of the License at
  4 |  *
  5 |  *     http://www.apache.org/licenses/LICENSE-2.0
  6 |  *
  7 |  * Unless required by applicable law or agreed to in writing, software
  8 |  * distributed under the License is distributed on an "AS IS" BASIS,
  9 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 |  * See the License for the specific language governing permissions and
 11 |  * limitations under the License.
 12 |  *
 13 |  * Copyright 2014 Yandex LLC
 14 |  * All Rights Reserved.
 15 |  *
 16 |  * Author : Schamai Safra
 17 |  *
 18 |  *
 19 |  * utf8ext.h
 20 |  */
 21 | 
 22 | 
 23 | 
 24 | #ifndef UTF8EXT_H_
 25 | #define UTF8EXT_H_
 26 | 
 27 | #include "utf8/utf8.h"
 28 | #include <stdexcept>
 29 | 
 30 | namespace utf8 {
 31 | 
 32 | using namespace std;
 33 | template <typename T, size_t N>
 34 | T* begin(T(&arr)[N]) { return &arr[0]; }
 35 | template <typename T, size_t N>
 36 | T* end(T(&arr)[N]) { return &arr[0]+N; }
 37 | 
 38 | template <typename T, size_t N>
 39 | bool is_in(T x, const T(&arr)[N]) {
 40 |     const T* lb = lower_bound(begin(arr), end(arr), x);
 41 |     return (end(arr) != lb) && (*lb == x);
 42 | }
 43 | 
 44 | 
 45 | const uint32_t WHITESPACE_CHARS[] = {32, 160, 5760, 6158, 8192, 8193, 8194, 8195,8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204,  8205, 8239, 8287, 8288, 12288, 65279};
 46 | 
 47 | const uint32_t CR = 13, LF = 10;
 48 | const uint32_t EOL_CHARS[] = {10, 11, 12, 13, 133, 8232, 8233};
 49 | 
 50 | // CR=13, LF=10, 11, 12, 133, 8232, 8233
 51 | 
 52 | 
 53 | inline bool is_whitespace(uint32_t cp) {
 54 |         return is_in(cp, WHITESPACE_CHARS);
 55 | }
 56 | 
 57 | inline bool is_EOL(uint32_t cp) {
 58 |         return is_in(cp, EOL_CHARS);
 59 | }
 60 | 
 61 | 
 62 | /**
 63 |  * like utf8::next, but in case of invalid sequence,
 64 |  * consumes irreparable octets before throwing.
 65 |  * @param start
 66 |  * @param end
 67 |  * @return next code point, if valid
 68 |  */
 69 | template <typename octet_iterator>
 70 | uint32_t next_skip_invalid(octet_iterator& it, octet_iterator end)
 71 | {
 72 |     uint32_t cp = 0;
 73 |     internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
 74 |     octet_iterator bad;
 75 |     switch (err_code) {
 76 |         case internal::UTF8_OK :
 77 |             break;
 78 |         case internal::NOT_ENOUGH_ROOM :
 79 |             throw not_enough_room();
 80 |             break;
 81 |         case internal::INVALID_LEAD :
 82 |             bad = it;
 83 |             it++;
 84 |             throw invalid_utf8(*bad);
 85 |             break;
 86 |         case internal::INCOMPLETE_SEQUENCE :
 87 |         case internal::OVERLONG_SEQUENCE :
 88 |             bad = it;
 89 |             it++;
 90 |             while (it != end && utf8::internal::is_trail(*it))
 91 |                 ++it;
 92 |             throw invalid_utf8(*bad);
 93 |             break;
 94 |         case internal::INVALID_CODE_POINT :
 95 |             it++;
 96 |             while (it != end && utf8::internal::is_trail(*it))
 97 |                 ++it;
 98 |             throw invalid_code_point(cp);
 99 |     }
100 |     return cp;
101 | }
102 | 
103 | 
104 | }
105 | 
106 | 
107 | #endif /* UTF8EXT_H_ */
108 | 


--------------------------------------------------------------------------------
/grammars/palatalization.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | import 'alphabets.grm' as alphabets;
 21 | import 'definitions.grm' as defs;
 22 | 
 23 | # approximant insertion is necessary when:
 24 | # - <e, ю, я, ё> are after another vowel
 25 | # - or after <ь, ъ>
 26 | # - or at the beginning of the word.
 27 | 
 28 | BOS = "[BOS]" | "[SIL]";
 29 | SEP = alphabets.word_sep;
 30 | WUD = "[WUD]" SEP;
 31 | 
 32 | #----------------------------------------------------------------------------
 33 | 
 34 | insertion_context = (alphabets.vow_letter |
 35 |                      "й"                  |
 36 |                      ("ь" | "ъ")          |
 37 |                      defs.LSEP            |
 38 |                      BOS
 39 |                     ) "+"?
 40 | ;
 41 | 
 42 | insertion_pairs = ("е":"jе") |
 43 |                   ("ю":"jю") |
 44 |                   ("я":"jя") |
 45 |                   ("ё":"jё")
 46 | ;
 47 | 
 48 | initial_approximant_insertion = CDRewrite[insertion_pairs,
 49 |                                           insertion_context,
 50 |                                           "",
 51 |                                           alphabets.sigma_star
 52 |                                          ];
 53 | 
 54 | # correct approximant insertion after WUD
 55 | 
 56 | approx_insertion_correction = CDRewrite[("j":""),
 57 |                                         "[WUD]" SEP,
 58 |                                         "",
 59 |                                         alphabets.sigma_star
 60 |                                        ];
 61 | 
 62 | approximant_insertion = Optimize[initial_approximant_insertion @
 63 |                                  approx_insertion_correction
 64 |                                 ];
 65 | 
 66 | # WARNING: possible output --> + j V
 67 | 
 68 | #----------------------------------------------------------------------------
 69 | 
# we do the palatalization here since later we will lose the orthographic
# context triggered by <я>, <ю> and <ё>

# we need to take this into consideration in the rules below

# Hard consonant letter -> phonetic soft (palatalized) consonant symbol.
palatal_cons_pairs = ("б":"[bJ]") |
                     ("в":"[vJ]") |
                     ("г":"[gJ]") |
                     ("д":"[dJ]") |
                     ("з":"[zJ]") |
                     ("к":"[kJ]") |
                     ("л":"[lJ]") |
                     ("м":"[mJ]") |
                     ("н":"[nJ]") |
                     ("п":"[pJ]") |
                     ("р":"[rJ]") |
                     ("с":"[sJ]") |
                     ("т":"[tJ]") |
                     ("ф":"[fJ]") |
                     ("х":"[xJ]")
;

# A consonant is palatalized when it is directly followed by <ь>, or by a
# soft vowel letter that may be preceded by an optional "-" and an optional
# "+". There is no restriction on the left context.
cd_palatalization  = CDRewrite[palatal_cons_pairs,
                               "",
                               "ь" | ("-"? "+"?
                                      alphabets.soft_vow_letter),
                               alphabets.sigma_star
                              ];

#----------------------------------------------------------------------------

# Exported rule: approximant insertion composed with palatalization, in that
# order.
export palatalization = Optimize[approximant_insertion @
                                 cd_palatalization
                                ];
104 | 
105 | 


--------------------------------------------------------------------------------
/src/yatts_util.cpp:
--------------------------------------------------------------------------------
  1 | /* Licensed under the Apache License, Version 2.0 (the "License");
  2 |  * you may not use this file except in compliance with the License.
  3 |  * You may obtain a copy of the License at
  4 |  *
  5 |  *     http://www.apache.org/licenses/LICENSE-2.0
  6 |  *
  7 |  * Unless required by applicable law or agreed to in writing, software
  8 |  * distributed under the License is distributed on an "AS IS" BASIS,
  9 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 |  * See the License for the specific language governing permissions and
 11 |  * limitations under the License.
 12 |  *
 13 |  * Copyright 2014 Yandex LLC
 14 |  * All Rights Reserved.
 15 |  *
 16 |  * Author : Schamai Safra
 17 |  *
 18 |  *
 19 |  * yatts_util.cpp
 20 |  */
 21 | 
 22 | 
 23 | 
#include "yatts_util.h"

#include <algorithm>
#include <climits>
#include <cstddef>
#include <cstring>
#include <stdexcept>
 27 | 
 28 | int to_uint(char const *s) {
 29 |     int result = 0;
 30 |     if (!*s) {
 31 |         throw std::invalid_argument("invalid input string");
 32 |     }
 33 |     while (*s) {
 34 |         if (*s >= '0' && *s <= '9') {
 35 |             result = result * 10 + (*s - '0');
 36 |         } else {
 37 |             throw std::invalid_argument("invalid input string");
 38 |         }
 39 |         s++;
 40 |     }
 41 |     return result;
 42 | }
 43 | 
 44 | 
 45 | std::vector<std::string> tokenize_utf8_string(std::string* utf8_string, std::string* delimiters,
 46 |                                     int limit) {
 47 |     /*
 48 |      Support for tokenizing a utf-8 string. Adapted to also support delimiters and a limit.
 49 |      Note that (unlike Joe's version) any of the utf8 characters in delimiters is a delimiter (like strtok),
 50 |      not the whole string.
 51 |      Note that leading, trailing or multiple consecutive delimiters will result in
 52 |      empty vector elements.  Normally should not be a problem but just in case.
 53 |      Also note that any tokens that cannot be found in the model symbol table will be
 54 |      deleted from the input word prior to grapheme-to-phoneme conversion.
 55 | 
 56 |      http://stackoverflow.com/questions/2852895/c-iterate-or-split-utf-8-string-into-array-of-symbols#2856241
 57 | 
 58 |      schsafra: adapted from http://code.google.com/p/phonetisaurus/ (phonetisaurus-0.7.8) by Josef Robert Novak
 59 | 
 60 |      */
 61 |     char* str = (char*) utf8_string->c_str(); // utf-8 string
 62 |     char* str_i = str;                         // string iterator
 63 |     char* str_j = str;
 64 |     char* end = str + strlen(str) + 1;           // end iterator
 65 |     std::vector<std::string> string_vec;
 66 |     std::vector<int> delim_code;
 67 |     if (delimiters->compare("") != 0) {
 68 |         string_vec.push_back("");
 69 |         char* delim_i = (char*) delimiters->c_str();
 70 |         char* delim_end = delim_i + strlen(delim_i) + 1;
 71 |         do {
 72 |             delim_code.push_back(utf8::next(delim_i, delim_end));
 73 |         } while (delim_i < delim_end);
 74 |     }
 75 |     do {
 76 |         str_j = str_i;
 77 |         utf8::uint32_t code = utf8::next(str_i, end); // get 32 bit code of a utf-8 symbol
 78 |         if (code == 0) {
 79 |             continue;
 80 |         }
 81 |         int start = strlen(str) - strlen(str_j);
 82 |         int end = strlen(str) - strlen(str_i);
 83 |         int len = end - start;
 84 | 
 85 |         if (delimiters->compare("") == 0) {
 86 |             string_vec.push_back(utf8_string->substr(start, len));
 87 |         } else {
 88 |             if ((limit == 0 || string_vec.size() < limit) &&
 89 |                     std::find(delim_code.begin(), delim_code.end(), code) != delim_code.end()) {
 90 |                 string_vec.push_back("");
 91 |             } else {
 92 |                 string_vec[string_vec.size() - 1] += utf8_string->substr(start,
 93 |                                                                          len);
 94 |             }
 95 |         }
 96 |     } while (str_i < end);
 97 | 
 98 |     return string_vec;
 99 | }
100 | 
101 | 
/*
 * Parses the first count characters of s as a decimal integer with an
 * optional leading '+' or '-'.
 *
 * s:     character buffer; it does not need to be NUL-terminated.
 * count: number of characters of s to parse (sign included).
 * Returns the parsed value.
 * Throws std::invalid_argument when s is null, count is 0, no digit follows
 * the sign, a non-digit is found, or the value does not fit into an int.
 *
 * http://stackoverflow.com/questions/9620437/string-const-char-size-t-to-int
 */
int to_int(char const *s, size_t count)
{
     if (s == 0 || count == 0)
          throw std::invalid_argument("invalid input string");

     const bool negative = (s[0] == '-');
     size_t i = (negative || s[0] == '+') ? 1 : 0;
     if (i == count)  // a sign without any digit
          throw std::invalid_argument("invalid input string");

     // The value is accumulated as a negative number because the negative
     // range of int is one larger than the positive range, so INT_MIN parses.
     int result = 0;
     for (; i < count; ++i)
     {
          if (s[i] < '0' || s[i] > '9')
               throw std::invalid_argument("invalid input string");
          const int digit = s[i] - '0';
          // Reject before computing: signed overflow is undefined behaviour.
          // Division truncates toward zero, which rounds this bound upwards.
          if (result < (INT_MIN + digit) / 10)
               throw std::invalid_argument("input value out of range");
          result = result * 10 - digit;
     }
     if (!negative && result < -INT_MAX)  // -result would not fit
          throw std::invalid_argument("input value out of range");
     return negative ? result : -result;
}
123 | 


--------------------------------------------------------------------------------
/src/transduce.cpp:
--------------------------------------------------------------------------------
  1 | /* Licensed under the Apache License, Version 2.0 (the "License");
  2 |  * you may not use this file except in compliance with the License.
  3 |  * You may obtain a copy of the License at
  4 |  *
  5 |  *     http://www.apache.org/licenses/LICENSE-2.0
  6 |  *
  7 |  * Unless required by applicable law or agreed to in writing, software
  8 |  * distributed under the License is distributed on an "AS IS" BASIS,
  9 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 |  * See the License for the specific language governing permissions and
 11 |  * limitations under the License.
 12 |  *
 13 |  * Copyright 2014 Yandex LLC
 14 |  * All Rights Reserved.
 15 |  *
 16 |  * Author : Schamai Safra
 17 |  *
 18 |  *
 19 |  * transduce.cpp
 20 |  */
 21 | 
 22 | 
 23 | 
 24 | #include <float.h>
 25 | #include <stdexcept>
 26 | #include <fst/fstlib.h>
 27 | #include <fst/script/print.h>
 28 | #include <fst/dfs-visit.h>
 29 | #include <fst/arc-map.h>
 30 | #include <fst/extensions/far/far.h>
 31 | #include <fst/extensions/far/farscript.h>
 32 | #include <utf8/utf8.h>
 33 | #include "utf8ext.h"
 34 | #include "yatts_util.h"
 35 | #include "Utf8Transducer.h"
 36 | 
 37 | using namespace fst;
 38 | using namespace yatts;
 39 | 
 40 | #define VERBOSE 1
 41 | #if VERBOSE
 42 | #define msg(x, ...) fprintf(stderr, x, ##__VA_ARGS__);
 43 | #else
 44 | #define msg(x, ...)
 45 | #endif
 46 | #define SAVE_INTERMEDIATE 0
 47 | 
// Command-line flags.
// --fst is mandatory; main() splits its value on ',' and loads every element
// as one rewrite FST, in the given order.
DEFINE_string(fst, "", "rewrite FST.");
// --symbols is optional; main() passes its value to
// Utf8Transducer::readOrNewSymtab().
DEFINE_string( symbols, "", "symbol table of the rewrite FST.");
 50 | 
 51 | 
 52 | 
 53 | void ProcessCorpus(string corpus_filename, Utf8Transducer& transducer, FILE * fp) {
 54 |     istream * ifp = &cin;
 55 |     ifstream corpus_fp;
 56 |     if (corpus_filename != "-") {
 57 |         corpus_fp.open(corpus_filename.c_str());
 58 |         if (corpus_fp.is_open()) {
 59 |             ifp = &corpus_fp;
 60 |         } else {
 61 |             msg("*** warning: Can't open '%s' for reading.\n",
 62 |                     corpus_filename.c_str());
 63 |             exit(1);
 64 |         }
 65 |     }
 66 |     string line;
 67 |     int lineCount = 0;
 68 |     while (ifp->good()) {
 69 |         getline(*ifp, line);
 70 |         if (line.compare("") == 0) {
 71 |             continue;
 72 |         }
 73 |         lineCount++;
 74 |         try {
 75 |             string utf8line;
 76 |             int status = transducer.transduceText(line, utf8line);
 77 |             switch (status) {
 78 |                 case Utf8Transducer::OK:
 79 |                     fprintf(fp, "%s\n", utf8line.c_str());
 80 |                     break;
 81 |                 case Utf8Transducer::WARN:
 82 |                     msg("*** warning: %s\n", transducer.getMessage());
 83 |                     fprintf(fp, "%s\n", utf8line.c_str());
 84 |                     break;
 85 |                 default:
 86 |                     msg("*** warning: %s\n", transducer.getMessage());
 87 |                     fprintf(fp, "\n");
 88 |                     break;
 89 |             }
 90 | 
 91 |         } catch (std::exception& e) {
 92 |             cerr << e.what() << endl;
 93 |             throw;
 94 |         }
 95 |     }
 96 |     corpus_fp.close();
 97 | }
 98 | 
 99 | 
100 | 
101 | int main(int argc, char **argv) {
102 |     string usage =
103 |             "Transduce (rewrite) words according to rewrite-fst .\n\n  Usage: ";
104 |     usage += argv[0];
105 |     usage += " [input.utf [output.utf]]\n";
106 |     set_new_handler(FailedNewHandler);
107 |     SetFlags(usage.c_str(), &argc, &argv, true);
108 | 
109 | #define MANDATORY(name)                        \
110 |     if (FLAGS_ ## name == "") {                \
111 |         fprintf(stderr,"*** Error: --" # name " is mandatory\n");   \
112 |         exit(1);                               \
113 |     }
114 | 
115 |     MANDATORY(fst);
116 | 
117 |     if (argc > 3) {
118 |         ShowUsage();
119 |         return 1;
120 |     }
121 | 
122 |     string in_name = (argc > 1 && (strcmp(argv[1], "-") != 0)) ? argv[1] : "-";
123 |     string out_name = argc > 2 ? argv[2] : "-";
124 | 
125 |     yatts::Utf8Transducer transducer;
126 | 
127 |     // currently not used: transducer deletes symbol tables when reading fsts
128 |     // later we may use one, or even read one for each fst
129 |     transducer.readOrNewSymtab(FLAGS_symbols, "symbol_table");
130 | 
131 |     string delim = string(",");
132 | 
133 |     vector<string> rules = tokenize_utf8_string(&FLAGS_fst, &delim);
134 |     for (int i = 0; i < rules.size(); i++) {
135 |         Utf8Transducer::Status s = transducer.appendFst(rules[i], rules[i]);
136 |         switch (s) {
137 |         case Utf8Transducer::OK:
138 |             msg("loaded fst '%s'\n", rules[i].c_str());
139 |             break;
140 |         case Utf8Transducer::WARN:
141 |             msg("*** warning: %s\n", transducer.getMessage());
142 |             break;
143 |         case Utf8Transducer::ERROR:
144 |             throw runtime_error(transducer.getMessage());
145 |         default:
146 |             msg("*** warning: %s\n", transducer.getMessage());
147 |             break;
148 |         }
149 |     }
150 | 
151 |     FILE * fp;
152 |     if (out_name == "-") {
153 |         fp = stdout;
154 |     } else {
155 |         fp = fopen(out_name.c_str(), "w");
156 |     }
157 |     ProcessCorpus(in_name, transducer, fp);
158 | }
159 | 


--------------------------------------------------------------------------------
/src/Utf8Transducer.cpp:
--------------------------------------------------------------------------------
  1 | /* Licensed under the Apache License, Version 2.0 (the "License");
  2 |  * you may not use this file except in compliance with the License.
  3 |  * You may obtain a copy of the License at
  4 |  *
  5 |  *     http://www.apache.org/licenses/LICENSE-2.0
  6 |  *
  7 |  * Unless required by applicable law or agreed to in writing, software
  8 |  * distributed under the License is distributed on an "AS IS" BASIS,
  9 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 |  * See the License for the specific language governing permissions and
 11 |  * limitations under the License.
 12 |  *
 13 |  * Copyright 2014 Yandex LLC
 14 |  * All Rights Reserved.
 15 |  *
 16 |  * Author : Schamai Safra
 17 |  *
 18 |  *
 19 |  * Utf8Transducer.cpp
 20 |  */
 21 | 
 22 | 
 23 | 
 24 | #include "utf8ext.h"
 25 | #include "Utf8Transducer.h"
 26 | 
 27 | namespace yatts {
 28 | 
 29 | Utf8Transducer::Utf8Transducer(): symbolTable(0) {
 30 |     message[0] = 0;
 31 |     // TODO Auto-generated constructor stub
 32 | 
 33 | }
 34 | 
 35 | Utf8Transducer::~Utf8Transducer() {
 36 |     // TODO Auto-generated destructor stub
 37 | }
 38 | 
 39 | Utf8Transducer::Status Utf8Transducer::appendFst(VectorFst<StdArc>*& transducer, string id) {
 40 |     if (id == "") {
 41 |         char buf[10];
 42 |         snprintf(buf, 10, "%lu",transducers.size());
 43 |         id = string(buf);
 44 |     }
 45 |     if (transducer) {
 46 |         transducers.push_back(transducer);
 47 |         transducer_ids.push_back(id);
 48 |         return OK;
 49 |     } else {
 50 |         snprintf(message, maxMsgLength, "Cannot append null FST '%s'", id.c_str());
 51 |         return WARN;
 52 |     }
 53 | }
 54 | 
 55 | Utf8Transducer::Status Utf8Transducer::appendFst(const string& file_name, string id) {
 56 |     if (id == "") {
 57 |         char buf[10];
 58 |         snprintf(buf, 10, "%lu",transducers.size());
 59 |         id = string(buf);
 60 |     }
 61 |     VectorFst<StdArc> * fst = 0;
 62 |     try {
 63 |          fst = VectorFst<StdArc>::Read(file_name);
 64 |     } catch (exception& e) {
 65 |         snprintf(message, maxMsgLength, "Cannot load FST '%s' from file '%s': %s", id.c_str(), file_name.c_str(),e.what());
 66 |         return WARN;
 67 |     }
 68 |     if (!fst) {
 69 |         snprintf(message, maxMsgLength, "Cannot load FST '%s' from file '%s'", id.c_str(), file_name.c_str());
 70 |         return WARN;
 71 |     }
 72 |     // sort arcs by input label
 73 |     ArcSort(fst, ILabelCompare<StdArc>());
 74 |     // ------- Delete input symbols (assume utf8 input)
 75 |     fst->SetInputSymbols(NULL);
 76 |     this->appendFst(fst, id);
 77 |     return OK;
 78 | }
 79 | 
 80 | Utf8Transducer::Status Utf8Transducer::transduceText(string text, string& result) {
 81 |     string::iterator readPos = text.begin();
 82 |     utf8::uint32_t codePoint;
 83 |     vector<StdArc::Label> input;
 84 |     //string utf8line;
 85 |     while (readPos < text.end()) {
 86 |         codePoint = utf8::next_skip_invalid(readPos, text.end());
 87 |         input.push_back(codePoint);
 88 |     }
 89 |     try {
 90 |         bool ok = true;
 91 |         VectorFst<StdArc> * fst = MakeInputFST<StdArc>(input), * fst2;
 92 |         for (size_t i = 0; i < transducers.size(); i++) {
 93 |             fst2 = new VectorFst<StdArc>(
 94 |                     ComposeFst<StdArc>(*fst, *transducers[i]));
 95 |             delete fst;
 96 |             fst = fst2;
 97 |             Connect(fst);
 98 |             if (fst->NumStates() == 0) {
 99 |                 snprintf(message, maxMsgLength, "No transduction after applying transducer '%s'", transducer_ids[i].c_str());
100 |                 break;
101 |             }
102 |         }
103 |         if (fst && fst->Start() >= 0 && fst->NumArcs(fst->Start()) > 0) {
104 |             fst::VectorFst<StdArc> nbest_paths;
105 |             fst::ShortestPath(*fst, &nbest_paths, 2);
106 |             delete fst;
107 |             vector<unsigned short> utf16line;
108 |             StdArc::StateId cur_state = nbest_paths.Start();
109 |             if (cur_state < 0 || nbest_paths.NumArcs(cur_state) < 1) {
110 |                 ok = false;
111 |             } else {
112 |                 if (nbest_paths.NumArcs(cur_state) != 1) {
113 |                     snprintf(message, maxMsgLength,
114 |                             "ambiguous transduction (%s)", text.c_str());
115 |                 }
116 |                 for (;
117 |                         nbest_paths.Final(cur_state)
118 |                                 == StdArc::Weight::Zero();) {
119 |                     fst::ArcIterator<fst::Fst<StdArc> > aiter(nbest_paths,
120 |                             cur_state);
121 |                     StdArc arc = aiter.Value();
122 |                     if (arc.olabel != 0) {
123 |                         utf16line.push_back(arc.olabel);
124 |                     }
125 |                     cur_state = arc.nextstate;
126 |                 }
127 |                 utf8::utf16to8(utf16line.begin(), utf16line.end(),
128 |                         back_inserter(result));
129 |             }
130 |         } else {
131 |             ok = false;
132 |         }
133 |         if (ok) {
134 |             return OK;
135 |         } else {
136 |             //that's a temporary hack for Alexis' script to work. It is wrong because an empty
137 |             //result line could also be a valid transduction
138 |            return WARN;
139 |         }
140 |     } catch (std::exception& e) {
141 |         cerr << e.what() << endl;
142 |         throw;
143 |     }
144 | }
145 | 
146 | 
147 | Utf8Transducer::Status Utf8Transducer::readOrNewSymtab(string file, string name) {
148 |     if (!file.empty()) {
149 |         ifstream st_fp;
150 |         st_fp.open(file.c_str());
151 |         if (st_fp.is_open()) {
152 |             symbolTable = SymbolTable::ReadText(st_fp,name);
153 |         }
154 |         st_fp.close();
155 |     } else {
156 |         symbolTable = new SymbolTable("name");
157 |     }
158 |     if (!symbolTable) {
159 |         snprintf(message, maxMsgLength, "Couldn't read symbol table %s from file %s\n", name.c_str(), file.c_str());
160 |         return WARN;
161 |     }
162 |     return OK;
163 | }
164 | 
165 | 
166 | const char* Utf8Transducer::getMessage() const {
167 |     return message;
168 | }
169 | 
170 | } /* namespace yatts */
171 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # RusPhonetizer
  2 | 
  3 | _**IMPORTANT NOTE: this repository has been archived, as I am not maintaining it actively any more**_
  4 | 
  5 | ## General
  6 | 
  7 | *RusPhonetizer* is a simple script together with a set of Thrax grammar rules and dictionaries for the phonetic transcription of Russian sentences.
  8 | 
  9 | ## Software requirements
 10 | 
 11 | *RusPhonetizer* depends on the following software packages:
 12 | 
 13 | - [OpenFST](http://www.openfst.org/twiki/bin/view/FST/WebHome): used by Phonetisaurus, Thrax, and the Transcribe tool (see below). Tested with version 1.5.4.
 14 | - [OpenGrm Thrax Grammar Development Tools](http://openfst.cs.nyu.edu/twiki/bin/view/GRM/Thrax): needed to compile the grammar rules. Tested with version 1.2.2.
 15 | - [The WFST-driven Phoneticizer Phonetisaurus](https://github.com/JosefNovak/Phonetisaurus): required to build and use the stress prediction model. Tested with version 0.8a (https://www.dropbox.com/s/154q9yt3xenj2gr/phonetisaurus-0.8a.tgz)
 16 | 
 17 | The compiler needs to be C++11 compliant.
 18 | 
 19 | A small patch must be applied to the Thrax source code before compiling it. Please refer to misc/README.
 20 | 
 21 | ## Transcribe tool
 22 | 
 23 | The transcription process relies on a small tool used to apply the G2P FST rules. The sources are kept in src/. A Makefile is provided for easy compilation.
 24 | 
 25 | ## Grammars
 26 | 
 27 | The grammars must be compiled before they can be used by the transcription tool. Please refer to grammars/README for more information.
 28 | 
 29 | See [PhoneGroups](https://github.com/wilpert/PhoneGroups/blob/master/tables/YANDEX/map_YANDEX-ttssampa_ru-RU.dat) for the list of valid phoneme symbols and their meaning as used in the Thrax grammars.
 30 | 
 31 | ## Dictionaries
 32 | 
 33 | The most common type of transcriptions is what I call "pseudo-transcriptions": basically the same Cyrillic string as the entry word enriched with the stress information and possibly with some other lexical pronunciation exceptions. The following dictionaries are available:
 34 | 
 35 | - **tts-dict-simple.pruned.txt**: exceptions dictionary that contains mainly words for which the stress prediction model did not predict the stress correctly. There are also a few other entries, mainly function words, with a pure phonetic transcription. Depending on the stress prediction model you build, some more entries might be needed in this file to obtain correct phonetic transcriptions. The format of the entries in this dictionary is as follows:
 36 | 
 37 | ```
 38 | ORTHO \t PHONO(,\s*PHONO)*
 39 | 
 40 | кредитно-расчётный	кредитно-расчётный
 41 | крем-брюле	кр+ем-брюле, крем-брюл+е
 42 | ```
 43 | 
 44 | - **tts-dict-homographs.txt**: dictionary with multiple transcriptions and morpho-syntactic information for homograph words. To be able to use this information, a tool for word disambiguation is required, which is not provided in the current package.
 45 | 
 46 | ```
 47 | FREQ \t ORTHO \t POS'('FEATS*')' \t '[' PHONO ']' (LEX\d)?
 48 | FEATS = FEAT(','\s?FEAT)*
 49 | 
 50 | 147286	войска	NN(sg)	[в+ойска]
 51 | 47286	войска	NN(pl)	[войск+а]
 52 | 66172	пола	NN(gen, sg, msc)	[п+ола] LEX1
 53 | 66172	пола	NN(nom, sg, fem)	[пол+а] LEX2
 54 | ```
 55 | 
 56 | - **tts-dict-yo-list.txt**: a list of words that should be written with letter <ё> (yo), used for reconstructing those words in the case that the input does not contain it.
 57 | 
 58 | ```
 59 | ORTHO \t ORTHO
 60 | 
 61 | артем	артём
 62 | ```
 63 | 
 64 | ## Main script options
 65 | 
 66 | ```AsciiDoc
 67 |   -h, --help            show this help message and exit
 68 |   -i INPUT, --input=INPUT
 69 |                         The file containing the words to transcribe
 70 |   -y YO_LIST, --yo_list=YO_LIST
 71 |                         List of words that contain the letter <yo> (OPT)
 72 |   -l DICTIONARY, --dictionary=DICTIONARY
 73 |                         A simple dictionary file (OPT)
 74 |   -u USER, --user=USER  A user lexicon file in the same format as simple
 75 |                         dictionary (OPT)
 76 |   -a HOMOGRAPHS, --homographs=HOMOGRAPHS
 77 |                         A file with homographs (OPT)
 78 |   -m MODEL_FILE, --model_file=MODEL_FILE
 79 |                         Read g2p model from FILE (for stress prediction)
 80 |   -g G2P_FST, --g2p_fst=G2P_FST
 81 |                         Path to the G2P FST(s)
 82 | 
 83 | python scripts/tts_transcriber.py \
 84 | -i test/rus_sentences.txt \
 85 | -y dictionaries/tts-dict-yo-list.txt \
 86 | -l dictionaries/tts-dict-simple.pruned.txt \
 87 | -a dictionaries/tts-dict-homographs.txt \
 88 | -m stress_prediction.fst \
 89 | -g "grammars/G2P1,grammars/G2P2"
 90 | ```
 91 | 
 92 | ## Transcription flow
 93 | 
 94 | 1. Tokenize/normalize sentence
 95 | 2. Get POS analysis for the tokenized/normalized sentence
 96 | 3. For every token after the POS analysis:
 97 |   - If no POS/features are available, give the word a generic GEN_POS.
 98 |   - Look up dictionaries:
 99 |     - First, try to find the word in the user dictionary. If found, retrieve its transcription.
100 |     - Second, try to find the word in the homographs dictionary. If found, retrieve its transcription as follows:
101 |       - Find the correct transcriptions in the homographs dictionary using the POS analysis (best intersection).
102 |       - If no intersection is found, get the transcription variant tagged by 'LEX1'.
103 |       - If no 'LEX' tags are available for entry, get the most frequent one.
104 |       - If everything fails, take the first transcription found.
105 |     - Third, try to find the word in the simple dictionary.
106 |     - Finally, if the word is not found in any dictionary, predict stress with the stress prediction FST model.
107 |   - For correct G2P (information used by Thrax rules), attach POS information to the token in the cases supported in the
108 |   G2P rules (currently, only adjectives and verbs).
109 | 4. Send the result of concatenating all resulting tokens to the G2P FST chain.
110 | 
111 | ## Stress prediction model
112 | 
113 | Due to GitHub file size limits, the data required for building the stress prediction model cannot be included in the
114 | repository. However, I have made it accessible from the following link:
115 | 
116 | https://www.mycloud.ch/s/S00DD7C0E5E1814BDE44BFBB92868EDB7E94CEA7AB7
117 | 
118 | I have also included a prebuilt model in the package in case you do not succeed in building it yourself. Let me
119 | know if you have any problems accessing the data.
120 | 


--------------------------------------------------------------------------------
/grammars/diphthongs.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | import 'alphabets.grm' as alphabets;
 21 | import 'definitions.grm' as defs;
 22 | 
 23 | # set/class definitions: local aliases for classes from definitions.grm,
 24 | # plus the separator and consonant unions used as rule contexts in this file
 25 | LSEP = defs.LSEP;
 26 | RSEP = defs.RSEP;
 27 | LSEP_ALL = (defs.RSEP | "[SIL]" | "[BOS]" | "-");  # NOTE(review): built on defs.RSEP, not defs.LSEP -- confirm intended
 28 | RSEP_ALL = (defs.RSEP | "[SIL]" | "[EOS]");
 29 | vowel = defs.vowel;
 30 | cons_letter_hyphen = defs.cons_letter_hyphen;
 31 | stress_minus_1 = defs.stress_minus_1;
 32 | before_stress = defs.before_stress;
 33 | all = defs.all;
 34 | soft_cons = defs.soft_cons;
 35 | hard_cons = defs.hard_cons;
 36 | always_hard_cons = defs.always_hard_cons;
 37 | # union of alphabets.consonants without "j" and alphabets.cons_letter without <й>
 38 | all_cons = (alphabets.consonants - "j")  |
 39 |            (alphabets.cons_letter - "й")
 40 | ;
 41 | 
 42 | #----------------------------------------------------------------------------
 43 | 
 44 | # stressed diphthongs
 45 | # (a vowel letter after "+" -- presumably the stress marker -- and before <й> becomes a diphthong symbol)
 46 | stressed_pairs = ("и":"[i_i]") |
 47 |                  ("е":"[e_i]") |
 48 |                  ("э":"[e_i]") |
 49 |                  ("а":"[a_i]") |
 50 |                  ("я":"[a_i]") |
 51 |                  ("о":"[o_i]") |
 52 |                  ("ё":"[o_i]") |
 53 |                  ("у":"[u_i]") |
 54 |                  ("ю":"[u_i]")
 55 | ;
 56 | 
 57 | stress_diphthongs1 = CDRewrite[stressed_pairs,
 58 |                                "+" "j"?,             # left context: "+" and an optional "j"
 59 |                                "й",                  # right context; the <й> itself is not rewritten here
 60 |                                alphabets.sigma_star  # alphabet
 61 |                               ];
 62 | 
 63 | # only for stressed <ый>, for example <выйдя>:
 64 | # 2a maps <ы> between "+" and <й> to [i_x]; 2b then maps the <й> that follows [i_x] to I
 65 | stress_diphthongs2a = CDRewrite[("ы":"[i_x]"),
 66 |                                 "+",
 67 |                                 "й",
 68 |                                 alphabets.sigma_star
 69 |                                ];
 70 | stress_diphthongs2b = CDRewrite[("й":"I"),
 71 |                                 "[i_x]",
 72 |                                 "",
 73 |                                 alphabets.sigma_star
 74 |                                ];
 75 | 
 76 | stress_diphthongs = Optimize[stress_diphthongs1  @
 77 |                              stress_diphthongs2a @
 78 |                              stress_diphthongs2b
 79 |                             ];
 80 | 
 81 | #----------------------------------------------------------------------------
 82 | 
 83 | # unstressed diphthongs
 84 | # Each rule rewrites a vowel letter that stands before <й>; the rules are
 85 | # composed in the order 6a, 6b, 7, 8, 9, 10, 11 (see reduce_diphthongs below).
 86 | diphthongs6a = CDRewrite[("и":"[I_i]") | ("е":"[I_i]"),
 87 |                          soft_cons,
 88 |                          "й",
 89 |                          alphabets.sigma_star
 90 |                         ];
 91 | 
 92 | diphthongs6b = CDRewrite[("и":"[@_i]") | ("е":"[@_i]"),
 93 |                          always_hard_cons,
 94 |                          "й" RSEP_ALL,
 95 |                          alphabets.sigma_star
 96 |                         ];
 97 | 
 98 | diphthongs7 = CDRewrite[("ы":"[@_r_i]"),
 99 |                         hard_cons,
100 |                         "й",
101 |                         alphabets.sigma_star
102 |                        ];
103 | 
104 | # <а о я ё> before <й>, preceded by a vowel or LSEP
105 | diphthongs8 = CDRewrite[("а":"[@_o_i]") |
106 |                         ("о":"[@_o_i]") |
107 |                         ("я":"[@_o_i]") |
108 |                         ("ё":"[@_o_i]"),
109 |                         vowel | LSEP,
110 |                         "й",
111 |                         alphabets.sigma_star
112 |                        ];
113 | 
114 | # <а о я ё> after cons_letter_hyphen, when <й> is followed by stress_minus_1 or RSEP
115 | diphthongs9 = CDRewrite[("а":"[@_o_i]") |
116 |                         ("о":"[@_o_i]") |
117 |                         ("я":"[@_o_i]") |
118 |                         ("ё":"[@_o_i]"),
119 |                         cons_letter_hyphen,
120 |                         "й" (stress_minus_1 | RSEP),
121 |                         alphabets.sigma_star
122 |                        ];
123 | # remaining <а о я ё> before <й>: no left-context restriction
124 | diphthongs10 = CDRewrite[("а":"[@_i]") |
125 |                          ("о":"[@_i]") |
126 |                          ("я":"[@_i]") |
127 |                          ("ё":"[@_i]"),
128 |                          "",
129 |                          "й",
130 |                          alphabets.sigma_star
131 |                         ];
132 | 
133 | diphthongs11 = CDRewrite[("у":"[U_i]") | ("ю":"[U_i]"),
134 |                          all | LSEP,
135 |                          "й",
136 |                          alphabets.sigma_star
137 |                         ];
138 | 
139 | 
140 | reduce_diphthongs = Optimize[diphthongs6a @
141 |                              diphthongs6b @
142 |                              diphthongs7  @
143 |                              diphthongs8  @
144 |                              diphthongs9  @
145 |                              diphthongs10 @
146 |                              diphthongs11
147 |                             ];
148 | 
149 | #----------------------------------------------------------------------------
150 | 
151 | # handling of <й> in other cases:
152 | 
153 | # 1. stand-alone --> lexicon entry (<и краткое>)
154 | # 2. after consonant --> [j]
155 | # 3. at BOW (beginning of word) --> [j]
156 | # 4. in all other cases --> DEL
157 | # (the rule for cases 2 and 3 also requires a following vowel, optionally preceded by "+")
158 | i_kratkoye1 = CDRewrite[("й":"j"),
159 |                         (LSEP_ALL | all_cons),
160 |                         "+"? vowel,
161 |                         alphabets.sigma_star
162 |                        ];
163 | # <й> with a separator or consonant on both sides is rewritten to <и>;
164 | # this rule could also catch stand-alone <й>, but that case is already
165 | # handled by the lexicon lookup
166 | i_kratkoye2 = CDRewrite[("й":"и")*,
167 |                         (LSEP_ALL | all_cons),
168 |                         (RSEP_ALL | all_cons),
169 |                         alphabets.sigma_star
170 |                        ];
171 | # catch-all for case 4: delete any <й> still present after the two rules above
172 | i_kratkoye3 = CDRewrite[("й":"")*,
173 |                         "",
174 |                         "",
175 |                         alphabets.sigma_star
176 |                        ];
177 | 
178 | i_kratkoye = Optimize[i_kratkoye1 @
179 |                       i_kratkoye2 @
180 |                       i_kratkoye3
181 |                      ];
182 | 
183 | #----------------------------------------------------------------------------
184 | # exported rule: stressed diphthongs, then unstressed diphthongs, then the residual <й> handling
185 | export diphthongs = Optimize[stress_diphthongs @
186 |                              reduce_diphthongs @
187 |                              i_kratkoye
188 |                             ];
189 | 
190 | 


--------------------------------------------------------------------------------
/src/3rdparty/utf8/unchecked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | 
 33 | namespace utf8
 34 | {
 35 |     namespace unchecked 
 36 |     {   // None of the functions in this namespace validate their input or check iterator bounds.
 37 |         template <typename octet_iterator>
 38 |         octet_iterator append(uint32_t cp, octet_iterator result)
 39 |         {   // Writes cp as 1-4 UTF-8 octets through result and returns the advanced output iterator.
 40 |             if (cp < 0x80)                        // one octet
 41 |                 *(result++) = static_cast<uint8_t>(cp);  
 42 |             else if (cp < 0x800) {                // two octets
 43 |                 *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
 44 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 45 |             }
 46 |             else if (cp < 0x10000) {              // three octets
 47 |                 *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
 48 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 49 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 50 |             }
 51 |             else {                                // four octets
 52 |                 *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
 53 |                 *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
 54 |                 *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
 55 |                 *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
 56 |             }
 57 |             return result;
 58 |         }
 59 |         // Decodes the sequence starting at it, returns its code point and leaves it just past that sequence.
 60 |         template <typename octet_iterator>
 61 |         uint32_t next(octet_iterator& it)
 62 |         {
 63 |             uint32_t cp = utf8::internal::mask8(*it);
 64 |             typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
 65 |             switch (length) {   // no default case: for any other length cp stays the first octet
 66 |                 case 1:
 67 |                     break;
 68 |                 case 2:
 69 |                     it++;
 70 |                     cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
 71 |                     break;
 72 |                 case 3:
 73 |                     ++it; 
 74 |                     cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
 75 |                     ++it;
 76 |                     cp += (*it) & 0x3f;
 77 |                     break;
 78 |                 case 4:
 79 |                     ++it;
 80 |                     cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);                
 81 |                     ++it;
 82 |                     cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
 83 |                     ++it;
 84 |                     cp += (*it) & 0x3f; 
 85 |                     break;
 86 |             }
 87 |             ++it;   // step past the last octet that was read
 88 |             return cp;        
 89 |         }
 90 |         // Same decoding as next(), but it is taken by value, so the caller's iterator is not moved.
 91 |         template <typename octet_iterator>
 92 |         uint32_t peek_next(octet_iterator it)
 93 |         {
 94 |             return utf8::unchecked::next(it);    
 95 |         }
 96 |         // Moves it backwards to the closest octet for which is_trail() is false and returns the code point decoded there.
 97 |         template <typename octet_iterator>
 98 |         uint32_t prior(octet_iterator& it)
 99 |         {
100 |             while (utf8::internal::is_trail(*(--it))) ;
101 |             octet_iterator temp = it;   // decode from a copy so it stays at the start of the sequence
102 |             return utf8::unchecked::next(temp);
103 |         }
104 | 
105 |         // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
106 |         template <typename octet_iterator>
107 |         inline uint32_t previous(octet_iterator& it)
108 |         {
109 |             return utf8::unchecked::prior(it);   // plain forwarder to prior()
110 |         }
111 |         // Calls next(it) n times; when n <= 0 the loop body never runs and it is left unchanged.
112 |         template <typename octet_iterator, typename distance_type>
113 |         void advance (octet_iterator& it, distance_type n)
114 |         {
115 |             for (distance_type i = 0; i < n; ++i)
116 |                 utf8::unchecked::next(it);
117 |         }
118 |         // Counts the next() calls needed until first < last becomes false (requires iterators comparable with <).
119 |         template <typename octet_iterator>
120 |         typename std::iterator_traits<octet_iterator>::difference_type
121 |         distance (octet_iterator first, octet_iterator last)
122 |         {
123 |             typename std::iterator_traits<octet_iterator>::difference_type dist;
124 |             for (dist = 0; first < last; ++dist) 
125 |                 utf8::unchecked::next(first);
126 |             return dist;
127 |         }
128 |         // Converts the UTF-16 range [start, end) to UTF-8 written through result; returns the advanced output iterator.
129 |         template <typename u16bit_iterator, typename octet_iterator>
130 |         octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
131 |         {       
132 |             while (start != end) {
133 |                 uint32_t cp = utf8::internal::mask16(*start++);
134 |             // Take care of surrogate pairs first
135 |                 if (utf8::internal::is_lead_surrogate(cp)) {
136 |                     uint32_t trail_surrogate = utf8::internal::mask16(*start++);   // read without comparing start to end
137 |                     cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
138 |                 }
139 |                 result = utf8::unchecked::append(cp, result);
140 |             }
141 |             return result;         
142 |         }
143 |         // Converts the UTF-8 range [start, end) to UTF-16 units written through result; returns the advanced output iterator.
144 |         template <typename u16bit_iterator, typename octet_iterator>
145 |         u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
146 |         {
147 |             while (start < end) {
148 |                 uint32_t cp = utf8::unchecked::next(start);
149 |                 if (cp > 0xffff) { //make a surrogate pair
150 |                     *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
151 |                     *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
152 |                 }
153 |                 else
154 |                     *result++ = static_cast<uint16_t>(cp);
155 |             }
156 |             return result;
157 |         }
158 |         // Appends every code point of [start, end) as UTF-8 through result; returns the advanced output iterator.
159 |         template <typename octet_iterator, typename u32bit_iterator>
160 |         octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
161 |         {
162 |             while (start != end)
163 |                 result = utf8::unchecked::append(*(start++), result);
164 | 
165 |             return result;
166 |         }
167 |         // Decodes the UTF-8 range [start, end) and writes one code point per element through result.
168 |         template <typename octet_iterator, typename u32bit_iterator>
169 |         u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
170 |         {
171 |             while (start < end)
172 |                 (*result++) = utf8::unchecked::next(start);
173 | 
174 |             return result;
175 |         }
176 | 
177 |         // The iterator class: wraps an octet iterator and moves over it one UTF-8 sequence at a time
178 |         template <typename octet_iterator>
179 |           class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {   // NOTE(review): std::iterator is deprecated since C++17
180 |             octet_iterator it;
181 |             public:
182 |             iterator () {}   // it is default-initialized
183 |             explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
184 |             // the default "big three" are OK
185 |             octet_iterator base () const { return it; }   // copy of the wrapped octet iterator
186 |             uint32_t operator * () const
187 |             {
188 |                 octet_iterator temp = it;   // decode from a copy so dereferencing does not move the iterator
189 |                 return utf8::unchecked::next(temp);
190 |             }
191 |             bool operator == (const iterator& rhs) const 
192 |             { 
193 |                 return (it == rhs.it);
194 |             }
195 |             bool operator != (const iterator& rhs) const
196 |             {
197 |                 return !(operator == (rhs));
198 |             }
199 |             iterator& operator ++ () 
200 |             {
201 |                 ::std::advance(it, utf8::internal::sequence_length(it));   // skip the whole sequence without decoding it
202 |                 return *this;
203 |             }
204 |             iterator operator ++ (int)
205 |             {
206 |                 iterator temp = *this;
207 |                 ::std::advance(it, utf8::internal::sequence_length(it));
208 |                 return temp;
209 |             }  
210 |             iterator& operator -- ()
211 |             {
212 |                 utf8::unchecked::prior(it);   // return value (the code point) is discarded
213 |                 return *this;
214 |             }
215 |             iterator operator -- (int)
216 |             {
217 |                 iterator temp = *this;
218 |                 utf8::unchecked::prior(it);
219 |                 return temp;
220 |             }
221 |           }; // class iterator
222 | 
223 |     } // namespace utf8::unchecked
224 | } // namespace utf8 
225 | 
226 | 
227 | #endif // header guard
228 | 
229 | 


--------------------------------------------------------------------------------
/grammars/syllabification.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | import 'alphabets.grm' as alphabets;
 21 | 
 22 | # definitions used in the syllabification rules
 23 | # (phone classes assembled from the sets defined in alphabets.grm)
 24 | stops = alphabets.plosives | alphabets.affricates;
 25 | 
 26 | liquids_glides = alphabets.liquids | alphabets.approximant;
 27 | 
 28 | consonantal = alphabets.consonants;
 29 | 
 30 | vocalic = alphabets.nuclei | liquids_glides;
 31 | 
 32 | sonorant = vocalic | alphabets.nasals;
 33 | 
 34 | continuant = (alphabets.phone - stops);  # every phone that is not in stops
 35 | 
 36 | coronal = alphabets.liquids            |
 37 |           alphabets.coronal_nasals     |
 38 |           alphabets.coronal_fricatives |
 39 |           alphabets.coronal_plosives   |
 40 |           alphabets.affricates
 41 | ;
 42 | 
 43 | anterior = alphabets.laterals            |
 44 |            alphabets.nasals              |
 45 |            alphabets.anterior_plosives   |
 46 |            alphabets.anterior_fricatives |
 47 |            alphabets.anterior_affricates
 48 | ;
 49 | 
 50 | lateral = alphabets.laterals;
 51 | 
 52 | nasal = alphabets.nasals;
 53 | 
 54 | delayed_release = alphabets.affricates;  # same set as alphabets.affricates
 55 | 
 56 | #-------------------------------------------------------------
 57 | 
 58 | # RULES
 59 | # In the patterns below, A @ B composes two classes (for acceptors: what both accept) and A - B removes B from A.
 60 | SEP = alphabets.word_sep | "[BOS]" | "[SIL]";
 61 | 
 62 | # 1-consonant onset
 63 | 
 64 | onset1 = consonantal;
 65 | 
 66 | # 2-consonants onsets
 67 | # (a pattern prefixed with SEP matches only right after a separator; SEP is part of the matched string)
 68 | 
 69 | onset_33  = (consonantal - vocalic - delayed_release)
 70 |             (consonantal @ vocalic)
 71 | ;
 72 | 
 73 | onset_36  = ((consonantal @ continuant @ coronal @ anterior) - sonorant)
 74 |             (consonantal - continuant)
 75 | ;
 76 | 
 77 | onset_37  = SEP ((consonantal @ continuant @ coronal) - sonorant - anterior)
 78 |             (consonantal - continuant - delayed_release)
 79 | ;
 80 | 
 81 | onset_38a = SEP ((consonantal @ continuant @ anterior) - sonorant)
 82 |             consonantal
 83 | ;
 84 | 
 85 | onset_38b = ((consonantal @ continuant @ anterior) - sonorant)
 86 |             (consonantal @ sonorant)
 87 | ;
 88 | 
 89 | onset_40a = SEP (consonantal - sonorant - delayed_release)
 90 |             ((consonantal @ sonorant @ nasal) - vocalic)
 91 | ;
 92 | 
 93 | onset_40b = (consonantal - sonorant)
 94 |             ((consonantal @ sonorant) - nasal)
 95 | ;
 96 | 
 97 | onset2 = Optimize[onset_33  | onset_36  | onset_37  |
 98 |                   onset_38a | onset_38b | onset_40a |
 99 |                   onset_40b
100 |                  ];
101 | 
102 | # 3-consonants onsets
103 | # (each pattern is a concatenation of three consonant classes; onset_42 additionally requires a leading SEP)
104 | onset_42 = SEP ((consonantal @ continuant @ anterior) - sonorant - coronal)
105 |                ((consonantal @ continuant @ coronal @ anterior) - sonorant)
106 |                (consonantal @ sonorant)
107 | ;
108 | 
109 | onset_43 = ((consonantal @ continuant @ anterior) - sonorant - coronal)
110 |            ((consonantal @ continuant @ coronal @ anterior) - sonorant)
111 |            (consonantal - continuant - delayed_release)
112 | ;
113 | 
114 | onset_45 = ((consonantal @ continuant @ anterior) - sonorant)
115 |            (consonantal - continuant - delayed_release)
116 |            ((consonantal @ sonorant) - nasal)
117 | ;
118 | # a stop followed by two r-phones ("r" or "[rJ]" in each position)
119 | onset_double_r = stops
120 |                  ("r" | "[rJ]")
121 |                  ("r" | "[rJ]")
122 | ;
123 | 
124 | onset3 = Optimize[onset_42 | onset_43 | onset_45 | onset_double_r];
125 | 
126 | # 4-consonants onsets
127 | # (only one pattern, and it requires a leading SEP)
128 | onset_47 = SEP ((consonantal @ continuant @ anterior) - sonorant - coronal)
129 |            ((consonantal @ continuant @ coronal @ anterior) - sonorant)
130 |            (consonantal - continuant - delayed_release)
131 |            (consonantal @ vocalic)
132 | ;
133 | 
134 | onset4 = Optimize[onset_47];
135 | 
136 | # onset exceptions
137 | # (literal phone clusters, each allowed only right after SEP)
138 | onset_except_1 = SEP "t" "[SJ]" "t";
139 | onset_except_2 = SEP "[lJ]" "d";
140 | onset_except_3 = SEP "g" "[dJ]";
141 | onset_except_4 = SEP "k" "t";
142 | onset_except_5 = SEP "l" "b";
143 | onset_except_6 = SEP "m" "[t_SJ]";
144 | onset_except_7 = SEP "m" "[nJ]";
145 | onset_except_8 = SEP "m" "n";
146 | onset_except_9 = SEP "p" "[tJ]";
147 | onset_except_10 = SEP "r" "t";
148 | onset_except_11 = SEP "s" "x" "v";
149 | onset_except_12 = SEP "[t_S]" "[t_SJ]";
150 | onset_except_13 = SEP "S" "[t_SJ]";
151 | 
152 | onset_except = Optimize[onset_except_1  |
153 |                         onset_except_2  |
154 |                         onset_except_3  |
155 |                         onset_except_4  |
156 |                         onset_except_5  |
157 |                         onset_except_6  |
158 |                         onset_except_7  |
159 |                         onset_except_8  |
160 |                         onset_except_9  |
161 |                         onset_except_10 |
162 |                         onset_except_11 |
163 |                         onset_except_12 |
164 |                         onset_except_13
165 |                        ];
166 | # full onset inventory: 1- to 4-consonant patterns plus the listed exceptions
167 | onset = Optimize[onset1 | onset2 | onset3 | onset4 | onset_except];
168 | 
169 | # codas
170 | 
171 | coda1 = consonantal;
172 | # 2-consonant coda patterns (A @ B: composition of two classes, A - B: difference)
173 | coda_48 = (consonantal @ vocalic)
174 |           (consonantal - vocalic)
175 | ;
176 | 
177 | coda_49 = (consonantal - vocalic - delayed_release)
178 |           ((consonantal @ coronal @ anterior) - continuant - delayed_release)
179 | ;
180 | # NOTE(review): coda_50a is not part of the `coda` union defined further down -- confirm this is intended
181 | coda_50a = ((consonantal @ sonorant @ nasal) - vocalic)
182 |            ((consonantal @ continuant @ coronal) - sonorant)
183 | ;
184 | 
185 | coda_50b = ((consonantal @ sonorant @ nasal) - vocalic)
186 |            (consonantal - continuant)
187 | ;
188 | 
189 | coda_x1 = (consonantal - continuant - coronal)
190 |           ((consonantal @ continuant @ coronal @ anterior) - sonorant)
191 | ;
192 | 
193 | coda_x2 = ((consonantal @ continuant @ coronal @ anterior) - sonorant)
194 |           (consonantal - continuant - coronal - anterior)
195 | ;
196 | 
197 | coda_x3 = ((consonantal @ continuant @ coronal) - sonorant - anterior)
198 |           ((consonantal @ coronal) - continuant - anterior)
199 | ;
200 | 
201 | coda_x4 = ((consonantal @ continuant @ coronal @ anterior) - sonorant)
202 |           ((consonantal @ anterior) - vocalic)
203 | ;
204 | 
205 | coda_x5 = (consonantal - continuant - delayed_release)
206 |           (consonantal @ vocalic @ coronal)
207 | ;
208 | 
209 | # coda exceptions
210 | # (literal phone clusters accepted as codas in addition to the class-based patterns)
211 | coda_except1 ="s" "[lJ]";
212 | coda_except2 = "t" "[vJ]";
213 | coda_except3 = "Z" "b";
214 | coda_except4 = "l" "n" "t" "s";
215 | coda_except5 = "l" "l";
216 | coda_except6 = "r" "l";
217 | coda_except7 = "s" "l";
218 | coda_except8 = "m" "m";
219 | coda_except9 = "n" "n";
220 | coda_except10 = "p" "p";
221 | coda_except11 = "s" "[tJ]" "r";
222 | coda_except12 ="f" "r";
223 | coda_except13 = "n" "t" "r";
224 | coda_except14 = "s" "t" "r";
225 | coda_except15 = "n" "k" "t";
226 | coda_except16 = "[t_SJ]" "v";
227 | coda_except17 = "k" "v";
228 | coda_except18 = "s" "t" "v";
229 | 
230 | # coda_except19 = ("j" | "[lJ]" | "d" | "n" | "r" | "s" | "t" |"v") "s" "t" "v";
231 | 
232 | coda_except = Optimize[coda_except1  |
233 |                        coda_except2  |
234 |                        coda_except3  |
235 |                        coda_except4  |
236 |                        coda_except5  |
237 |                        coda_except6  |
238 |                        coda_except7  |
239 |                        coda_except8  |
240 |                        coda_except9  |
241 |                        coda_except10 |
242 |                        coda_except11 |
243 |                        coda_except12 |
244 |                        coda_except13 |
245 |                        coda_except14 |
246 |                        coda_except15 |
247 |                        coda_except16 |
248 |                        coda_except17 |
249 |                        coda_except18
250 |                       ];
251 | # NOTE(review): coda_50a (defined above) is not included in this union -- confirm this is intended
252 | coda = Optimize[coda1    | coda_48 | coda_49  |
253 |                 coda_50b | coda_x1 | coda_x2  |
254 |                 coda_x3  | coda_x4 | coda_x5  |
255 |                 coda_except
256 |                ];
257 | 
258 | # syllable definitions
259 | # (a nucleus with an optional onset before it and an optional coda after it)
260 | syllable1 = onset alphabets.nuclei coda;
261 | syllable2 = onset alphabets.nuclei;
262 | syllable3 = alphabets.nuclei coda;
263 | syllable4 = alphabets.nuclei;
264 | 
265 | syllable = syllable1 | syllable2 | syllable3 | syllable4;
266 | # alphabet for the rewrite rules; NOTE(review): "[BOS]" occurs in SEP but is not listed here -- confirm
267 | rewrite_alpha = (alphabets.phone | "-" | " " |
268 |                  "[SIL]" | "[ERROR]")*;
269 | # insert "-" between a syllable and a following "j"
270 | syllabified0 = CDRewrite["":"-",
271 |                          syllable,        # left context
272 |                          "j",             # right context
273 |                          rewrite_alpha    # alphabet
274 |                         ];
275 | # insert "-" between two adjacent syllables
276 | syllabified1 = CDRewrite["":"-",
277 |                          syllable,     # left context
278 |                          syllable,     # right context
279 |                          rewrite_alpha # alphabet
280 |                         ];
281 | 
282 | # residual rule (trying to catch unsyllabified sequences)
283 | syllabified2 = CDRewrite["":"-",
284 |                          onset? alphabets.nuclei coda,  # left context
285 |                          consonantal* alphabets.nuclei, # right context
286 |                          rewrite_alpha                  # alphabet
287 |                         ];
288 | # delete a "-" that stands directly before SEP
289 | trailing_syll_removal = CDRewrite["-":"",
290 |                                   "",
291 |                                   SEP,
292 |                                   rewrite_alpha
293 |                                  ];
294 | # exported rule: the four rewrites above, composed in this order
295 | export syllabified = Optimize[syllabified0          @
296 |                               syllabified1          @
297 |                               syllabified2          @
298 |                               trailing_syll_removal
299 |                              ];
300 | 


--------------------------------------------------------------------------------
/grammars/vowels.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | import 'alphabets.grm' as alphabets;
 21 | import 'definitions.grm' as defs;
 22 | 
 23 | # set/class definitions
 24 | # (local separator classes plus aliases for classes from definitions.grm)
 25 | WSEP = "[WUD]"? alphabets.word_sep;  # word separator, optionally preceded by "[WUD]"
 26 | LSEP = WSEP | "-";                   # WSEP or a hyphen
 27 | RSEP = defs.RSEP;
 28 | EOS = "[EOS]" | "[SIL]";
 29 | BOS = "[BOS]" | "[SIL]";
 30 | vowel = defs.vowel;
 31 | cons_letter_hyphen = defs.cons_letter_hyphen;
 32 | stress_minus_1 = defs.stress_minus_1;
 33 | before_stress = defs.before_stress;
 34 | all = defs.all;
 35 | soft_cons = defs.soft_cons;
 36 | hard_cons = defs.hard_cons;
 37 | 
 38 | # orthographic and phonetic [ao] vowels
 39 | a_o_vowels =("а" | "a" | "о" | "o");
 40 | 
 41 | #----------------------------------------------------------------------------
 42 | 
 43 | # stressed vowels
 44 | 
 45 | # special case <ё>: always stressed (no stress marker present)
 46 | stress_yo = CDRewrite[("ё":"o"),
 47 |                       "",   # no left-context restriction
 48 |                       "",   # no right-context restriction
 49 |                       alphabets.sigma_star
 50 |                      ];
 51 | # vowel letter -> stressed vowel symbol; applied by stress_vowel below
 52 | stressed_pairs = (("я":"a")     |
 53 |                   ("ю":"u")     |
 54 |                   ("и":"i")     |
 55 |                   ("ы":"[i_x]") |
 56 |                   ("е":"e")     |
 57 |                   ("а":"a")     |
 58 |                   ("у":"u")     |
 59 |                   ("о":"o")
 60 |                  );
 61 | 
 62 | stress_vowel  = CDRewrite[stressed_pairs,
 63 |                           "+" "j"?,             # left context: "+" and an optional "j"
 64 |                           "",                   # right context: unrestricted
 65 |                           alphabets.sigma_star  # alphabet
 66 |                          ];
 67 | 
 68 | # letter <э> will be mapped later, since its phonetic representation [e]
 69 | # is in conflict with the one for the letter <е>, which does trigger
 70 | # palatalization of the previous consonant. We need this information in
 71 | # vowel reduction rules below
 72 | 
 73 | stress_vowels = Optimize[stress_yo    @
 74 |                          stress_vowel
 75 |                         ];
 76 | 
 77 | #----------------------------------------------------------------------------
 78 | 
 79 | # reduction of unstressed vowels
 80 | 
 81 | 
 82 | # first level of reduction
 83 | 
 84 | reduction1 = CDRewrite["э":"E",
 85 |                        (BOS | vowel),
 86 |                        "",
 87 |                        alphabets.sigma_star
 88 |                       ];
 89 | 
 90 | I_reduction_letters1 = ("э":"I") | ("е":"I");
 91 | 
 92 | I_reduction_letters2 = ("я":"I");
 93 | 
 94 | I_reduction_letters3 = ("а":"I");
 95 | 
 96 | I_reduction_left_context = vowel | BOS | soft_cons | (soft_cons WSEP);
 97 | 
 98 | I_reduction_right_context = stress_minus_1 | EOS;
 99 | 
100 | 
101 | reduction2a = CDRewrite[I_reduction_letters1,
102 |                         I_reduction_left_context,
103 |                         "",
104 |                         alphabets.sigma_star
105 |                        ];
106 | 
107 | reduction2b = CDRewrite[I_reduction_letters1,
108 |                         "",
109 |                         I_reduction_right_context,
110 |                         alphabets.sigma_star
111 |                        ];
112 | 
113 | 
114 | reduction2c = CDRewrite[I_reduction_letters2,
115 |                         I_reduction_left_context,
116 |                         before_stress,
117 |                         alphabets.sigma_star
118 |                        ];
119 | 
120 | reduction2d = CDRewrite[I_reduction_letters3,
121 |                         defs.always_soft_cons,
122 |                         before_stress,
123 |                         alphabets.sigma_star
124 |                        ];
125 | 
126 | 
127 | schwa_r_reduction_letter1 = ("ы":"[@_r]");
128 | 
129 | schwa_r_reduction_letter2 = ("э":"[@_r]");
130 | 
131 | schwa_r_reduction_letter3 = (("и":"[@_r]") | ("е":"[@_r]"));
132 | 
133 | 
134 | # <ы> is always reduced to [@_r] when unstressed
135 | reduction3a = CDRewrite[schwa_r_reduction_letter1,
136 |                         all | WSEP | BOS,
137 |                         "",
138 |                         alphabets.sigma_star
139 |                        ];
140 | 
141 | reduction3b = CDRewrite[schwa_r_reduction_letter2,
142 |                         hard_cons |
143 |                         (hard_cons WSEP),
144 |                         "",
145 |                         alphabets.sigma_star
146 |                        ];
147 | 
148 | reduction3c = CDRewrite[schwa_r_reduction_letter3,
149 |                         defs.always_hard_cons,
150 |                         "",
151 |                         alphabets.sigma_star
152 |                        ];
153 | 
154 | # <и> is always reduced to [I] when unstressed, except in the cases
155 | # where it should become [@_r] (done before)
156 | reduction3d = CDRewrite[("и":"I"),
157 |                         all | WSEP | BOS,
158 |                         "",
159 |                         alphabets.sigma_star
160 |                        ];
161 | 
162 | schwa_o_reduction_letters = ("а":"[@_o]") | ("о":"[@_o]");
163 | 
164 | # always unstressed (otherwise the immediate l_context would be "+")
165 | schwa_o_reduction_l_context = (BOS | WSEP | a_o_vowels);
166 | 
167 | schwa_o_reduction_r_context = stress_minus_1 |
168 |                               EOS            |
169 |                               a_o_vowels
170 | ;
171 | 
172 | # left context
173 | reduction4 = CDRewrite[schwa_o_reduction_letters,
174 |                        schwa_o_reduction_l_context,
175 |                        "",
176 |                        alphabets.sigma_star
177 |                       ];
178 | 
179 | # right context
180 | reduction5 = CDRewrite[schwa_o_reduction_letters,
181 |                        all,
182 |                        schwa_o_reduction_r_context,
183 |                        alphabets.sigma_star
184 |                       ];
185 | 
186 | U_reduction_letters = ("ю":"U") | ("у":"U");
187 | 
188 | # left context
189 | reduction6a = CDRewrite[U_reduction_letters,
190 |                        vowel | BOS,
191 |                        "",
192 |                        alphabets.sigma_star
193 |                       ];
194 | 
195 | # right context
196 | reduction6b = CDRewrite[U_reduction_letters,
197 |                        "",
198 |                        stress_minus_1 | EOS,
199 |                        alphabets.sigma_star
200 |                       ];
201 | 
202 | first_level_reduction = Optimize[reduction1  @
203 |                                  reduction2a @
204 |                                  reduction2b @
205 |                                  reduction2c @
206 |                                  reduction2d @
207 |                                  reduction3a @
208 |                                  reduction3b @
209 |                                  reduction3c @
210 |                                  reduction3d @
211 |                                  reduction4  @
212 |                                  reduction5  @
213 |                                  reduction6a @
214 |                                  reduction6b
215 |                                 ];
216 | 
217 | #----------------------------------------------------------------------------
218 | 
219 | # second level of reduction
220 | 
221 | schwa_reduction_pairs = Optimize[("е":"@") |
222 |                                  ("я":"@") |
223 |                                  ("а":"@") |
224 |                                  ("о":"@")
225 |                                 ];
226 | 
227 | second_reduction_l_context = Optimize[cons_letter_hyphen | LSEP];
228 | 
229 | second_reduction_r_context = Optimize[cons_letter_hyphen*
230 |                                       (vowel | RSEP | EOS)
231 |                                      ];
232 | 
233 | reduction7 = CDRewrite[schwa_reduction_pairs,
234 |                        second_reduction_l_context,
235 |                        second_reduction_r_context,
236 |                        alphabets.sigma_star
237 |                       ];
238 | 
239 | U_x_reduction_letters = ("ю":"[U_x]") | ("у":"[U_x]");
240 | 
241 | reduction8 = CDRewrite[U_x_reduction_letters,
242 |                        second_reduction_l_context,
243 |                        second_reduction_r_context,
244 |                        alphabets.sigma_star
245 |                       ];
246 | 
247 | second_level_reduction = Optimize[reduction7 @
248 |                                   reduction8
249 |                                  ];
250 | 
251 | #----------------------------------------------------------------------------
252 | 
253 | # exceptions
254 | 
255 | # after <ц>, at the second degree of reduction, we should rather
256 | # have [@], not [@_r]
257 | 
258 | # post-tonic unstressed <е> --> [@] after <ц>
259 | 
260 | # Ex. <сердце>
261 | 
262 | exception1= CDRewrite[("[@_r]":"@"),
263 |                       "ц",
264 |                       (cons_letter_hyphen | vowel)* (EOS | WSEP),
265 |                       alphabets.sigma_star
266 |                      ];
267 | 
268 | exceptions = Optimize[exception1];
269 | 
270 | #----------------------------------------------------------------------------
271 | 
272 | # reduce vowels after other vowels that were not reduced so far
273 | 
274 | V_V_reduction_pairs = ("э":"E")     | ("и":"I")     |
275 |                       ("е":"I")     | ("я":"I")     |
276 |                       ("ы":"[@_r]") | ("а":"[@_o]") |
277 |                       ("о":"[@_o]") | ("ю":"U")     |
278 |                       ("у":"U")
279 | ;
280 | 
281 | reduce_V_V_vowels = CDRewrite[V_V_reduction_pairs,
282 |                               vowel "-"?,
283 |                               "",
284 |                               alphabets.sigma_star
285 |                              ];
286 | 
287 | #----------------------------------------------------------------------------
288 | 
289 | # cleaning
290 | 
291 | stress_non_soft_e = CDRewrite[("э":"e"),
292 |                               "+",
293 |                               "",
294 |                               alphabets.sigma_star
295 |                              ];
296 | 
297 | # needed for foreign words --> TO BE CHECKED
298 | transcribe_remaining_eps = CDRewrite[("э":"E"),
299 |                                       "",
300 |                                       "",
301 |                                       alphabets.sigma_star
302 |                                     ];
303 | 
304 | 
305 | clean_hyphen = CDRewrite["-":"",
306 |                          "",
307 |                          "",
308 |                          alphabets.sigma_star
309 |                         ];
310 | 
311 | clean_stress_marker = CDRewrite["+":"",
312 |                                 "",
313 |                                 "",
314 |                                 alphabets.sigma_star
315 |                                ];
316 | 
317 | cleaning = Optimize[stress_non_soft_e        @
318 |                     transcribe_remaining_eps @
319 |                     clean_hyphen             @
320 |                     clean_stress_marker
321 |                    ];
322 | 
323 | #----------------------------------------------------------------------------
324 | 
325 | export reduced = Optimize[stress_vowels          @
326 |                           first_level_reduction  @
327 |                           second_level_reduction @
328 |                           exceptions             @
329 |                           reduce_V_V_vowels      @
330 |                           cleaning];
331 | 


--------------------------------------------------------------------------------
/src/3rdparty/utf8/core.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include <iterator>
 32 | 
 33 | namespace utf8
 34 | {
 35 |     // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
 36 |     // You may need to change them to match your system.
 37 |     // These typedefs have the same names as ones from cstdint, or boost/cstdint
 38 |     typedef unsigned char   uint8_t;
 39 |     typedef unsigned short  uint16_t;
 40 |     typedef unsigned int    uint32_t;
 41 | 
 42 | // Helper code - not intended to be directly called by the library users. May be changed at any time
 43 | namespace internal
 44 | {
 45 |     // Unicode constants
 46 |     // Leading (high) surrogates: 0xd800 - 0xdbff
 47 |     // Trailing (low) surrogates: 0xdc00 - 0xdfff
 48 |     const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
 49 |     const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
 50 |     const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
 51 |     const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
 52 |     const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
 53 |     const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
 54 | 
 55 |     // Maximum valid value for a Unicode code point
 56 |     const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
 57 | 
 58 |     template<typename octet_type>
 59 |     inline uint8_t mask8(octet_type oc)
 60 |     {
 61 |         return static_cast<uint8_t>(0xff & oc);
 62 |     }
 63 |     template<typename u16_type>
 64 |     inline uint16_t mask16(u16_type oc)
 65 |     {
 66 |         return static_cast<uint16_t>(0xffff & oc);
 67 |     }
 68 |     template<typename octet_type>
 69 |     inline bool is_trail(octet_type oc)
 70 |     {
 71 |         return ((utf8::internal::mask8(oc) >> 6) == 0x2);
 72 |     }
 73 | 
 74 |     template <typename u16>
 75 |     inline bool is_lead_surrogate(u16 cp)
 76 |     {
 77 |         return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
 78 |     }
 79 | 
 80 |     template <typename u16>
 81 |     inline bool is_trail_surrogate(u16 cp)
 82 |     {
 83 |         return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 84 |     }
 85 | 
 86 |     template <typename u16>
 87 |     inline bool is_surrogate(u16 cp)
 88 |     {
 89 |         return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
 90 |     }
 91 | 
 92 |     template <typename u32>
 93 |     inline bool is_code_point_valid(u32 cp)
 94 |     {
 95 |         return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
 96 |     }
 97 | 
 98 |     template <typename octet_iterator>
 99 |     inline typename std::iterator_traits<octet_iterator>::difference_type
100 |     sequence_length(octet_iterator lead_it)
101 |     {
102 |         uint8_t lead = utf8::internal::mask8(*lead_it);
103 |         if (lead < 0x80)
104 |             return 1;
105 |         else if ((lead >> 5) == 0x6)
106 |             return 2;
107 |         else if ((lead >> 4) == 0xe)
108 |             return 3;
109 |         else if ((lead >> 3) == 0x1e)
110 |             return 4;
111 |         else
112 |             return 0;
113 |     }
114 | 
115 |     template <typename octet_difference_type>
116 |     inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
117 |     {
118 |         if (cp < 0x80) {
119 |             if (length != 1) 
120 |                 return true;
121 |         }
122 |         else if (cp < 0x800) {
123 |             if (length != 2) 
124 |                 return true;
125 |         }
126 |         else if (cp < 0x10000) {
127 |             if (length != 3) 
128 |                 return true;
129 |         }
130 | 
131 |         return false;
132 |     }
133 | 
134 |     enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
135 | 
136 |     /// Helper for get_sequence_x
137 |     template <typename octet_iterator>
138 |     utf_error increase_safely(octet_iterator& it, octet_iterator end)
139 |     {
140 |         if (++it == end)
141 |             return NOT_ENOUGH_ROOM;
142 | 
143 |         if (!utf8::internal::is_trail(*it))
144 |             return INCOMPLETE_SEQUENCE;
145 |         
146 |         return UTF8_OK;
147 |     }
148 | 
149 |     #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
150 | 
151 |     /// get_sequence_x functions decode utf-8 sequences of the length x
152 |     template <typename octet_iterator>
153 |     utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
154 |     {
155 |         if (it == end)
156 |             return NOT_ENOUGH_ROOM;
157 | 
158 |         code_point = utf8::internal::mask8(*it);
159 | 
160 |         return UTF8_OK;
161 |     }
162 | 
163 |     template <typename octet_iterator>
164 |     utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
165 |     {
166 |         if (it == end) 
167 |             return NOT_ENOUGH_ROOM;
168 |         
169 |         code_point = utf8::internal::mask8(*it);
170 | 
171 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
172 | 
173 |         code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
174 | 
175 |         return UTF8_OK;
176 |     }
177 | 
178 |     template <typename octet_iterator>
179 |     utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
180 |     {
181 |         if (it == end)
182 |             return NOT_ENOUGH_ROOM;
183 |             
184 |         code_point = utf8::internal::mask8(*it);
185 | 
186 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
187 | 
188 |         code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
189 | 
190 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
191 | 
192 |         code_point += (*it) & 0x3f;
193 | 
194 |         return UTF8_OK;
195 |     }
196 | 
197 |     template <typename octet_iterator>
198 |     utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
199 |     {
200 |         if (it == end)
201 |            return NOT_ENOUGH_ROOM;
202 | 
203 |         code_point = utf8::internal::mask8(*it);
204 | 
205 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
206 | 
207 |         code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
208 | 
209 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
210 | 
211 |         code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
212 | 
213 |         UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
214 | 
215 |         code_point += (*it) & 0x3f;
216 | 
217 |         return UTF8_OK;
218 |     }
219 | 
220 |     #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
221 | 
222 |     template <typename octet_iterator>
223 |     utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
224 |     {
225 |         // Decodes and validates the UTF-8 sequence at `it`. On success, stores the code point
226 |         // and moves `it` past the sequence; on failure, leaves `it` where it started.
227 |         // An empty range has no lead octet: report it before `*it` is ever read.
228 |         if (it == end) return NOT_ENOUGH_ROOM;
229 |         // Remember the start so `it` can be restored on failure (moot for e.g. stream iterators)
230 |         octet_iterator original_it = it;
231 |         uint32_t cp = 0;
232 |         // The lead octet alone determines how many octets the sequence should have
233 |         typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
234 |         const octet_difference_type length = utf8::internal::sequence_length(it);
235 |         utf_error err = UTF8_OK;
236 |         switch (length) {
237 |             case 0:
238 |                 return INVALID_LEAD;
239 |             case 1:
240 |                 err = utf8::internal::get_sequence_1(it, end, cp);
241 |                 break;
242 |             case 2:
243 |                 err = utf8::internal::get_sequence_2(it, end, cp);
244 |                 break;
245 |             case 3:
246 |                 err = utf8::internal::get_sequence_3(it, end, cp);
247 |                 break;
248 |             case 4:
249 |                 err = utf8::internal::get_sequence_4(it, end, cp);
250 |                 break;
251 |         }
252 | 
253 |         if (err == UTF8_OK) {
254 |             // Decoding succeeded. Now, security checks...
255 |             if (utf8::internal::is_code_point_valid(cp)) {
256 |                 if (!utf8::internal::is_overlong_sequence(cp, length)){
257 |                     // Passed! Step past the last octet of the sequence and return here.
258 |                     code_point = cp;
259 |                     ++it;
260 |                     return UTF8_OK;
261 |                 }
262 |                 else
263 |                     err = OVERLONG_SEQUENCE;
264 |             }
265 |             else
266 |                 err = INVALID_CODE_POINT;
267 |         }
268 | 
269 |         // Failure branch - restore the original value of the iterator
270 |         it = original_it;
271 |         return err;
272 |     }
273 | 
274 |     template <typename octet_iterator>
275 |     inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
276 |         uint32_t ignored;
277 |         return utf8::internal::validate_next(it, end, ignored);
278 |     }
279 | 
280 | } // namespace internal
281 | 
282 |     /// The library API - functions intended to be called by the users
283 | 
284 |     // Byte order mark
285 |     const uint8_t bom[] = {0xef, 0xbb, 0xbf};
286 | 
287 |     template <typename octet_iterator>
288 |     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
289 |     {
290 |         octet_iterator result = start;
291 |         while (result != end) {
292 |             utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
293 |             if (err_code != internal::UTF8_OK)
294 |                 return result;
295 |         }
296 |         return result;
297 |     }
298 | 
299 |     template <typename octet_iterator>
300 |     inline bool is_valid(octet_iterator start, octet_iterator end)
301 |     {
302 |         return (utf8::find_invalid(start, end) == end);
303 |     }
304 | 
305 |     template <typename octet_iterator>
306 |     inline bool starts_with_bom (octet_iterator it, octet_iterator end)
307 |     {
308 |         return (
309 |             ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
310 |             ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
311 |             ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
312 |            );
313 |     }
314 | 	
315 |     //Deprecated in release 2.3 
316 |     template <typename octet_iterator>
317 |     inline bool is_bom (octet_iterator it)
318 |     {
319 |         return (
320 |             (utf8::internal::mask8(*it++)) == bom[0] &&
321 |             (utf8::internal::mask8(*it++)) == bom[1] &&
322 |             (utf8::internal::mask8(*it))   == bom[2]
323 |            );
324 |     }
325 | } // namespace utf8
326 | 
327 | #endif // header guard
328 | 
329 | 
330 | 


--------------------------------------------------------------------------------
/src/3rdparty/utf8/checked.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2006 Nemanja Trifunovic
  2 | 
  3 | /*
  4 | Permission is hereby granted, free of charge, to any person or organization
  5 | obtaining a copy of the software and accompanying documentation covered by
  6 | this license (the "Software") to use, reproduce, display, distribute,
  7 | execute, and transmit the Software, and to prepare derivative works of the
  8 | Software, and to permit third-parties to whom the Software is furnished to
  9 | do so, all subject to the following:
 10 | 
 11 | The copyright notices in the Software and this entire statement, including
 12 | the above license grant, this restriction and the following disclaimer,
 13 | must be included in all copies of the Software, in whole or in part, and
 14 | all derivative works of the Software, unless such copies or derivative
 15 | works are solely in the form of machine-executable object code generated by
 16 | a source language processor.
 17 | 
 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 24 | DEALINGS IN THE SOFTWARE.
 25 | */
 26 | 
 27 | 
 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
 30 | 
 31 | #include "core.h"
 32 | #include <stdexcept>
 33 | 
 34 | namespace utf8
 35 | {
 36 |     // Base for the exceptions that may be thrown from the library
 37 |     class exception : public ::std::exception {
 38 |     };
 39 | 
 40 |     // Exceptions that may be thrown from the library functions.
 41 |     class invalid_code_point : public exception {
 42 |         uint32_t cp;
 43 |     public:
 44 |         invalid_code_point(uint32_t cp) : cp(cp) {}
 45 |         virtual const char* what() const throw() { return "Invalid code point"; }
 46 |         uint32_t code_point() const {return cp;}
 47 |     };
 48 | 
 49 |     class invalid_utf8 : public exception {
 50 |         uint8_t u8;
 51 |     public:
 52 |         invalid_utf8 (uint8_t u) : u8(u) {}
 53 |         virtual const char* what() const throw() { return "Invalid UTF-8"; }
 54 |         uint8_t utf8_octet() const {return u8;}
 55 |     };
 56 | 
 57 |     class invalid_utf16 : public exception {
 58 |         uint16_t u16;
 59 |     public:
 60 |         invalid_utf16 (uint16_t u) : u16(u) {}
 61 |         virtual const char* what() const throw() { return "Invalid UTF-16"; }
 62 |         uint16_t utf16_word() const {return u16;}
 63 |     };
 64 | 
 65 |     class not_enough_room : public exception {
 66 |     public:
 67 |         virtual const char* what() const throw() { return "Not enough space"; }
 68 |     };
 69 | 
 70 |     /// The library API - functions intended to be called by the users
 71 | 
 72 |     template <typename octet_iterator>
 73 |     octet_iterator append(uint32_t cp, octet_iterator result)
 74 |     {
 75 |         if (!utf8::internal::is_code_point_valid(cp))
 76 |             throw invalid_code_point(cp);
 77 | 
 78 |         if (cp < 0x80)                        // one octet
 79 |             *(result++) = static_cast<uint8_t>(cp);
 80 |         else if (cp < 0x800) {                // two octets
 81 |             *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
 82 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 83 |         }
 84 |         else if (cp < 0x10000) {              // three octets
 85 |             *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
 86 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
 87 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 88 |         }
 89 |         else {                                // four octets
 90 |             *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
 91 |             *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
 92 |             *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
 93 |             *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
 94 |         }
 95 |         return result;
 96 |     }
 97 | 
 98 |     template <typename octet_iterator, typename output_iterator>
 99 |     output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
100 |     {
101 |         while (start != end) {
102 |             octet_iterator sequence_start = start;
103 |             internal::utf_error err_code = utf8::internal::validate_next(start, end);
104 |             switch (err_code) {
105 |                 case internal::UTF8_OK :
106 |                     for (octet_iterator it = sequence_start; it != start; ++it)
107 |                         *out++ = *it;
108 |                     break;
109 |                 case internal::NOT_ENOUGH_ROOM:
110 |                     throw not_enough_room();
111 |                 case internal::INVALID_LEAD:
112 |                     out = utf8::append (replacement, out);
113 |                     ++start;
114 |                     break;
115 |                 case internal::INCOMPLETE_SEQUENCE:
116 |                 case internal::OVERLONG_SEQUENCE:
117 |                 case internal::INVALID_CODE_POINT:
118 |                     out = utf8::append (replacement, out);
119 |                     ++start;
120 |                     // just one replacement mark for the sequence
121 |                     while (start != end && utf8::internal::is_trail(*start))
122 |                         ++start;
123 |                     break;
124 |             }
125 |         }
126 |         return out;
127 |     }
128 | 
129 |     template <typename octet_iterator, typename output_iterator>
130 |     inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
131 |     {
132 |         static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
133 |         return utf8::replace_invalid(start, end, out, replacement_marker);
134 |     }
135 | 
136 |     template <typename octet_iterator>
137 |     uint32_t next(octet_iterator& it, octet_iterator end)
138 |     {
139 |         uint32_t cp = 0;
140 |         internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
141 |         switch (err_code) {
142 |             case internal::UTF8_OK :
143 |                 break;
144 |             case internal::NOT_ENOUGH_ROOM :
145 |                 throw not_enough_room();
146 |             case internal::INVALID_LEAD :
147 |             case internal::INCOMPLETE_SEQUENCE :
148 |             case internal::OVERLONG_SEQUENCE :
149 |                 throw invalid_utf8(*it);
150 |             case internal::INVALID_CODE_POINT :
151 |                 throw invalid_code_point(cp);
152 |         }
153 |         return cp;
154 |     }
155 | 
156 |     template <typename octet_iterator>
157 |     uint32_t peek_next(octet_iterator it, octet_iterator end)
158 |     {
159 |         return utf8::next(it, end);
160 |     }
161 | 
162 |     template <typename octet_iterator>
163 |     uint32_t prior(octet_iterator& it, octet_iterator start)
164 |     {
165 |         // can't do much if it == start
166 |         if (it == start)
167 |             throw not_enough_room();
168 | 
169 |         octet_iterator end = it;
170 |         // Go back until we hit either a lead octet or start
171 |         while (utf8::internal::is_trail(*(--it)))
172 |             if (it == start)
173 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
174 |         return utf8::peek_next(it, end);
175 |     }
176 | 
177 |     /// Deprecated in versions that include "prior"
178 |     template <typename octet_iterator>
179 |     uint32_t previous(octet_iterator& it, octet_iterator pass_start)
180 |     {
181 |         octet_iterator end = it;
182 |         while (utf8::internal::is_trail(*(--it)))
183 |             if (it == pass_start)
184 |                 throw invalid_utf8(*it); // error - no lead byte in the sequence
185 |         octet_iterator temp = it;
186 |         return utf8::next(temp, end);
187 |     }
188 | 
189 |     template <typename octet_iterator, typename distance_type>
190 |     void advance (octet_iterator& it, distance_type n, octet_iterator end)
191 |     {
192 |         for (distance_type i = 0; i < n; ++i)
193 |             utf8::next(it, end);
194 |     }
195 | 
196 |     template <typename octet_iterator> // Returns how many utf8::next calls it takes to walk `first` up to `last`.
197 |     typename std::iterator_traits<octet_iterator>::difference_type
198 |     distance (octet_iterator first, octet_iterator last) // both are by-value copies; the caller's iterators are not modified
199 |     {
200 |         typename std::iterator_traits<octet_iterator>::difference_type dist;
201 |         for (dist = 0; first < last; ++dist) // uses operator<, so the iterator type must support relational comparison
202 |             utf8::next(first, last); // one call per counted step; exceptions from utf8::next propagate
203 |         return dist;
204 |     }
205 | 
206 |     template <typename u16bit_iterator, typename octet_iterator> // Converts the UTF-16 units in [start, end) to UTF-8 written through `result`.
207 |     octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) // throws invalid_utf16 on an unpaired surrogate
208 |     {
209 |         while (start != end) {
210 |             uint32_t cp = utf8::internal::mask16(*start++);
211 |             // Take care of surrogate pairs first
212 |             if (utf8::internal::is_lead_surrogate(cp)) {
213 |                 if (start != end) {
214 |                     uint32_t trail_surrogate = utf8::internal::mask16(*start++);
215 |                     if (utf8::internal::is_trail_surrogate(trail_surrogate))
216 |                         cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; // merge lead and trail units into one code point
217 |                     else
218 |                         throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); // lead surrogate followed by a non-trail unit
219 |                 }
220 |                 else
221 |                     throw invalid_utf16(static_cast<uint16_t>(cp)); // input ended right after a lead surrogate
222 | 
223 |             }
224 |             // Lone trail surrogate
225 |             else if (utf8::internal::is_trail_surrogate(cp))
226 |                 throw invalid_utf16(static_cast<uint16_t>(cp));
227 | 
228 |             result = utf8::append(cp, result); // keep the iterator returned by append for the next write
229 |         }
230 |         return result; // the output iterator after the last append
231 |     }
232 | 
233 |     template <typename u16bit_iterator, typename octet_iterator> // Decodes the UTF-8 octets in [start, end) and writes UTF-16 units through `result`.
234 |     u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) // returns `result` advanced past the last unit written
235 |     {
236 |         while (start != end) {
237 |             uint32_t cp = utf8::next(start, end); // decodes one code point; exceptions from utf8::next propagate
238 |             if (cp > 0xffff) { //make a surrogate pair
239 |                 *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET); // first unit: from the bits above the low ten
240 |                 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); // second unit: from the low ten bits
241 |             }
242 |             else
243 |                 *result++ = static_cast<uint16_t>(cp); // fits in a single 16-bit unit
244 |         }
245 |         return result;
246 |     }
247 | 
248 |     template <typename octet_iterator, typename u32bit_iterator> // Passes every value in [start, end) to utf8::append, writing UTF-8 through `result`.
249 |     octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) // returns the iterator produced by the last append
250 |     {
251 |         while (start != end)
252 |             result = utf8::append(*(start++), result); // exceptions from utf8::append propagate
253 | 
254 |         return result;
255 |     }
256 | 
257 |     template <typename octet_iterator, typename u32bit_iterator> // Decodes the UTF-8 octets in [start, end), writing one code point per element through `result`.
258 |     u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) // returns `result` advanced past the last element written
259 |     {
260 |         while (start != end)
261 |             (*result++) = utf8::next(start, end); // exceptions from utf8::next propagate
262 | 
263 |         return result;
264 |     }
265 | 
266 |     // The iterator class: a bidirectional iterator over an octet range whose value type is uint32_t (one decoded code point)
267 |     template <typename octet_iterator>
268 |     class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
269 |       octet_iterator it; // current position in the underlying octet sequence
270 |       octet_iterator range_start; // bound passed to utf8::prior when stepping backwards
271 |       octet_iterator range_end; // bound passed to utf8::next when decoding or stepping forwards
272 |       public:
273 |       iterator () {} // members are default-initialized; no range is attached
274 |       explicit iterator (const octet_iterator& octet_it,
275 |                          const octet_iterator& range_start,
276 |                          const octet_iterator& range_end) :
277 |                it(octet_it), range_start(range_start), range_end(range_end)
278 |       {
279 |           if (it < range_start || it > range_end) // position must lie within [range_start, range_end]
280 |               throw std::out_of_range("Invalid utf-8 iterator position");
281 |       }
282 |       // the default "big three" are OK
283 |       octet_iterator base () const { return it; } // exposes the underlying octet position
284 |       uint32_t operator * () const
285 |       {
286 |           octet_iterator temp = it; // decode through a copy so dereferencing does not move the iterator
287 |           return utf8::next(temp, range_end);
288 |       }
289 |       bool operator == (const iterator& rhs) const
290 |       {
291 |           if (range_start != rhs.range_start || range_end != rhs.range_end) // comparing across different ranges is treated as a logic error
292 |               throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
293 |           return (it == rhs.it);
294 |       }
295 |       bool operator != (const iterator& rhs) const
296 |       {
297 |           return !(operator == (rhs)); // therefore also throws std::logic_error on mismatched ranges
298 |       }
299 |       iterator& operator ++ ()
300 |       {
301 |           utf8::next(it, range_end); // step forward one code point; the decoded value is discarded
302 |           return *this;
303 |       }
304 |       iterator operator ++ (int)
305 |       {
306 |           iterator temp = *this; // snapshot returned to the caller (post-increment)
307 |           utf8::next(it, range_end);
308 |           return temp;
309 |       }
310 |       iterator& operator -- ()
311 |       {
312 |           utf8::prior(it, range_start); // step back one code point; the decoded value is discarded
313 |           return *this;
314 |       }
315 |       iterator operator -- (int)
316 |       {
317 |           iterator temp = *this; // snapshot returned to the caller (post-decrement)
318 |           utf8::prior(it, range_start);
319 |           return temp;
320 |       }
321 |     }; // class iterator
322 | 
323 | } // namespace utf8
324 | 
325 | #endif //header guard
326 | 
327 | 
328 | 


--------------------------------------------------------------------------------
/test/rus_sentences.txt:
--------------------------------------------------------------------------------
  1 | Суворов с любопытством расспрашивал славного мятежника о его военных действияхи намерениях и повез его в Симбирск, куда должен был приехать и граф Панин .
  2 | И просто задохнулся от неожиданной радости .
  3 | Вскоре начал накрапывать дождь , мочивший нас до самого места лагеря экспедиции , партии Елизаветы Владимировны .
  4 | Они сидели над миром , участвуя в общем процессе жизни и гибели и присоединяя к своей сущности все души , личности и чувства « Я » .
  5 | Его толстая шея вздулась багровыми жилами , и широкое скуластое лицо налилось кровью .
  6 | Трусость его не дозволяла ему хорошенько изъясниться ; и так солгал он раза два или три без правил , а как господин закричал ему , чтобы он пошел домой , то тем дело и кончилось .
  7 | укладывается в узкий тоннель научных знаний - высмеивается или не замечается .
  8 | Комитет « Гражданское содействие » выдает им ходатайства о регистрации и обращения к сотрудникам МВД с разъяснением реального положения этих людей .
  9 | Однако ДНК сама по себе несет важнейшую функцию кодирования аминокислотной последовательности белков , и мы не можем произвольно ее модифицировать , не затронув этой информации и не изменив способности ДНК к удвоению .
 10 | Сейчас очень интересную женщину пишут .
 11 | Заставляли молиться , готовили еду , рассказывали анекдоты .
 12 | Офицерам и автоматчикам из роты охраны Поляков приказал тщательно осмотреть окрестность , а сам с немцами и занялся непосредственно участком , где располагалось ядро группы .
 13 | Калинин проехал , не заинтересовался .
 14 | Начнём с депутатов !
 15 | В Нюрнберге была знаменитая фамилия Фуггеров , банкиров - купцов того времени , вроде Ротшильдов ; они предложили у курфирста взять на откуп индульгенции .
 16 | Из-за смут и беспорядков этого времени , судоходство в португальских водах почти прекратилось , так что нечего было опасаться даже случайного обнаружения судна в этих местах .
 17 | Более того , мы считаем , что кредит в будущем может стать альтернативой долгу .
 18 | Прошло по крайней мере с полчаса , пока утихли эти хватающие за сердце рыдания .
 19 | На экранах - больших и малых - царят американские фильмы .
 20 | Переход к простой брани был бы слишком крут и заметен без разных переливов , оттенков и мостов .
 21 | Всех жильем обеспечил .
 22 | Когда сотрудники КГБ выскочили из здания , то на указанном месте обнаружили лишь ключи от машины .
 23 | Даже Николай запрещал об этом случае вспоминать - сразу беленился .
 24 | И Данило Сазоныч завел разговор о Потемкине , который говорил ему , что переходит опять за Московскую заставу .
 25 | Комната , в которой нас принимали , была , конечно , самая просторная в доме ; ее заранее мыли и чистили и перед образами затепляли лампады .
 26 | Последний кайф лета перед скучищей учебы и повседневности .
 27 | Я видел теперь , что на самом деле он не узкоглазый , а просто из тех людей , которые всегда смотрят вприщур .
 28 | Только извините .
 29 | Вдруг поместился посреди « чистилища » , как бы никому и не мешая .
 30 | такой интеллектуальный марафон .
 31 | Старый туркмен , прозванный Хоробрых « царем Менелаем » , раздувал слухи , что в мергеле сидит злой дух и будет жестоко мстить каждому , кто дойдет до сердца горы .
 32 | Богословие в Киеве он читал по Аквинату .
 33 | И Севастьянов не удивляется , что Семка пишет письмо брату , спящему в соседней комнате , - не до того Севастьянову .
 34 | Мы не могли определить , что теперь : солнечный ли день или непроницаемый туман , ибо в лесу были сумерки , как в наших широтах через час после солнечного заката .
 35 | Не избежал этой участи и Н .
 36 | Это же не паста , а чистая отрава !
 37 | За ним она , прекрасная как всегда , вся в белом , вся в цветах померанца , с длинным блондовым вуалем на голове , который живописно спускался назад ; с блестящим шифром на левом плече .
 38 | Если бы какой-либо священник вздумал устроить религиозные чтения для народа в своей же церкви , но в часы , когда нет богослужения в ней , для этого он предварительно должен испросить себе особое разрешение епархиальной власти .
 39 | Как будто раньше люди со временем молодели .
 40 | Кабинка была почище .
 41 | Сентябрь в Израиле - месяц новогодний и поэтому не очень перегруженный политикой .
 42 | Сделала это с грустным видом маменька , толстая дама , большая курительница и специалистка в преферансе .
 43 | Зосиму и Савватия .
 44 | Через два месяца : вы больны оттого , что не побереглись .
 45 | Расчет у него был простой : чем меньше гости пробудут в лагере , тем лучше , - лагерь всегда казался ему кипящим котлом , ежеминутно готовым взорваться .
 46 | Всегда мечтала иметь много детей , а тут счастье в руки : девочка славная , самостоятельная .
 47 | Или друг к другу - враждебно пристрастные .
 48 | Государственная территория , дотоле заключенная в пределах первоначального расселения великорусского племени , теперь переходит далеко за эти пределы и постепенно вбирает в себя всю русскую равнину , распространяясь как до географических ее границ , так почти везде до пределов русского народонаселения .
 49 | Здесь прежде всего большое значение имеет правильный отбор поступающих на военную службу .
 50 | Моя кобыла-то еще зимой до того привыкла солому возить , что с закрытыми глазами по тому маршруту ходила .
 51 | Сотрудники милиции даже выезжали на « стрелки » , где очень быстро и доступно объясняли своим оппонентам необходимость оставить в покое опекаемых или коммерсантов .
 52 | Любовь - мера одаренности жизнью людей , но она , вопреки всему , в очень малой степени сексуальность .
 53 | Ну , вы храбрый .
 54 | Бог весть , встретимся ли еще .
 55 | ибо как бы ни был человек мал , но есть какие-то результаты его жизни !
 56 | Став один раз вразрез с матерью и сестрами , она не умела с ними сойтись снова , а они этого не искали .
 57 | Один засунул его в настенные часы с кукушкой .
 58 | Тогда я , оборотясь , увидел на горе против нас , за речкой , множество колюжей , а сверх того человек двадцать бежавших , чтоб отрезать нас троих от наших товарищей ; между тем стрелы сыпались на нас , как град .
 59 | Мы не останавливаемся на достигнутом , стараясь охватить весь спектр направлений подготовки студентов .
 60 | А на мне , я знал , лежала вся ответственность за успех ее .
 61 | Собою их не заслоню , хотя я и автор , вернее - одно из второстепенных лиц на задах массовки .
 62 | Покачивая головой и сгорбившись , он возвращается к окну ; отряд длинной и неровной цепью выползает через ворота плац-парада .
 63 | Но это оказалось реальностью , и теперь мир уже не будет таким , каким он был вчера .
 64 | По счастью , падали деревья ночью , часа в четыре , в безлюдье .
 65 | Сам тощой , а места занял , как баба откормленная .
 66 | Неважно !
 67 | Когда мы выехали , была уже ночь , и дорога лежала через девственный лес с вековыми соснами , лиственницей и елью гигантских размеров .
 68 | Оставалась ровная , спокойная и незамужняя .
 69 | Эта сука в оценке мужиков и их дурацких качеств почти всегда бывает права .
 70 | Им это не мешало .
 71 | Но еще пыром , дырявым ботинком он и тут успел врезать сержанту меж ног - и это тоже со счастливой мыслью .
 72 | Навевает мысли о стабильности и постоянстве .
 73 | Послышались женские всхлипывания .
 74 | А так как деятельность его происходила среди очень молодых людей , принимавших его безграничную самоуверенность за глубокомыслие и мудрость , то большинство подчинялось ему , и он имел большой успех в революционных кругах .
 75 | Стараясь не сбиться с общего шага , он снова представил себе сыновей и мысленно обратился к ним с продолжением своего рассказа .
 76 | Общество , в котором властвовала партийная номенклатура , насквозь пропитанная догмами , неизлечимо больная утопической идеологией .
 77 | Этого желает моя высокая повелительница .
 78 | Настоящий громила с узким лбом , с лохматыми бровями над близко сведенными черными глазами , а длинные руки , словно клешни , - одной ладонью всю мою спину прикроют .
 79 | Те ценные результаты , о которых мы говорили , - итог длительного и напряженного труда .
 80 | Освобожденный от необходимости на каждом шагу доказывать свою независимость , всякий делал свое дело спокойно , без раздражения .
 81 | Сегодня они больше всего боятся , что придворная знать и крупные капиталисты не захотят их брать всерьез .
 82 | Скорее всего он не умеет понять противоречие между ответственностью их миссии и теми « ядовитыми » характеристиками , которыми наделяет их Ленин .
 83 | Ну , вот , мы и проверим , милая трусиха , насколько оправдаются ваши страхи .
 84 | Женщина обрадовалась разговору , и сама все рассказала , лаская его узкое лицо шелковистыми своими глазами .
 85 | России хватит на всех .
 86 | А где-то - водка , где-то - самогон , где-то - « чернила » обыкновенные .
 87 | Но они не любопытны .
 88 | результативность .
 89 | И всегда мысль « Бог со мною » .
 90 | Ефим Игнатьич только мигает , а до горячих пирожков не дотрагивается .
 91 | У тетушки Марьи Алексеевны она прожила недолго .
 92 | Эта постоянная необычность для нас , вероятно , основана на имманентной иллюзии каузальности временной организации психического .
 93 | Сыр , масло , кожа , мед , лес и - долой фабрики !
 94 | Одно время , например , уговаривали Л .
 95 | Стоя у подножия маяка , надо высоко задирать голову , чтобы увидеть его вершину , и только тогда постигаешь все величие сооружения .
 96 | Больше ничего не говорит , говорит только , что там очень плохо .
 97 | Так же как в начале прошлого века конные скачки , а в начале позапрошлого - стрельба из лука .
 98 | Афонской горы .
 99 | Как не знать Андрея Михайлыча !
100 | Степан Тимохин .
101 | Последнее мое свидание с Гоголем было в Петербурге , когда он останавливался в Зимнем дворце , у Жуковского .
102 | Мы уже узнали, что он собирался прочесть нам новое свое произведение , но приступить к делу было не легко .
103 | Гоголь как ни в чем не бывало ходил по комнате , добродушно подсмеивался над некоторыми общими знакомыми , а о чтении и помину не было .
104 | Даже раз он намекнул , что можно отложить заседание .
105 | Он подошел к Гоголю сзади, ощупал карманы его фрака, вытащил оттуда тетрадь почтовой бумаги в осьмушку .
106 | Гоголь сердито выхватил тетрадку , сел мрачно на диван и тотчас же начал читать при всеобщем молчании .
107 | Он читал без перерыва до тех пор , пока истощился весь его голос и зарябило в глазах .
108 | Мы узнали таким образом первые четыре главы «Мертвых душ» .
109 | Общий смех мало поразил Гоголя , но изъявление нелицемерного восторга , которое видимо было на всех лицах под конец чтения , его тронуло .
110 | Он был доволен .
111 | Кто-то сказал , что приветствие Селифана босой девочке , которую он сажает на козлы вместо проводника от Коробочки , не совсем прилично.
112 | Все остальные слушатели восстали против этого замечания .
113 | После чтения он закутался , по обыкновению , в шубу до самого лба , сел со мной на извозчика , и мы молча доехали до Зимнего дворца , где я его ссадил .
114 | Вскоре потом он опять исчез из Петербурга .
115 | Гоголь обрадовался нашей новой встрече, расспрашивал, каким путем прибыл я в Италию .
116 | Ему казалось , что после Италии Париж становится сух и безжизнен , а значение Италии бросается само собой в глаза после парижской жизни и парижских интересов .
117 | Впоследствии он часто развивал эту мысль .
118 | Между тем время было обеденное .
119 | Он повел меня в известную историческую австерию , где за длинными столами , шагая по грязному полу и усаживаясь просто на скамейках , стекается к обеденному часу разнообразнейшая публика .
120 | Это все тот же рис , барашек , курица - меняется только зелень по временам года .
121 | Простота, общежительность итальянская всего более кидаются тут в глаза , заставляя предчувствовать себя и во всех других сферах жизни .
122 | Гоголь поразил меня , однако, капризным , взыскательным обращением своим с прислужником .
123 | Раза два менял он блюдо риса , находя его то переваренным , то недоваренным , и всякий раз прислужник переменял блюдо с добродушной улыбкой .
124 | Получив наконец тарелку риса по своему вкусу , Гоголь приступилк ней с необычайною алчностью , наклонясь так , что длинные волосы его упали на самое блюдо , и поглощая ложку за ложкой со страстью и быстротой , какими , говорят, обыкновенно отличаются за столом люди , расположенные к ипохондрии .
125 | В середине обеда к нам подсел довольно плотный мужчина , с красивой , круглой бородкой .
126 | Опорожнив свое блюдо , Гоголь откинулся назад, сделался весел , разговорчив и начал шутить с прислужником, еще так недавно
127 | осыпаемым строгими выговорами и укоризнами .
128 | По окончании расчета за обед Гоголь оставил прислужнику , как и все другие посетители , два байока , а когда я со своей стороны что-то переложил против этой скудной суммы , он остановил меня замечанием .
129 | Известно, что житейской мудрости в нем было почти столько же , сколько и таланта .
130 | Он был в своей тарелке и мог  , что ему нужно было или что стоило этого , полной рукой , не давая сам ничего .
131 | Я никогда не хочу обедать .
132 | Мне так хорошо во дворе играть .
133 | Я всю жизнь бы во дворе играл .
134 | И никогда не обедал бы .
135 | Я совсем не люблю борщ с капустой .
136 | И вообще я суп не люблю .
137 | И кашу я не люблю .
138 | И котлеты тоже не очень люблю .
139 | Я люблю абрикосы .
140 | Вы ели абрикосы .
141 | Я так люблю абрикосы .
142 | Но вот мама зовёт меня есть борщ , мне приходится всё бросать .
143 | Мой брат Боба любит борщ .
144 | Он смеётся, когда ест борщ , а я морщусь .
145 | Он вообще всегда смеётся и тычет себе ложкой в нос вместо рта , потому что ему три года .
146 | Нет , борщ я могу съесть .
147 | И котлеты я тоже съедаю .
148 | Виноград-то я ем с удовольствием .
149 | Тогда и сажают меня за рояль .
150 | Пожалуй, я съел бы ещё раз борщ .
151 | Только бы не играть на рояле .
152 | Я играю , а брат сидит на полу и смеётся .
153 | В руках у него заводная машина .
154 | Он оторвал от машины колёса .
155 | И катает их по полу .
156 | И это ему очень нравится .
157 | Никто ему не мешает .
158 | Не заставляет играть на рояле .
159 | И потому ему очень весело .
160 | Плачет он очень редко .
161 | Когда у него что-нибудь отнимают .
162 | Или когда его стригут .
163 | Он совершенно не любит стричься .
164 | Он так и ходил бы всю жизнь лохматый .
165 | На это он не обращает внимания .
166 | В общем, ему хорошо , а мне плохо .
167 | Папа с мамой слушают , как я играю .
168 | Брат катает по полу колёсики .
169 | За окном кричат четыре брата .
170 | Они кричат разными голосами .
171 | Я вижу в окно : они машут руками .
172 | Они зовут меня .
173 | Им одним скучно .
174 | 


--------------------------------------------------------------------------------
/grammars/alphabets.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | # CYRILLIC
 21 | # --------
 22 | # Orthographic symbol classes. Only lowercase Cyrillic letters are listed.
 23 | export letter = Optimize[
 24 |     "а" | "б" | "в" | "г" | "д" | "е" | "ё" | "ж" | "з" |
 25 |     "и" | "й" | "к" | "л" | "м" | "н" | "о" | "п" | "р" |
 26 |     "с" | "т" | "у" | "ф" | "х" | "ц" | "ч" | "ш" | "щ" |
 27 |     "ъ" | "ы" | "ь" | "э" | "ю" | "я"
 28 | ];
 29 | # Consonant letters split by voicing; "й", "ъ" and "ь" belong to neither list.
 30 | export voiceless_consonant_letters =
 31 |     "к" | "п" | "с" | "т" | "ф" | "х" | "ц" | "ч" | "ш" | "щ"
 32 | ;
 33 | 
 34 | export voiced_consonant_letters =
 35 |     "б" | "в" | "г" | "д" | "ж" | "з" | "л" | "м" | "н" | "р"
 36 | ;
 37 | # Every non-vowel letter: both voicing classes plus "ъ", "ь" and "й".
 38 | export cons_letter = Optimize[voiceless_consonant_letters |
 39 |                               voiced_consonant_letters |
 40 |                               "ъ" | "ь" | "й"
 41 |                              ];
 42 | # Vowel letters in two series; vow_letter is their union.
 43 | export soft_vow_letter =  "е" | "ё" | "и" | "ю" | "я";
 44 | export hard_vow_letter =  "а" | "о" | "у" | "э" | "ы" ;
 45 | 
 46 | export vow_letter = Optimize[soft_vow_letter | hard_vow_letter];
 47 | # The word separator is a single space character.
 48 | export word_sep = " ";
 49 | 
 50 | # --------------------------------------------------------------------------------------
 51 | 
 52 | # PHONETIC
 53 | # --------
 54 | 
 55 | # plosives
 56 | # Naming pattern used below: a trailing J appears in the soft_* sets, X_X in the geminated_* sets; multi-character phones are written in [brackets].
 57 | export hard_plosives = "p"   | "b"   | "t"   | "d"   | "k"   | "g";
 58 | export soft_plosives = "[pJ]" | "[bJ]" | "[tJ]" | "[dJ]" | "[kJ]" | "[gJ]";
 59 | # Only t and d have geminated variants.
 60 | export geminated_hard_plosives = "[t_t]"   | "[d_d]";
 61 | export geminated_soft_plosives = "[t_tJ]" | "[d_dJ]";
 62 | # Place-of-articulation classes, listed explicitly (including the soft and geminated variants).
 63 | export coronal_plosives = "t" | "d" | "[tJ]" | "[dJ]" | "[t_t]" | "[d_d]" | "[t_tJ]" | "[d_dJ]";
 64 | 
 65 | export anterior_plosives = "p"   | "b"   | "t"   | "d" |
 66 |                            "[pJ]" | "[bJ]" | "[tJ]" | "[dJ]" |
 67 |                            "[t_t]"   | "[d_d]" |
 68 |                            "[t_tJ]" | "[d_dJ]"
 69 | ;
 70 | 
 71 | export geminated_plosives = Optimize[geminated_hard_plosives | geminated_soft_plosives];
 72 | # Voicing classes, listed explicitly (including the soft and geminated variants).
 73 | export voiced_plosives =    "b" | "d" | "g" | "[bJ]" | "[dJ]" | "[gJ]" | "[d_d]" | "[d_dJ]";
 74 | export voiceless_plosives = "p" | "t" | "k" | "[pJ]" | "[tJ]" | "[kJ]" | "[t_t]" | "[t_tJ]";
 75 | # Complete plosive inventory: hard, soft and geminated.
 76 | export plosives = Optimize[hard_plosives | soft_plosives | geminated_plosives];
 77 | 
 78 | # fricatives
 79 | # Hard and soft series; each soft symbol is the hard one with a J suffix.
 80 | export hard_fricatives = "S"   | "Z"   | "f"   | "s"   | "v"   | "x"  | "z";
 81 | export soft_fricatives = "[SJ]" | "[ZJ]" | "[fJ]" | "[sJ]" | "[vJ]" | "[xJ]" | "[zJ]";
 82 | # Only s has geminated variants in these two sets.
 83 | export geminated_hard_fricatives = "[s_s]";
 84 | export geminated_soft_fricatives = "[s_sJ]";
 85 | 
 86 | export geminated_fricatives = Optimize[geminated_hard_fricatives |
 87 |                                        geminated_soft_fricatives
 88 |                                       ];
 89 | # NOTE(review): the classes below also list geminates such as [S_S], [Z_Z], [z_z], [v_v] that are not in geminated_fricatives - confirm this is intended.
 90 | export coronal_fricatives = "S"   | "Z"   | "s"   | "z"   |
 91 |                             "[SJ]" | "[ZJ]" | "[sJ]" | "[zJ]" |
 92 |                             "[S_S]" | "[Z_Z]" | "[s_s]" | "[z_z]" |
 93 |                             "[S_SJ]" | "[Z_ZJ]" | "[s_sJ]" | "[z_zJ]"
 94 | ;
 95 | 
 96 | export anterior_fricatives = "f"   | "s"   | "v"   | "z"   |
 97 |                              "[fJ]" | "[sJ]" | "[vJ]" | "[zJ]" |
 98 |                              "[s_s]"   | "[v_v]"   | "[z_z]" |
 99 |                              "[s_sJ]" | "[v_vJ]" | "[z_zJ]"
100 | ;
101 | # Voiced fricatives, listed explicitly (plain, soft and geminated).
102 | export voiced_fricatives =    "Z" | "v" | "z" | "[ZJ]" | "[vJ]" | "[zJ]" | "[Z_Z]" | "[v_v]" |
103 |                               "[z_z]" | "[Z_ZJ]" | "[z_zJ]" | "[v_vJ]"
104 | ;
105 | # Voiceless fricatives, each listed once. NOTE(review): voiced_fricatives has [v_v]/[v_vJ] but there is no [f_f]/[f_fJ] here - confirm this is intended.
106 | export voiceless_fricatives = "S" | "f" | "s" | "[SJ]" | "[fJ]" | "[sJ]" | "[S_S]" | "[s_s]" |
107 |                               "[S_SJ]" | "[s_sJ]" | "x" | "[xJ]"
108 | ;
109 | # Complete fricative inventory: hard, soft and geminated.
110 | export fricatives = Optimize[hard_fricatives | soft_fricatives | geminated_fricatives];
111 | 
112 | # nasals
113 | # Hard/soft series; only n has geminated variants.
114 | export hard_nasals = "m"   | "n";
115 | export soft_nasals = "[mJ]" | "[nJ]";
116 | export geminated_hard_nasals = "[n_n]";
117 | export geminated_soft_nasals = "[n_nJ]";
118 | export coronal_nasals = "n" | "[nJ]" | "[n_n]" | "[n_nJ]";
119 | 
120 | export geminated_nasals = Optimize[geminated_hard_nasals | geminated_soft_nasals];
121 | # Complete nasal inventory: hard, soft and geminated.
122 | export nasals = Optimize[hard_nasals | soft_nasals | geminated_nasals];
123 | 
124 | # liquids
125 | # Hard/soft series of l and r, each with geminated variants; laterals is the l subset.
126 | export hard_liquids = "l"   | "r";
127 | export soft_liquids = "[lJ]" | "[rJ]";
128 | export geminated_hard_liquids = "[l_l]"   | "[r_r]";
129 | export geminated_soft_liquids = "[l_lJ]" | "[r_rJ]";
130 | export laterals = "l" | "[lJ]" | "[l_l]" | "[l_lJ]";
131 | 
132 | export geminated_liquids = Optimize[geminated_hard_liquids | geminated_soft_liquids];
133 | # Complete liquid inventory: hard, soft and geminated.
134 | export liquids = Optimize[hard_liquids | soft_liquids | geminated_liquids];
135 | 
136 | # approximant
137 | # A single symbol; it has no hard/soft or geminated variants in this file.
138 | export approximant = "j";
139 | 
140 | # affricates
141 | # Hard and soft sets are disjoint here (no J-suffixed twin of a hard affricate, and vice versa).
142 | export hard_affricates =  "[d_Z]" | "[d_z]" | "[t_s]";
143 | export soft_affricates = "[t_SJ]" | "[d_ZJ]";
144 | # Voicing classes, listed explicitly.
145 | export voiced_affricates =    "[d_Z]" | "[d_z]" | "[d_ZJ]";
146 | export voiceless_affricates = "[t_s]" | "[t_SJ]";
147 | 
148 | export anterior_affricates = "[d_z]" | "[t_s]";
149 | # Complete affricate inventory.
150 | export affricates = Optimize[hard_affricates | soft_affricates];
151 | 
152 | # consonants
153 | # hard_cons_phono: union of every hard_* and geminated_hard_* set above; the approximant "j" is not part of it.
154 | export hard_cons_phono = Optimize[hard_plosives             |
155 |                                   geminated_hard_plosives   |
156 |                                   hard_fricatives           |
157 |                                   geminated_hard_fricatives |
158 |                                   hard_nasals               |
159 |                                   geminated_hard_nasals     |
160 |                                   hard_liquids              |
161 |                                   geminated_hard_liquids    |
162 |                                   hard_affricates
163 |                                  ];
164 | # voiced_consonants: voiced obstruents plus all nasals, liquids and the approximant.
165 | export voiced_consonants = Optimize[voiced_plosives   |
166 |                                     voiced_fricatives |
167 |                                     nasals            |
168 |                                     liquids           |
169 |                                     approximant       |
170 |                                     voiced_affricates
171 |                                    ];
172 | # voiceless_consonants: voiceless plosives, fricatives and affricates only.
173 | export voiceless_consonants = Optimize[voiceless_plosives   |
174 |                                        voiceless_fricatives |
175 |                                        voiceless_affricates
176 |                                       ];
177 | # consonants: the full consonant inventory, defined as voiced | voiceless.
178 | export consonants = Optimize[voiced_consonants | voiceless_consonants];
179 | 
180 | # vowels
181 | # Stressed vowels are split into a "close" group and the rest; reduced vowels form a separate set.
182 | export stressed_close_vowels = "i" | "[i_x]" | "e";
183 | export stressed_other_vowels = "a" | "u" | "o";
184 | 
185 | export stressed_vowels = Optimize[stressed_close_vowels | stressed_other_vowels];
186 | 
187 | export reduced_vowels = "[@_o]" | "E" | "[@_r]" | "I" | "U" | "@" | "[U_x]";
188 | # Complete vowel inventory: stressed and reduced.
189 | export vowels = Optimize[stressed_vowels | reduced_vowels];
190 | 
191 | # diphthongs
192 | # Every diphthong symbol below ends in _i; stressed ones are split into front and other.
193 | export stressed_front_diphthongs = "[i_i]" | "[e_i]";
194 | export stressed_other_diphthongs = "[a_i]" | "[o_i]" | "[u_i]";
195 | 
196 | export stressed_diphthongs = Optimize[stressed_front_diphthongs |
197 |                                       stressed_other_diphthongs
198 |                                      ];
199 | 
200 | export reduced_diphthongs = "[I_i]" | "[@_o_i]" | "[@_i]" | "[U_i]" | "[@_r_i]";
201 | # Complete diphthong inventory: stressed and reduced.
202 | export diphthongs = Optimize[stressed_diphthongs | reduced_diphthongs];
203 | 
204 | # vowels + diphthongs
205 | # stressed_front_nuclei reuses stressed_close_vowels, so it contains [i_x]; soft_nuclei is listed by hand and does not.
206 | export stressed_front_nuclei = Optimize[stressed_close_vowels |
207 |                                         stressed_front_diphthongs];
208 | 
209 | export soft_nuclei = "i" | "e" | "[i_i]" | "[e_i]";
210 | # nuclei: every vowel or diphthong.
211 | export nuclei = Optimize[vowels | diphthongs];
212 | # phone: the full phonetic inventory (nuclei plus consonants).
213 | export phone = Optimize[nuclei | consonants];
214 | 
215 | # --------------------------------------------------------------------------------------
216 | 
217 | # MIXED
218 | # -----
219 | # Voicing classes that accept either side of the conversion: a Cyrillic consonant letter or a consonant phone.
220 | export voiced = Optimize[voiced_consonant_letters | voiced_consonants];
221 | export voiceless = Optimize[voiceless_consonant_letters | voiceless_consonants];
222 | 
223 | # --------------------------------------------------------------------------------------
224 | 
225 | # COMPLETE ALPHABET
226 | # -----------------
227 | # sigma is every symbol of this grammar: letters, phones, " ", "-", "+" and the bracketed markers; the *_star forms are Kleene closures.
228 | export letter_star = Optimize[letter*];
229 | export phone_star = Optimize[phone*];
230 | export sigma = Optimize[letter   | phone     |
231 |                         " "      | "-"       | "+" |
232 |                         "[SIL]"  | "[ERROR]" |
233 |                         "[ADJ]"  | "[VERB]"  |
234 |                         "[WUD]"
235 |                        ];
236 | export sigma_star = Optimize[sigma*];
237 | 
238 | # --------------------------------------------------------------------------------------
239 | 
240 | # FOR TESTING: composed_char is not exported; each pair rewrites a plain spelling such as t_tJ into the bracketed symbol [t_tJ].
241 | # NOTE(review): [r_r] and [r_rJ] (see geminated liquids) have no entry in this list - confirm whether that is intended.
242 | composed_char = ("I_i":"[I_i]") |
243 |                 ("@_o":"[@_o]") |
244 |                 ("ZJ":"[ZJ]") |
245 |                 ("t_tJ":"[t_tJ]") |
246 |                 ("l_l":"[l_l]") |
247 |                 ("rJ":"[rJ]") |
248 |                 ("tJ":"[tJ]") |
249 |                 ("s_s":"[s_s]") |
250 |                 ("fJ":"[fJ]") |
251 |                 ("t_SJ":"[t_SJ]") |
252 |                 ("vJ":"[vJ]") |
253 |                 ("i_i":"[i_i]") |
254 |                 ("d_d":"[d_d]") |
255 |                 ("dJ":"[dJ]") |
256 |                 ("n_n":"[n_n]") |
257 |                 ("U_i":"[U_i]") |
258 |                 ("l_lJ":"[l_lJ]") |
259 |                 ("d_Z":"[d_Z]") |
260 |                 ("d_dJ":"[d_dJ]") |
261 |                 ("s_sJ":"[s_sJ]") |
262 |                 ("gJ":"[gJ]") |
263 |                 ("kJ":"[kJ]") |
264 |                 ("t_t":"[t_t]") |
265 |                 ("e_i":"[e_i]") |
266 |                 ("t_s":"[t_s]") |
267 |                 ("i_x":"[i_x]") |
268 |                 ("sJ":"[sJ]") |
269 |                 ("d_z":"[d_z]") |
270 |                 ("SJ":"[SJ]") |
271 |                 ("mJ":"[mJ]") |
272 |                 ("@_o_i":"[@_o_i]") |
273 |                 ("pJ":"[pJ]") |
274 |                 ("zJ":"[zJ]") |
275 |                 ("xJ":"[xJ]") |
276 |                 ("a_i":"[a_i]") |
277 |                 ("@_r":"[@_r]") |
278 |                 ("bJ":"[bJ]") |
279 |                 ("u_i":"[u_i]") |
280 |                 ("o_i":"[o_i]") |
281 |                 ("d_ZJ":"[d_ZJ]") |
282 |                 ("nJ":"[nJ]") |
283 |                 ("@_i":"[@_i]") |
284 |                 ("U_x":"[U_x]") |
285 |                 ("lJ":"[lJ]") |
286 |                 ("@_r_i":"[@_r_i]") |
287 |                 ("n_nJ":"[n_nJ]") |
288 |                 ("SIL":"[SIL]") |
289 |                 ("ERROR":"[ERROR]") |
290 |                 ("ADJ":"[ADJ]") |
291 |                 ("VERB":"[VERB]") |
292 |                 ("WUD":"[WUD]")
293 | ;
294 | # maps every bracketed multi-character symbol back to its plain spelling, e.g. "[tJ]" --> "tJ"
295 | inv_composed_char = ("[@_i]":"@_i") |
296 |                     ("[i_i]":"i_i") |
297 |                     ("[u_i]":"u_i") |
298 |                     ("[o_i]":"o_i") |
299 |                     ("[tJ]":"tJ") |
300 |                     ("[t_tJ]":"t_tJ") |
301 |                     ("[vJ]":"vJ") |
302 |                     ("[t_t]":"t_t") |
303 |                     ("[U_x]":"U_x") |
304 |                     ("[lJ]":"lJ") |
305 |                     ("[n_n]":"n_n") |
306 |                     ("[xJ]":"xJ") |
307 |                     ("[n_nJ]":"n_nJ") |
308 |                     ("[dJ]":"dJ") |
309 |                     ("[I_i]":"I_i") |
310 |                     ("[fJ]":"fJ") |
311 |                     ("[bJ]":"bJ") |
312 |                     ("[l_l]":"l_l") |
313 |                     ("[zJ]":"zJ") |
314 |                     ("[@_r]":"@_r") |
315 |                     ("[d_ZJ]":"d_ZJ") |
316 |                     ("[i_x]":"i_x") |
317 |                     ("[a_i]":"a_i") |
318 |                     ("[e_i]":"e_i") |
319 |                     ("[d_d]":"d_d") |
320 |                     ("[s_s]":"s_s") |
321 |                     ("[kJ]":"kJ") |
322 |                     ("[l_lJ]":"l_lJ") |
323 |                     ("[t_s]":"t_s") |
324 |                     ("[d_z]":"d_z") |
325 |                     ("[nJ]":"nJ") |
326 |                     ("[t_SJ]":"t_SJ") |
327 |                     ("[d_dJ]":"d_dJ") |
328 |                     ("[SJ]":"SJ") |
329 |                     ("[@_o_i]":"@_o_i") |
330 |                     ("[pJ]":"pJ") |
331 |                     ("[s_sJ]":"s_sJ") |
332 |                     ("[gJ]":"gJ") |
333 |                     ("[U_i]":"U_i") |
334 |                     ("[sJ]":"sJ") |
335 |                     ("[@_o]":"@_o") |
336 |                     ("[d_Z]":"d_Z") |
337 |                     ("[ZJ]":"ZJ") |
338 |                     ("[mJ]":"mJ") |
339 |                     ("[@_r_i]":"@_r_i") |
340 |                     ("[rJ]":"rJ") |
341 |                     ("[ERROR]":"ERROR") |
342 |                     ("[SIL]":"SIL") |
343 |                     ("[ADJ]":"ADJ") |
344 |                     ("[VERB]":"VERB") |
345 |                     ("[WUD]":"WUD")
346 | ;
347 | # single-character symbols passed through unchanged: Latin and Cyrillic characters, space, "-" and "+"
348 | simple_char = "S" | "ш" | "a" | "щ" | "п" | "d" | "E" | "д" | "j" |
349 |               "Z" | "u" | "ь" | "k" | "g" | "ч" | "е" | "в" | "t" |
350 |               "б" | "e" | "у" | "а" | "v" | "s" | "й" | "b" | "I" |
351 |               "р" | "к" | "т" | "н" | "z" | "ц" | "U" | "м" | "r" |
352 |               "о" | "ж" | "з" | "x" | "и" | "ъ" | "с" | "ф" | "л" |
353 |               "я" | "@" | "f" | "i" | "n" | "ё" | "m" | "l" | "p" |
354 |               "г" | "х" | "э" | "ы" | "ю" | "o" | " " | "-" | "+"
355 | ;
356 | # feeders: any sequence of simple characters and composed symbols (in_feeder uses composed_char, out_feeder uses inv_composed_char)
357 | export in_feeder = Optimize[(simple_char|composed_char)*];
358 | export out_feeder = Optimize[(simple_char|inv_composed_char)*];
359 | 


--------------------------------------------------------------------------------
/grammars/inflections.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | import 'alphabets.grm' as alphabets;
 21 | import 'definitions.grm' as defs;
 22 | 
 23 | # set/class definitions: local aliases for symbols imported from alphabets.grm and definitions.grm
 24 | 
 25 | WSEP = alphabets.word_sep;
 26 | LSEP = defs.LSEP;
 27 | RSEP = (defs.RSEP | "[SIL]" | "[EOS]");
 28 | vowel = defs.vowel;
 29 | cons_letter_hyphen = defs.cons_letter_hyphen;
 30 | stress_minus_1 = defs.stress_minus_1;
 31 | before_stress = defs.before_stress;
 32 | all = alphabets.letter | alphabets.nuclei | "-";
 33 | soft_cons = defs.soft_cons;
 34 | hard_cons = defs.hard_cons;
 35 | hard_cons_phono = alphabets.hard_cons_phono;
 36 | # any non-empty run of symbols from `all` and "+" whose last symbol is not "+"
 37 | unstressed_seq = (all | "+")* all;
 38 | 
 39 | #----------------------------------------------------------------------------
 40 | 
 41 | # EXCEPTIONS
 42 | 
 43 | # SEE ALSO: letter_simplification rules in consonants.grm
 44 | 
 45 | # letter <и> --> <ы> after a hard consonant across a word separator, or directly after <ж>, <ш>, <ц> ("+" may intervene)
 46 | hard_i = CDRewrite[("и":"ы"),
 47 |                    ( (hard_cons "[WUD]"? WSEP "+"?) |
 48 |                      ("ж" | "ш" | "ц") "+"?),
 49 |                    "",
 50 |                    alphabets.sigma_star
 51 |                   ];
 52 | 
 53 | # letter <и> --> <ы> after hard_cons_phono "[WUD]" WSEP (for WUD words; "+" may intervene)
 54 | hard_i_phono = CDRewrite[("и":"ы"),
 55 |                          (hard_cons_phono
 56 |                           "[WUD]" WSEP "+"?),
 57 |                          "",
 58 |                          alphabets.sigma_star
 59 |                         ];
 60 | 
 61 | # -чувст- --> -чуст- (<в> is dropped; an optional "+" may follow <ч>)
 62 | exception1 = CDRewrite[("в":""),
 63 |                        "ч" "+"? "у",
 64 |                        "с" "т",
 65 |                        alphabets.sigma_star
 66 |                       ];
 67 | 
 68 | # г --> х between <ле>/<лё> and <к> (an optional "+" may follow <л>)
 69 | # легк- --> лехк-
 70 | # лёгк- --> лёхк-
 71 | exception2 = CDRewrite[("г":"х"),
 72 |                        "л" "+"? ("е" | "ё"),
 73 |                        "к",
 74 |                        alphabets.sigma_star
 75 |                       ];
 76 | 
 77 | # счаст --> щаст, step 1: <с> --> <щ> before <част>
 78 | exception3a = CDRewrite[("с":"щ"),
 79 |                        "",
 80 |                        "ч" "а" "с" "т",
 81 |                        alphabets.sigma_star
 82 |                       ];
 83 | # счаст --> щаст, step 2: <ч> is dropped between <щ> and <аст>
 84 | exception3b = CDRewrite[("ч":""),
 85 |                        "щ",
 86 |                        "а" "с" "т",
 87 |                        alphabets.sigma_star
 88 |                       ];
 89 | 
 90 | # for <контрр-> words (like <контрреволюционная>)
 91 | # these are the only words that are pronounced with a double [rr];
 92 | # most of them are included in the lexicon with a pseudo transcription
 93 | # [ррр]; here we simplify double <рр> to <р>, which turns the triple
 94 | # pseudo transcription into an actual double <рр>, which is later mapped
 95 | # correctly to double phonetic [rr] or [rJrJ]
 96 | # drops an <р> that is followed by <р> and then a letter other than <р>
 97 | exception4a = CDRewrite[("р":""),
 98 |                         "",
 99 |                         "р" (alphabets.letter - "р"),
100 |                         alphabets.sigma_star
101 |                        ];
102 | # drops an <р> that is followed by two more <р>
103 | exception4b = CDRewrite[("р":""),
104 |                         "",
105 |                         "р" "р",
106 |                         alphabets.sigma_star
107 |                        ];
108 | # all exception rules composed in the order listed
109 | exceptions = Optimize[hard_i       @
110 |                       hard_i_phono @
111 |                       exception1   @
112 |                       exception2   @
113 |                       exception3a  @
114 |                       exception3b  @
115 |                       exception4a  @
116 |                       exception4b
117 |                      ];
118 | 
119 | #----------------------------------------------------------------------------
120 | 
121 | # ONLY FOR ADJECTIVES
122 | 
123 | # ого --> ово, его --> ево (before RSEP; an optional "+" may precede the final <о>)
124 | 
125 | g_to_v = CDRewrite[("г":"в"),
126 |                    "[ADJ]" (all | "+")* ("о" | "е"),
127 |                    "+"? "о" RSEP,
128 |                    alphabets.sigma_star
129 |                   ];
130 | 
131 | #----------------------------------------------------------------------------
132 | 
133 | # ONLY FOR ADJECTIVES
134 | 
135 | # <ие>/  ST --> [i I]
136 | # <ее>/  ST --> [e E]
137 | # <ое>/  ST --> [o E]
138 | # <ые>/  ST --> [i_x I]
139 | # (ST: the first vowel of the ending directly follows "+")
140 | e_stressed_pairs_left = ("и":"i")     |
141 |                         ("е":"e")     |
142 |                         ("о":"o")     |
143 |                         ("ы":"[i_x]")
144 | ;
145 | # step 1: rewrite the first vowel when it directly follows "+" and precedes <е> RSEP
146 | e_stress_infl_left = CDRewrite[e_stressed_pairs_left,
147 |                                "[ADJ]" all* "+",
148 |                                "е" RSEP,
149 |                                alphabets.sigma_star
150 |                               ];
151 | # step 2a: <е> --> [I] after [i] or [i_x], before RSEP
152 | e_stress_infl_right1 = CDRewrite[("е":"I"),
153 |                                  ("i" | "[i_x]"),
154 |                                  RSEP,
155 |                                  alphabets.sigma_star
156 |                                 ];
157 | # step 2b: <е> --> [E] after [e] or [o], before RSEP
158 | e_stress_infl_right2 = CDRewrite[("е":"E"),
159 |                                  ("e" | "o"),
160 |                                  RSEP,
161 |                                  alphabets.sigma_star
162 |                                 ];
163 | 
164 | e_stress_infl = Optimize[e_stress_infl_left   @
165 |                          e_stress_infl_right1 @
166 |                          e_stress_infl_right2
167 |                         ];
168 | 
169 | # <ие>/ ¬ST --> [I I]
170 | # <ее>/ ¬ST --> [I_i @]
171 | # <ое>/ ¬ST --> [@_i I]
172 | # <ые>/ ¬ST --> [@_r I]
173 | # (¬ST: the left context is unstressed_seq, i.e. it does not end in "+")
174 | e_unstressed_pairs_left = ("и":"I")     |
175 |                           ("е":"[I_i]") |
176 |                           ("о":"[@_i]") |
177 |                           ("ы":"[@_r]")
178 | ;
179 | # step 1: rewrite the first vowel of the ending before <е> RSEP
180 | e_unstress_infl_left = CDRewrite[e_unstressed_pairs_left,
181 |                                  "[ADJ]" unstressed_seq,
182 |                                  "е" RSEP,
183 |                                  alphabets.sigma_star
184 |                                 ];
185 | # step 2a: <е> --> [I] after [I], [@_i] or [@_r], before RSEP
186 | e_unstress_infl_right1 = CDRewrite[("е":"I"),
187 |                                    ("I" | "[@_i]" | "[@_r]"),
188 |                                    RSEP,
189 |                                    alphabets.sigma_star
190 |                                   ];
191 | # step 2b: <е> --> [@] after [I_i], before RSEP
192 | e_unstress_infl_right2 = CDRewrite[("е":"@"),
193 |                                    "[I_i]",
194 |                                    RSEP,
195 |                                    alphabets.sigma_star
196 |                                   ];
197 | 
198 | e_unstress_infl = Optimize[e_unstress_infl_left   @
199 |                            e_unstress_infl_right1 @
200 |                            e_unstress_infl_right2
201 |                           ];
202 | 
203 | # <ую>/  ST --> [u_i U_x]
204 | # <ою>/  ST --> [o_i U_x]
205 | 
206 | yu_stressed_pairs_left = ("у":"[u_i]") |
207 |                          ("о":"[o_i]")
208 | ;
209 | # step 1: rewrite the first vowel when it directly follows "+" and precedes <ю> RSEP
210 | yu_stress_infl_left = CDRewrite[yu_stressed_pairs_left,
211 |                                 "[ADJ]" all* "+",
212 |                                 "ю" RSEP,
213 |                                 alphabets.sigma_star
214 |                                ];
215 | # step 2: <ю> --> [U_x] after [u_i] or [o_i], before RSEP
216 | yu_stress_infl_right = CDRewrite[("ю":"[U_x]"),
217 |                                  ("[u_i]" | "[o_i]"),
218 |                                  RSEP,
219 |                                  alphabets.sigma_star
220 |                                 ];
221 | 
222 | yu_stress_infl = Optimize[yu_stress_infl_left  @
223 |                           yu_stress_infl_right
224 |                          ];
225 | 
226 | # <ую>/ ¬ST --> [U_i U_x]
227 | # <ею>/ ¬ST --> [I_i U_x]
228 | # <ою>/ ¬ST --> [@_i U_x]
229 | 
230 | yu_unstressed_pairs_left = ("у":"[U_i]") |
231 |                            ("е":"[I_i]") |
232 |                            ("о":"[@_i]")
233 | ;
234 | # step 1: rewrite the first vowel before <ю> RSEP when the left context is unstressed_seq
235 | yu_unstress_infl_left = CDRewrite[yu_unstressed_pairs_left,
236 |                                  "[ADJ]" unstressed_seq,
237 |                                  "ю" RSEP,
238 |                                  alphabets.sigma_star
239 |                                 ];
240 | # step 2: <ю> --> [U_x] after [U_i], [I_i] or [@_i], before RSEP
241 | yu_unstress_infl_right = CDRewrite[("ю":"[U_x]"),
242 |                                    ("[U_i]" | "[I_i]" | "[@_i]"),
243 |                                    RSEP,
244 |                                    alphabets.sigma_star
245 |                                   ];
246 | 
247 | yu_unstress_infl = Optimize[yu_unstress_infl_left  @
248 |                             yu_unstress_infl_right
249 |                            ];
250 | 
251 | # <ой>/  ST --> [o_i]
252 | # step 1: <о> --> [o_i] directly after "+" and before <й> RSEP
253 | i_stress_infl_left = CDRewrite[("о":"[o_i]"),
254 |                                "[ADJ]" all* "+",
255 |                                "й" RSEP,
256 |                                alphabets.sigma_star
257 |                               ];
258 | # step 2: <й> is dropped after [o_i], before RSEP
259 | i_stress_infl_right = CDRewrite[("й":""),
260 |                                 "[o_i]",
261 |                                 RSEP,
262 |                                 alphabets.sigma_star
263 |                                ];
264 | 
265 | i_stress_infl = Optimize[i_stress_infl_left   @
266 |                          i_stress_infl_right
267 |                         ];
268 | 
269 | # <ой>/ ¬ST --> [@_i]
270 | # <ей>/ ¬ST --> [I_i]
271 | # <ый>/ ¬ST --> [@_r_i]
272 | # <ий>/ ¬ST --> [I_i]
273 | # step 1: rewrite the vowel before <й> RSEP when the left context is unstressed_seq
274 | i_unstress_pairs_left = ("о":"[@_i]")   |
275 |                         ("е":"[I_i]")   |
276 |                         ("ы":"[@_r_i]") |
277 |                         ("и":"[I_i]")
278 | ;
279 | 
280 | i_unstress_infl_left = CDRewrite[i_unstress_pairs_left,
281 |                                  "[ADJ]" unstressed_seq,
282 |                                  "й" RSEP,
283 |                                  alphabets.sigma_star
284 |                                 ];
285 | # <й> is dropped after [@_i], [I_i] or [@_r_i], before RSEP
286 | i_unstress_infl_right = CDRewrite[("й":""),
287 |                                   ("[@_i]"   |
288 |                                    "[I_i]"   |
289 |                                    "[@_r_i]"
290 |                                   ),
291 |                                   RSEP,
292 |                                   alphabets.sigma_star
293 |                                  ];
294 | # unstressed <-Vй> endings: vowel rewrite first, then <й> deletion
295 | i_unstress_infl = Optimize[i_unstress_infl_left  @
296 |                            i_unstress_infl_right
297 |                           ];
298 | 
299 | 
300 | # <ая>/  ST --> [a_i @]
301 | # step 1: <а> --> [a_i] directly after "+" and before <я> RSEP
302 | ya_stress_infl_left = CDRewrite[("а":"[a_i]"),
303 |                                 "[ADJ]" all* "+",
304 |                                 "я" RSEP,
305 |                                 alphabets.sigma_star
306 |                                ];
307 | # step 2: <я> --> [@] after [a_i], before RSEP
308 | ya_stress_infl_right = CDRewrite[("я":"@"),
309 |                                  "[a_i]",
310 |                                  RSEP,
311 |                                  alphabets.sigma_star
312 |                                 ];
313 | 
314 | ya_stress_infl = Optimize[ya_stress_infl_left   @
315 |                           ya_stress_infl_right
316 |                          ];
317 | 
318 | # <ая>/ ¬ST --> [@_i @]
319 | # <яя>/ ¬ST --> [@_i @]
320 | # step 1: <а> or <я> --> [@_i] before <я> RSEP (one rule per letter)
321 | ya_unstress_infl_left1 = CDRewrite[("а":"[@_i]"),
322 |                                    "[ADJ]" unstressed_seq,
323 |                                    "я" RSEP,
324 |                                    alphabets.sigma_star
325 |                                   ];
326 | 
327 | ya_unstress_infl_left2 = CDRewrite[("я":"[@_i]"),
328 |                                    "[ADJ]" unstressed_seq,
329 |                                    "я" RSEP,
330 |                                    alphabets.sigma_star
331 |                                   ];
332 | # step 2: <я> --> [@] after [@_i], before RSEP
333 | ya_unstress_infl_right = CDRewrite[("я":"@"),
334 |                                    "[@_i]",
335 |                                    RSEP,
336 |                                    alphabets.sigma_star
337 |                                   ];
338 | 
339 | ya_unstress_infl = Optimize[ya_unstress_infl_left1 @
340 |                             ya_unstress_infl_left2 @
341 |                             ya_unstress_infl_right
342 |                            ];
343 | 
344 | #----------------------------------------------------------------------------
345 | 
346 | # ONLY FOR VERBS
347 | 
348 | # -тся, -ться --> <ц@> (i.e. [t_s@] once the letter <ц> has been mapped to [t_s])
349 | # step 1: <т> --> <ц> before (<ь>)? <ся> RSEP
350 | verb_infl1 = CDRewrite[("т":"ц"),
351 |                        "[VERB]" (all | "+")*,
352 |                        "ь"? "с" "я" RSEP,
353 |                        alphabets.sigma_star
354 |                       ];
355 | # step 2: <ь> is dropped between <ц> and <ся> RSEP
356 | verb_infl2 = CDRewrite[("ь":""),
357 |                        "[VERB]" (all | "+")* "ц",
358 |                        "с" "я" RSEP,
359 |                        alphabets.sigma_star
360 |                       ];
361 | # step 3: <с> is dropped between <ц> and <я> RSEP
362 | verb_infl3 = CDRewrite[("с":""),
363 |                        "[VERB]" (all | "+")* "ц",
364 |                        "я" RSEP,
365 |                        alphabets.sigma_star
366 |                       ];
367 | # step 4: <я> --> [@] after <ц>, before RSEP
368 | verb_infl4 = CDRewrite[("я":"@"),
369 |                        "[VERB]" (all | "+")* "ц",
370 |                        RSEP,
371 |                        alphabets.sigma_star
372 |                       ];
373 | 
374 | verb_infl = Optimize[verb_infl1 @
375 |                      verb_infl2 @
376 |                      verb_infl3 @
377 |                      verb_infl4
378 |                     ];
379 | 
380 | #----------------------------------------------------------------------------
381 | 
382 | # clean POS tags: delete every "[ADJ]" and "[VERB]" symbol, in any context
383 | 
384 | clean_adj_pos = CDRewrite[("[ADJ]":""),
385 |                           "",
386 |                           "",
387 |                           alphabets.sigma_star
388 |                          ];
389 | 
390 | clean_verb_pos = CDRewrite[("[VERB]":""),
391 |                            "",
392 |                            "",
393 |                            alphabets.sigma_star
394 |                           ];
395 | 
396 | clean_pos = Optimize[clean_adj_pos @ clean_verb_pos];
397 | 
398 | #----------------------------------------------------------------------------
399 | # full pipeline: exceptions, then the [ADJ] rules, then the [VERB] rules, then POS tag removal
400 | export inflections = Optimize[exceptions       @
401 |                               g_to_v           @
402 |                               e_stress_infl    @
403 |                               e_unstress_infl  @
404 |                               yu_stress_infl   @
405 |                               yu_unstress_infl @
406 |                               i_stress_infl    @
407 |                               i_unstress_infl  @
408 |                               ya_stress_infl   @
409 |                               ya_unstress_infl @
410 |                               verb_infl        @
411 |                               clean_pos
412 |                              ];
413 | 
414 | 


--------------------------------------------------------------------------------
/grammars/consonants.grm:
--------------------------------------------------------------------------------
  1 | # Licensed under the Apache License, Version 2.0 (the "License");
  2 | # you may not use this file except in compliance with the License.
  3 | # You may obtain a copy of the License at
  4 | #
  5 | #     http://www.apache.org/licenses/LICENSE-2.0
  6 | #
  7 | # Unless required by applicable law or agreed to in writing, software
  8 | # distributed under the License is distributed on an "AS IS" BASIS,
  9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 10 | # See the License for the specific language governing permissions and
 11 | # limitations under the License.
 12 | #
 13 | # Copyright 2014 Yandex LLC
 14 | # All Rights Reserved.
 15 | #
 16 | # Author : Alexis Wilpert
 17 | 
 18 | 
 19 | 
 20 | import 'alphabets.grm' as alphabets;
 21 | # SEP: word separator from alphabets.grm; WUD: the "[WUD]" tag followed by a word separator
 22 | SEP = alphabets.word_sep;
 23 | WUD = "[WUD]" SEP;
 24 | 
 25 | #----------------------------------------------------------------------------
 26 | # cluster simplification: <т> is dropped between <с> and <н> | <ч> | <ск>
 27 | letter_simplification1 = CDRewrite[("т":"")*,
 28 |                                    "с",
 29 |                                    ("н" | "ч" | "ск"),
 30 |                                    alphabets.sigma_star
 31 |                                   ];
 32 | 
 33 | # <д> is dropped between <з> and <н> | <ч>
 34 | letter_simplification2 = CDRewrite[("д":"")*,
 35 |                                    "з",
 36 |                                    ("н" | "ч"),
 37 |                                    alphabets.sigma_star
 38 |                                   ];
 39 | # <д> is dropped between <р> and <ч> | <ц>
 40 | letter_simplification3 = CDRewrite[("д":"")*,
 41 |                                    "р",
 42 |                                    ("ч" | "ц"),
 43 |                                    alphabets.sigma_star
 44 |                                   ];
 45 | # <в> is dropped between <л> and <ств>
 46 | letter_simplification4 = CDRewrite[("в":"")*,
 47 |                                    "л",
 48 |                                    "ств",
 49 |                                    alphabets.sigma_star
 50 |                                   ];
 51 | # <н> is dropped between <л> and <ц>
 52 | letter_simplification5 = CDRewrite[("н":"")*,
 53 |                                    "л",
 54 |                                    "ц",
 55 |                                    alphabets.sigma_star
 56 |                                   ];
 57 | 
 58 | # all five simplifications composed in the order listed
 59 | letter_simplifications = Optimize[letter_simplification1 @
 60 |                                   letter_simplification2 @
 61 |                                   letter_simplification3 @
 62 |                                   letter_simplification4 @
 63 |                                   letter_simplification5
 64 |                                  ];
 65 | 
 66 | #----------------------------------------------------------------------------
 67 | # consonant letter --> palatalized ("J") phone symbol
 68 | soft_cons = ("б":"[bJ]") |
 69 |             ("в":"[vJ]") |
 70 |             ("г":"[gJ]") |
 71 |             ("д":"[dJ]") |
 72 |             ("ж":"[ZJ]") |
 73 |             ("з":"[zJ]") |
 74 |             ("к":"[kJ]") |
 75 |             ("л":"[lJ]") |
 76 |             ("м":"[mJ]") |
 77 |             ("н":"[nJ]") |
 78 |             ("п":"[pJ]") |
 79 |             ("р":"[rJ]") |
 80 |             ("с":"[sJ]") |
 81 |             ("т":"[tJ]") |
 82 |             ("ф":"[fJ]") |
 83 |             ("х":"[xJ]") |
 84 |             ("щ":"[SJ]")
 85 | ;
 86 | # a consonant letter directly before <ь> is rewritten with soft_cons
 87 | softening  = CDRewrite[soft_cons,
 88 |                        "",
 89 |                        "ь",
 90 |                        alphabets.sigma_star]
 91 | ;
 92 | # consonant letter --> plain phone symbol
 93 | hard_cons = ("б":"b") |
 94 |             ("в":"v") |
 95 |             ("д":"d") |
 96 |             ("ж":"Z") |
 97 |             ("з":"z") |
 98 |             ("к":"k") |
 99 |             ("н":"n") |
100 |             ("р":"r") |
101 |             ("с":"s") |
102 |             ("т":"t") |
103 |             ("х":"x")
104 | ;
105 | # a consonant letter directly before <ъ> is rewritten with hard_cons
106 | hardening  = CDRewrite[hard_cons,
107 |                        "",
108 |                        "ъ",
109 |                        alphabets.sigma_star]
110 | ;
111 | # softening is applied first, then hardening
112 | hard_soft_letters = Optimize[softening @ hardening];
113 | 
114 | # delete every <ь> and <ъ>, in any context
115 | soft_hard_chars = ("ь":"") |
116 |                   ("ъ":"")
117 | ;
118 | 
119 | clean_soft_hard_chars  = CDRewrite[soft_hard_chars,
120 |                                    "",
121 |                                    "",
122 |                                    alphabets.sigma_star
123 |                                   ];
124 | 
125 | #----------------------------------------------------------------------------
126 | # context-free mapping from consonant letters to phone symbols
127 | letter_pairs = Optimize[("п":"p")       |
128 |                         ("б":"b")       |
129 |                         ("т":"t")       |
130 |                         ("д":"d")       |
131 |                         ("к":"k")       |
132 |                         ("г":"g")       |
133 |                         ("м":"m")       |
134 |                         ("н":"n")       |
135 |                         ("л":"l")       |
136 |                         ("р":"r")       |
137 |                         ("ф":"f")       |
138 |                         ("в":"v")       |
139 |                         ("с":"s")       |
140 |                         ("з":"z")       |
141 |                         ("ж":"Z")       |
142 |                         ("ш":"S")       |
143 |                         ("щ":"[SJ]")    |
144 |                         ("х":"x")       |
145 |                         ("ц":"[t_s]")   |
146 |                         ("ч":"[t_SJ]")
147 |                         ];
148 | # applied everywhere (empty left and right context)
149 | letter_g2p = CDRewrite[letter_pairs,
150 |                        "",
151 |                        "",
152 |                        alphabets.sigma_star
153 |                       ];
154 | 
155 | #----------------------------------------------------------------------------
156 | # voiced phone --> voiceless counterpart
157 | export devoicing_pairs = ("b":"p")           |
158 |                          ("d":"t")           |
159 |                          ("g":"k")           |
160 |                          ("[bJ]":"[pJ]")     |
161 |                          ("[dJ]":"[tJ]")     |
162 |                          ("[gJ]":"[kJ]")     |
163 |                          ("[d_d]":"[t_t]")   |
164 |                          ("[d_dJ]":"[t_tJ]") |
165 | 
166 |                          ("Z":"S")           |
167 |                          ("v":"f")           |
168 |                          ("z":"s")           |
169 |                          ("[ZJ]":"[SJ]")     |
170 |                          ("[vJ]":"[fJ]")     |
171 |                          ("[zJ]":"[sJ]")     |
172 | 
173 |                          ("[d_z]":"[t_s]")   |
174 |                          ("[d_ZJ]":"[t_SJ]")
175 | ;
176 | # voiceless phone --> voiced counterpart (the same pairs as devoicing_pairs, reversed)
177 | export voicing_pairs = ("p":"b")           |
178 |                        ("t":"d")           |
179 |                        ("k":"g")           |
180 |                        ("[pJ]":"[bJ]")     |
181 |                        ("[tJ]":"[dJ]")     |
182 |                        ("[kJ]":"[gJ]")     |
183 |                        ("[t_t]":"[d_d]")   |
184 |                        ("[t_tJ]":"[d_dJ]") |
185 | 
186 |                        ("S":"Z")           |
187 |                        ("f":"v")           |
188 |                        ("s":"z")           |
189 |                        ("[SJ]":"[ZJ]")     |
190 |                        ("[fJ]":"[vJ]")     |
191 |                        ("[sJ]":"[zJ]")     |
192 | 
193 |                        ("[t_s]":"[d_z]")   |
194 |                        ("[t_SJ]":"[d_ZJ]")
195 | ;
196 | # right context that triggers voicing: voiced consonants except [v], [vJ], [j], liquids and nasals
197 | export voicing_context = Optimize[(alphabets.voiced_consonants -
198 |                                     ("v" | "[vJ]" | "j" |
199 |                                      alphabets.liquids  |
200 |                                      alphabets.nasals
201 |                                     )
202 |                                   )
203 |                                  ];
204 | # right context that triggers devoicing: any voiceless consonant
205 | devoicing_context = alphabets.voiceless_consonants;
206 | # voiceless --> voiced before (WUD)? voicing_context
207 | voicing = CDRewrite[voicing_pairs*,
208 |                     "",
209 |                     WUD? voicing_context,
210 |                     alphabets.sigma_star
211 |                    ];
212 | # voiced --> voiceless before (WUD)? devoicing_context
213 | devoicing = CDRewrite[devoicing_pairs*,
214 |                       "",
215 |                       WUD? devoicing_context,
216 |                       alphabets.sigma_star
217 |                      ];
218 | # voicing is applied first, then devoicing
219 | export voice_assimilation = Optimize[voicing @ devoicing];
220 | 
221 | #----------------------------------------------------------------------------
222 | 
223 | # context-dependent palatalization rules (every right context may be preceded by an optional WUD)
224 | 
225 | # [d] --> [dJ] / _ (WUD)? ([dJ] | [zJ] | [nJ])
226 | phonetic_palatalization1 = CDRewrite[("d":"[dJ]"),
227 |                                      "",
228 |                                      WUD? ("[dJ]" | "[zJ]" | "[nJ]"),
229 |                                      alphabets.sigma_star
230 |                                     ];
231 | 
232 | # [d] --> [tJ] / _ (WUD)? [tJ]
233 | phonetic_palatalization2 = CDRewrite[("d":"[tJ]"),
234 |                                      "",
235 |                                      WUD? "[tJ]",
236 |                                      alphabets.sigma_star
237 |                                     ];
238 | 
239 | # [n] --> [nJ] / _ (WUD)? ([tJ] | [sJ] | [t_SJ] | [nJ] | [SJ])
240 | phonetic_palatalization3 = CDRewrite[("n":"[nJ]"),
241 |                                      "",
242 |                                      WUD? ("[tJ]"   | "[sJ]" |
243 |                                            "[t_SJ]" | "[nJ]" | "[SJ]"),
244 |                                      alphabets.sigma_star
245 |                                     ];
246 | 
247 | # [s] --> [SJ] / _ (WUD)? [t_SJ]
248 | phonetic_palatalization4 = CDRewrite[("s":"[SJ]"),
249 |                                      "",
250 |                                      WUD? "[t_SJ]",
251 |                                      alphabets.sigma_star
252 |                                     ];
253 | 
254 | # [s] --> [S]  / _ (WUD)? [S]
255 | phonetic_palatalization5 = CDRewrite[("s":"S"),
256 |                                      "",
257 |                                      WUD? "S",
258 |                                      alphabets.sigma_star
259 |                                     ];
260 | 
261 | # [s] --> [Z]  / _ (WUD)? [Z]
262 | phonetic_palatalization6 = CDRewrite[("s":"Z"),
263 |                                      "",
264 |                                      WUD? "Z",
265 |                                      alphabets.sigma_star
266 |                                     ];
267 | 
268 | # [s] --> [sJ] / _ (WUD)? ([sJ] | [tJ] | [nJ])
269 | phonetic_palatalization7 = CDRewrite[("s":"[sJ]"),
270 |                                      "",
271 |                                      WUD? ("[sJ]" | "[tJ]" | "[nJ]"),
272 |                                      alphabets.sigma_star
273 |                                     ];
274 | 
275 | # [s] --> [zJ] / _ (WUD)? [zJ]
276 | phonetic_palatalization8 = CDRewrite[("s":"[zJ]"),
277 |                                      "",
278 |                                      WUD? "[zJ]",
279 |                                      alphabets.sigma_star
280 |                                     ];
281 | 
282 | # [t] --> [dJ] / _ (WUD)? [dJ]
283 | phonetic_palatalization9 = CDRewrite[("t":"[dJ]"),
284 |                                      "",
285 |                                      WUD? "[dJ]",
286 |                                      alphabets.sigma_star
287 |                                     ];
288 | 
289 | # [t] --> [tJ] / _ (WUD)? ([tJ] | [sJ] | [t_SJ] | [nJ] | [SJ])
290 | phonetic_palatalization10 = CDRewrite[("t":"[tJ]"),
291 |                                       "",
292 |                                       WUD? ("[tJ]"   | "[sJ]" |
293 |                                             "[t_SJ]" | "[nJ]" | "[SJ]"),
294 |                                       alphabets.sigma_star
295 |                                      ];
296 | 
297 | # [v] --> [vJ] / _ [vJ]
298 | # [f] --> [vJ] / _ [vJ]
299 | phonetic_palatalization11a = CDRewrite[("v":"[vJ]") | ("f":"[vJ]"),
300 |                                        "",
301 |                                        WUD? "[vJ]",
302 |                                        alphabets.sigma_star
303 |                                       ];
304 | 
305 | # [f] --> [fJ] / _ [fJ]
306 | # [v] --> [fJ] / _ [fJ]
307 | phonetic_palatalization11b = CDRewrite[("f":"[fJ]") | ("v":"[fJ]"),
308 |                                        "",
309 |                                        WUD? "[fJ]",
310 |                                        alphabets.sigma_star
311 |                                       ];
312 | 
313 | 
314 | # [m] --> [mJ] / _ [mJ]
315 | phonetic_palatalization11c = CDRewrite[("m":"[mJ]"),
316 |                                        "",
317 |                                        WUD? "[mJ]",
318 |                                        alphabets.sigma_star
319 |                                       ];
320 | 
321 | # [r] --> [rJ] / _ [rJ]   (an optional WUD may stand between the two)
322 | phonetic_palatalization11d = CDRewrite[("r":"[rJ]"),         # tau: rewrite "r" as "[rJ]"
323 |                                        "",                   # lambda: empty left context
324 |                                        WUD? "[rJ]",          # rho: optional WUD, then "[rJ]"
325 |                                        alphabets.sigma_star  # sigma* argument, from the alphabets grammar
326 |                                       ];
327 | 
328 | # [l] --> [lJ] / _ [lJ]   (an optional WUD may stand between the two)
329 | phonetic_palatalization11e = CDRewrite[("l":"[lJ]"),         # tau: rewrite "l" as "[lJ]"
330 |                                        "",                   # lambda: empty left context
331 |                                        WUD? "[lJ]",          # rho: optional WUD, then "[lJ]"
332 |                                        alphabets.sigma_star  # sigma* argument, from the alphabets grammar
333 |                                       ];
334 | 
335 | # [z] --> [S]  / _ [S]   (an optional WUD may stand between the two)
336 | phonetic_palatalization12 = CDRewrite[("z":"S"),            # tau: rewrite "z" as "S"
337 |                                       "",                   # lambda: empty left context
338 |                                       WUD? "S",             # rho: optional WUD, then "S"
339 |                                       alphabets.sigma_star  # sigma* argument, from the alphabets grammar
340 |                                      ];
341 | 
342 | # [z] --> [Z]  / _ [Z]   (an optional WUD may stand between the two)
343 | phonetic_palatalization13 = CDRewrite[("z":"Z"),            # tau: rewrite "z" as "Z"
344 |                                       "",                   # lambda: empty left context
345 |                                       WUD? "Z",             # rho: optional WUD, then "Z"
346 |                                       alphabets.sigma_star  # sigma* argument, from the alphabets grammar
347 |                                      ];
348 | 
349 | # [z] --> [sJ] / _ [sJ] | [tJ]   (optional WUD allowed before the trigger)
350 | phonetic_palatalization14 = CDRewrite[("z":"[sJ]"),            # tau: rewrite "z" as "[sJ]"
351 |                                       "",                      # lambda: empty left context
352 |                                       WUD? ("[sJ]" | "[tJ]"),  # rho: optional WUD, then "[sJ]" or "[tJ]"
353 |                                       alphabets.sigma_star     # sigma* argument, from the alphabets grammar
354 |                                      ];
355 | 
356 | # [z] --> [zJ] / _ [zJ] | [dJ] | [nJ]   (optional WUD allowed before the trigger)
357 | phonetic_palatalization15 = CDRewrite[("z":"[zJ]"),                     # tau: rewrite "z" as "[zJ]"
358 |                                       "",                               # lambda: empty left context
359 |                                       WUD? ("[zJ]" | "[dJ]" | "[nJ]"),  # rho: optional WUD, then one of the three
360 |                                       alphabets.sigma_star              # sigma* argument, from the alphabets grammar
361 |                                      ];
362 | 
363 | phonetic_palatalization = Optimize[phonetic_palatalization1   @  # all palatalization rules composed in the
364 |                                    phonetic_palatalization2   @  # listed order: each rule rewrites the
365 |                                    phonetic_palatalization3   @  # output of the rule composed before it,
366 |                                    phonetic_palatalization4   @  # so reordering entries can change results
367 |                                    phonetic_palatalization5   @
368 |                                    phonetic_palatalization6   @
369 |                                    phonetic_palatalization7   @
370 |                                    phonetic_palatalization8   @
371 |                                    phonetic_palatalization9   @
372 |                                    phonetic_palatalization10  @
373 |                                    phonetic_palatalization11a @
374 |                                    phonetic_palatalization11b @
375 |                                    phonetic_palatalization11c @
376 |                                    phonetic_palatalization11d @
377 |                                    phonetic_palatalization11e @
378 |                                    phonetic_palatalization12  @
379 |                                    phonetic_palatalization13  @
380 |                                    phonetic_palatalization14  @
381 |                                    phonetic_palatalization15
382 |                                   ];
383 | 
384 | #----------------------------------------------------------------------------
385 | 
386 | # devoice final consonants (before WUD) in SIL or EOS context
387 | 
388 | wud_final_devoicing = CDRewrite[devoicing_pairs*,        # tau: closure (*) over devoicing_pairs
389 |                                 "",                      # lambda: empty left context
390 |                                 WUD ("[SIL]" | "[EOS]"), # rho: WUD (mandatory here), then "[SIL]" or "[EOS]"
391 |                                 alphabets.sigma_star     # sigma* argument, from the alphabets grammar
392 |                                ];
393 | 
394 | #----------------------------------------------------------------------------
395 | 
396 | # ([s] | [z]) --> EPS / _ [S] [t_SJ]
397 | 
398 | consonant_simplification_pairs = ("s":"") | ("z":"");  # delete "s" or "z" (rewrite to the empty string)
399 | 
400 | consonant_simplification1 = CDRewrite[consonant_simplification_pairs*,  # tau: closure (*) over the s/z deletions
401 |                                       "",                   # lambda: empty left context
402 |                                       ("S" "[t_SJ]"),       # rho: "S" immediately followed by "[t_SJ]"
403 |                                       alphabets.sigma_star  # sigma* argument, from the alphabets grammar
404 |                                      ];
405 | 
406 | consonant_simplification2 = CDRewrite["[t_s]":"",           # tau: delete "[t_s]" ...
407 |                                       "",                   # lambda: empty left context
408 |                                       "-"? "[t_s]",         # rho: ... when another "[t_s]" follows, optionally after "-"
409 |                                       alphabets.sigma_star  # sigma* argument, from the alphabets grammar
410 |                                      ];
411 | 
412 | consonant_simplification = Optimize[consonant_simplification1 @  # s/z deletion runs first,
413 |                                     consonant_simplification2    # then the "[t_s]" "[t_s]" reduction on its output
414 |                                    ];
415 | 
416 | #----------------------------------------------------------------------------
417 | 
418 | # gemination
419 | 
420 | # SYMBOL       transcription          example
421 | # ---------------------------------------------
422 | # PH: t_t      @_o t_t o k            отток
423 | # PH: t_tJ     p @_o t_tJ e m @       подтема
424 | # PH: d_d      @_o d_d a m            отдам
425 | # PH: d_dJ     @_o d_dJ e l           отдел
426 | # PH: s_s      r @_o s_s a d @        рассада
427 | # PH: s_sJ     r a s_sJ e l I n @     расселина
428 | # PH: n_n      v a n_n @              ванна
429 | # PH: n_nJ     v a n_nJ @             ванне
430 | # PH: l_l      vJ i l_l @             вилла
431 | # PH: l_lJ     vJ i l_lJ @            Вилли
432 | 
433 | hard_geminate_pairs = (("t":"[t_t]") "t") |  # first of two identical consonants becomes the
434 |                       (("d":"[d_d]") "d") |  # geminate symbol; the second consonant is matched
435 |                       (("s":"[s_s]") "s") |  # unchanged and is still present in the output
436 |                       (("n":"[n_n]") "n") |  # (clean_hard_geminate_pairs deletes it afterwards)
437 |                       (("l":"[l_l]") "l")
438 | ;
439 | 
440 | hard_gemination = CDRewrite[hard_geminate_pairs,  # tau: mark doubled hard consonants
441 |                             "",                   # lambda: empty left context
442 |                             "",                   # rho: empty right context (the pair itself carries the condition)
443 |                             alphabets.sigma_star  # sigma* argument, from the alphabets grammar
444 |                            ];
445 | 
446 | clean_hard_geminate_pairs = ("[t_t]" ("t":"")) |  # after a hard geminate symbol, delete the
447 |                             ("[d_d]" ("d":"")) |  # leftover plain consonant that follows it;
448 |                             ("[s_s]" ("s":"")) |  # the geminate symbol itself passes through
449 |                             ("[n_n]" ("n":"")) |  # unchanged
450 |                             ("[l_l]" ("l":""))
451 | ;
452 | 
453 | clean_hard_gemination = CDRewrite[clean_hard_geminate_pairs,  # tau: drop the consonant left behind by hard_gemination
454 |                                   "",                   # lambda: empty left context
455 |                                   "",                   # rho: empty right context
456 |                                   alphabets.sigma_star  # sigma* argument, from the alphabets grammar
457 |                                  ];
458 | 
459 | soft_geminate_pairs1 = ("t":"[t_tJ]") "[tJ]" |  # hard consonant directly before its soft
460 |                        ("d":"[d_dJ]") "[dJ]" |  # counterpart becomes the soft geminate symbol;
461 |                        ("s":"[s_sJ]") "[sJ]" |  # the soft consonant is matched unchanged.
462 |                        ("n":"[n_nJ]") "[nJ]" |  # NOTE(review): alternatives are not parenthesized as in
463 |                        ("l":"[l_lJ]") "[lJ]"    # hard_geminate_pairs; relies on concatenation binding tighter than "|"
464 | ;
465 | 
466 | soft_geminate_pairs2 = ("[tJ]":"[t_tJ]") "[tJ]" |  # first of two identical soft consonants becomes
467 |                        ("[dJ]":"[d_dJ]") "[dJ]" |  # the soft geminate symbol; the second one is
468 |                        ("[sJ]":"[s_sJ]") "[sJ]" |  # matched unchanged.
469 |                        ("[nJ]":"[n_nJ]") "[nJ]" |  # NOTE(review): alternatives are not parenthesized; relies on
470 |                        ("[lJ]":"[l_lJ]") "[lJ]"    # concatenation binding tighter than "|"
471 | ;
472 | 
473 | soft_gemination = CDRewrite[soft_geminate_pairs1 |  # tau: hard+soft pairs ...
474 |                             soft_geminate_pairs2,   # ... or soft+soft pairs
475 |                             "",                     # lambda: empty left context
476 |                             "",                     # rho: empty right context (the pair itself carries the condition)
477 |                             alphabets.sigma_star    # sigma* argument, from the alphabets grammar
478 |                            ];
479 | 
480 | clean_soft_geminate_pairs = ("[t_tJ]" ("[tJ]":"")) |  # after a soft geminate symbol, delete the
481 |                             ("[d_dJ]" ("[dJ]":"")) |  # leftover soft consonant that follows it;
482 |                             ("[s_sJ]" ("[sJ]":"")) |  # the geminate symbol itself passes through
483 |                             ("[n_nJ]" ("[nJ]":"")) |  # unchanged
484 |                             ("[l_lJ]" ("[lJ]":""))
485 | ;
486 | 
487 | clean_soft_gemination = CDRewrite[clean_soft_geminate_pairs,  # tau: drop the consonant left behind by soft_gemination
488 |                                   "",                   # lambda: empty left context
489 |                                   "",                   # rho: empty right context
490 |                                   alphabets.sigma_star  # sigma* argument, from the alphabets grammar
491 |                                  ];
492 | 
493 | gemination = Optimize[hard_gemination       @  # 1. mark doubled hard consonants
494 |                       clean_hard_gemination @  # 2. delete the consonant left after each hard geminate symbol
495 |                       soft_gemination       @  # 3. mark hard+soft and soft+soft pairs
496 |                       clean_soft_gemination    # 4. delete the consonant left after each soft geminate symbol
497 |                      ];
498 | 
499 | #----------------------------------------------------------------------------
500 | 
501 | export consonant_rules = Optimize[letter_simplifications   @  # exported rule: stages are composed in the listed
502 |                                   hard_soft_letters        @  # order, each one rewriting the previous stage's output
503 |                                   clean_soft_hard_chars    @
504 |                                   letter_g2p               @
505 |                                   voice_assimilation       @
506 |                                   phonetic_palatalization  @  # composed palatalization rules
507 |                                   wud_final_devoicing      @  # devoicing before WUD + "[SIL]"/"[EOS]"
508 |                                   consonant_simplification @  # s/z and "[t_s]" deletions
509 |                                   gemination                  # geminate marking and cleanup
510 |                                  ];
511 | 
512 | 


--------------------------------------------------------------------------------
/test/rus_sentences.txt.g2p:
--------------------------------------------------------------------------------
  1 | sU-vo-r@f s lJU_x-b@_o-pi_xt-stv@m r@_o-spra-S@_r-v@l sla-vn@-g@ mJI-tJeZ-nJI-k@ @_o jI-vo v@_o-je-n_n@_rx dJe_ist-vJI-jI-xJi n@_o-mJe-rJI-nJI-j@x I p@-vJos jI-vo f sJIm-bJirsk SIL kU-da dol-Z@_rn bi_xl prJI-je-x@tJ I graf pa-nJIn
  2 | I pro-st@ z@-d@_ox-nul-sJ@ @_ot nJI-@_o-Zi_x-d@-n_n@_o_i ra-d@-sJtJI
  3 | fsko-rJI n@_o-t_SJal n@_o-kra-p@_r-v@dJ doZdJ SIL m@_o-t_SJif-S@_r_i naz d@_o sa-m@-v@ mJe-st@ la-gJI-rJ@ Ek-spJI-dJi-t_s@-I SIL par-tJI-I jI-lJI-z@_o-vJe-t@_r vl@_o-dJi-mJI-r@_o-vni_x
  4 | @_o-nJi sJI-dJe-lJI n@_od mJi-r@m SIL U-t_SJast-vU_x-j@ v op-SJIm pr@_o-t_se-sJI Zi_x-zJnJI I gJi-bJI-lJI I prJI-s@-jI-dJI-nJa-j@ k sv@_o-je_i suSJ-n@-sJtJI fsJe dU-Si_x SIL lJit_SJ-n@-sJtJI I t_SJust-v@ ja
  5 | jI-vo tol-st@-j@ Se-j@ vzdu-l@zJ b@_o-gro-v@_r-mJI Zi_x-l@-mJI SIL I S@_r-ro-k@-jI skU-la-st@-jI lJI-t_so n@_o-lJi-l@sJ krovJ-jU_x
  6 | tru-s@sJtJ jI-vo nJI d@z-v@_o-lJa-l@ jI-mu x@-r@_o-SenJ-k@ Iz-jI-sJnJitJ-sJ@ SIL I tak s@_ol-gal on ra-z@ dva I-lJI trJi bJIs pra-vJIl SIL @_o kag g@-sp@_o-dJin z@-krJI-t_SJal jI-mu SIL Sto-b@_r on p@-Sol d@_o-mo_i SIL to tJem dJe-l@ I konJ-t_SJI-l@sJ
  7 | U-kla-d@_r-v@-jItJ-sJ@ v u-skJI_i t@_o-nelJ n@_o-ut_SJ-n@_rx zna-nJI_i SIL v@_r-smJe-I-v@-jItJ-sJ@ I-lJI nJI z@-mJI-t_SJa-jItJ-sJ@
  8 | k@-mJI-tJed gr@_oZ-dan-sk@-jI s@_o-dJe_ist-vJI-jI v@_r-d@-jot i_xm x@_o-da-t@_ist-v@ @_o rJI-gJI-stra-t_s@-I I @_o-br@_o-SJe-nJI-j@ k s@_o-trudJ-nJI-k@m mft s r@z-jI-sJnJe-nJI-jIm rJI-alJ-n@-g@ p@-l@_o-Ze-nJI-j@ e-tJIx lJU-dJe_i
  9 | @_od-na-k@ dnk s@_o-ma p@_o sJI-bJe nJI-sJot v@_oZ-nJe_i-SU_x-jU_x funk-t_s@-jU_x k@_o-dJi-r@-v@-nJI-j@ @_o-mJI-n@-kJI-slot-n@_o_i p@_o-slJe-d@-v@-tJIlJ-n@-sJtJI bJIl-kov SIL I mi_x nJI mo-Z@_rm pr@-Iz-volJ-n@ jI-jo m@-dJI-fJI-t_si_x-r@-v@tJ SIL nJI z@_o-tro-nU_xf e-t@_o_i In-f@_or-ma-t_s@-I I nJI I-zmJI-nJif sp@_o-sob-n@-sJtJI dnk k U_xd-v@_o-je-nJI-jU_x
 10 | sJI-t_SJas o-t_SJInJ InJ-tJI-rJe-snU_x-jU_x ZenJ-SJI-nU_x pJi-SU_xt
 11 | z@-st@_o-vlJa-lJI m@_o-lJitJ-sJ@ SIL g@_o-to-vJI-lJI je-dU_x SIL r@_o-ska-z@_r-v@-lJI @_o-nJIg-do-t@_r
 12 | @_o-fJI-t_se-r@m I @_of-t@_o-matJ-t_SJI-k@m Iz ro-t@_r @_o-xra-n@_r p@_o-lJa-k@f prJI-k@_o-zal t_SJSJa-tJIlJ-n@ @_o-sm@_o-trJetJ @_o-krJe-sn@sJtJ SIL @_o sam s nJem-t_s@-mJI I z@-nJIl-sJa nJI-p@_o-srJet-stvJI-n_n@ U-t_SJast-k@m SIL gdJe r@-sp@-l@_o-ga-l@sJ jI-dro grup-p@_r
 13 | k@-lJI-nJIn pr@_o-je-x@l SIL nJI z@-InJ-tJI-rJI-s@_o-val-sJ@
 14 | n@t_SJ-nJom z dJI-pU-ta-t@v
 15 | v nJUrn-bJer-gJI b@_r-la zn@-mJI-nJi-t@-j@ f@_o-mJi-lJI-j@ fU_xg-gJI-r@v SIL b@_on-kJi-r@v SIL kUp-t_sof t@_o-vo vrJe-mJI-nJI SIL vro-dJI rot-S@_rlJ-d@v SIL @_o-nJi prJI-dl@_o-Zi_x-lJI U kUr-fJir-st@ vzJatJ n@_o ot-kU_xp @_rn-dUlJ-gJen-t_s@-I
 16 | iz-za smut I bJI-sp@_o-rJat-k@f e-t@-v@ vrJe-mJI-nJI SIL sU_x-d@_o-xot-stv@ f p@r-tU-galJ-skJIx vo-d@x p@_ot_SJ-tJi prJI-kr@_o-tJi-l@sJ SIL tak Sto nJe-t_SJI-v@ bi_x-l@ @_o-p@_o-satJ-sJ@ da-Z@_r slU-t_SJa_i-n@-g@ @_ob-n@-rU-Ze-nJI-j@ sud-n@ v e-tJIx mJI-stax
 17 | bo-lJI-jI t@_o-vo SIL mi_x SJt_SJI-ta-jIm SIL Sto krJI-dJit v bu-dU_x-SJIm mo-Z@_rt statJ @_olJ-t@_rr-n@_o-tJi-v@_o_i dol-gU_x
 18 | pr@_o-Slo p@_o kra_i-nJI_i mJe-rJI s p@l-t_SJI-sa SIL p@_o-ka U-tJi-xlJI e-tJI xv@_o-ta-jU_x-SJI-jI z@_o sJer-t_s@ r@_r-da-nJI-j@
 19 | n@_o I-kra-n@x SIL b@_olJ-Si_xx I ma-l@_rx SIL t_s@_o-rJat @_o-mJI-rJI-kan-skJI-jI fJilJ-m@_r
 20 | pJI-rJI-xot k pr@_o-sto_i br@_o-nJi bi_xl b@_r slJiS-k@m krut I z@-mJo-tJIn bJIz ra-zn@_rx pJI-rJI-lJi-v@v SIL @_o-t_tJen-k@f I m@_o-stov
 21 | fsJex Z@_rlJ-jom @_o-bJI-spJe-t_SJIl
 22 | k@_og-da s@_o-trudJ-nJI-kJI kkp vi_x-sk@-t_SJI-lJI Iz zda-nJI-j@ SIL to n@_o U-ka-z@-n_n@m mJe-sJtJI @_ob-n@_o-ru-Z@_r-lJI lJiS klJU-t_SJi @_ot m@_o-Si_x-n@_r
 23 | da-Z@_r nJI-k@_o-la_i z@-prJI-SJal @_ob e-t@m slu-t_SJ@-jI fsp@-mJI-natJ SIL sra-zU_x bJI-lJI-nJil-sJ@
 24 | I d@_o-nJi-l@ s@_o-zo-n@_rd_ZJ z@-vJol r@-zg@_o-vor @_o p@-tJom-kJI-nJI SIL k@_o-to-r@_r_i g@-v@_o-rJil jI-mu SIL Sto pJI-rJI-xo-dJIt @_o-pJadJ z@_o m@_o-sko-fskU_x-jU_x z@_o-sta-vU_x
 25 | kom-n@-t@ SIL f k@_o-to-r@_o_i nas prJI-nJI-ma-lJI SIL b@_r-la SIL k@_o-nJet_SJ-n@ SIL sa-m@-j@ pr@_o-stor-n@-j@ v do-mJI SIL jI-jo z@_o-ra-nJI-jI mi_x-lJI I t_SJi-sJtJI-lJI I pJI-rJId o-br@-z@-mJI z@_o-tJe-plJ@-lJI l@_om-pa-d@_r
 26 | p@_o-slJedJ-nJI_i ka_if lJe-t@ pJI-rJIt skU-t_SJi-SJI_i U_x-t_SJo-b@_r I p@f-sJIdJ-nJe-vn@-sJtJI
 27 | ja vJi-dJIl tJI-pJerJ SIL Sto n@_o sa-m@m dJe-lJI on nJI U_x-sk@_o-gla-z@_r_i SIL @_o pro-st@ IsJ tJex lJU-dJe_i SIL k@_o-to-r@_r-jI fsJIg-da smo-trJ@t fprJI-SJur
 28 | tolJ-k@ Iz-vJI-nJi-tJI
 29 | vdruk p@-mJI-sJtJil-sJ@ p@-srJI-dJi t_SJI-sJtJi-lJI-SJ@ SIL kag b@_r nJI-k@_o-mu I nJI mJI-Sa-j@
 30 | t@_o-ko_i InJ-tJI-l_lJIk-tU-alJ-n@_r_i m@-r@_o-fon
 31 | sta-r@_r_i tUrk-mJen SIL proz-v@-n_n@_r_i x@_o-ro-br@_rx t_s@-rJom mJI-nJI-l@-jIm SIL r@-zdU-val slu-xJI SIL Sto v mJer-gJI-lJI sJI-dJid zlo_i dux I bu-dJId ZI-sto-k@ msJtJitJ kaZ-d@-mU_x SIL kto d@_i-dJod d@_o sJer-t_s@ g@_o-ri_x
 32 | b@-g@_o-slo-vJI-jI f kJi-jI-vJI on t_SJI-tal p@_o @_ok-vJI-na-tU_x
 33 | I sJI-v@_osJtJ-ja-n@f nJI U_x-dJI-vlJa-jItJ-sJ@ SIL Sto sJem-k@ pJi-S@_rt pJI-sJmo bra-tU_x SIL spJa-SJI-mU_x f s@_o-sJedJ-nJI_i kom-n@-tJI SIL SIL nJI d@_o t@_o-vo sJI-v@_osJtJ-ja-n@-vU_x
 34 | mi_x nJI m@_o-glJi @_o-prJI-dJI-lJitJ SIL Sto tJI-pJerJ SIL sol-nJIt_SJ-n@_r_i lJI dJenJ I-lJI nJI-pr@-nJI-t_sa-jI-m@_r_i tU-man SIL I-b@ v lJe-sU_x bi_x-lJI su-mJIr-kJI SIL kak v na-S@_rx S@_r-ro-t@x t_SJI-rJISJ t_SJas po-slJI sol-nJIt_SJ-n@-g@ z@_o-ka-t@
 35 | nJI I-zbJI-Zal e-t@_o_i u-t_SJ@-sJtJI I en
 36 | e-t@ Ze nJI pa-st@ SIL @_o t_SJi-st@-j@ @_o-tra-v@
 37 | z@_o nJim @_o-na SIL prJI-kra-sn@-j@ kak fsJIg-da SIL fsJa v bJe-l@m SIL fsJa f t_svJI-tax p@-mJI-ran-t_s@ SIL z dlJi-n_n@_rm blon-d@-v@_rm vU-a-lJIm n@_o g@-l@_o-vJe SIL k@_o-to-r@_r_i Z@_r-v@_o-pJi-sn@ spU-skal-sJ@ n@_o-zad SIL z blJI-sJtJa-SJIm Si_x-fr@m n@_o lJe-v@m plJI-t_SJe
 38 | je-slJI b@_r k@_o-ko_i-lJi-b@ svJI-SJe-n_nJIk vzdu-m@l U-stro-ItJ rJI-lJI-gJI-o-zn@_r-jI t_SJtJe-nJI-j@ dlJ@_o n@_o-ro-d@ f sv@_o-je_i Ze t_serk-vJI SIL no f t_SJI-si_x SIL k@_og-da nJed b@-g@-slU-Ze-nJI-j@ v nJe_i SIL dlJ@_o e-t@-v@ on prJId-v@_o-rJi-tJIlJ-n@ dol-Z@_rn @_r-spr@_o-sJitJ sJI-bJe @_o-so-b@-jI r@-zrJI-Se-nJI-jI jI-p@r-xJI-alJ-n@_o_i vla-sJtJI
 39 | kag bu-t_t@ ranJ-S@_r lJu-dJI s@_o vrJe-mJI-nJIm m@-l@_o-dJe-lJI
 40 | k@_o-bJin-k@ b@_r-la p@_o-t_SJi-SJI
 41 | sJInJ-tJabrJ v @_r-zra-I-lJI SIL mJe-sJ@t_s n@-v@_o-godJ-nJI_i I p@_o-e-t@-mU_x nJI o-t_SJInJ pJI-rJI-gru-Z@_r-n_n@_r_i p@_o-lJi-tJI-k@_o_i
 42 | zJdJe-l@-l@ e-t@ z gru-sn@_rm vJi-d@m ma-mJInJ-k@ SIL tol-st@-j@ da-m@ SIL b@_olJ-Sa-j@ kU-rJi-tJIlJ-nJI-t_s@ I spJI-t_s@_r-@_o-lJist-k@ f prJI-fJI-ranJ-sJI
 43 | z@_o-sJi-mU_x I sav-v@_o-tJi-j@
 44 | t_SJI-rJIz dva mJe-sJ@-t_s@ SIL vi_x b@_olJ-ni_x @_o-t_t@_o-vo SIL Sto nJI p@-bJI-rJI-glJisJ
 45 | r@-SJot U nJI-vo bi_xl pr@_o-sto_i SIL t_SJem mJenJ-S@_r go-sJtJI pr@_o-bu-dU_xt v la-gJI-rJI SIL tJem lut_SJ-S@_r SIL SIL la-gJIrJ fsJIg-da k@_o-zal-sJ@ jI-mu kJI-pJa-SJIm k@_o-tlom SIL jI-Z@_r-mJI-nut-n@ g@_o-to-v@_rm vz@_or-vatJ-sJ@
 46 | fsJIg-da mJIt_SJ-ta-l@ I-mJetJ mno-g@ dJI-tJe_i SIL @_o tut SJt_SJasJtJ-jI v rU-kJi SIL dJe-v@t_SJ-k@ sla-vn@-j@ SIL s@-m@-st@_o-ja-tJIlJ-n@-j@
 47 | I-lJI druk g dru-gU_x SIL vr@_oZ-dJeb-n@ prJI-stra-sn@_r-jI
 48 | g@-sU-dar-stvJI-n_n@-j@ tJI-rJI-to-rJI-j@ SIL d@_o-to-lJI z@-klJU_x-t_SJo-n_n@-j@ f prJI-dJe-l@x pJIr-v@-n@_o-t_SJalJ-n@-g@ r@-s_sJI-lJe-nJI-j@ vJI-lJI-k@_o-ru-sk@-g@ plJe-mJI-nJI SIL tJI-pJerJ pJI-rJI-xo-dJId d@-lJI-ko z@_o e-tJI prJI-dJe-l@_r I p@-sJtJI-pJe-n_n@ vbJI-ra-jIt f sJI-bJa fsJu ru-skU_x-jU_x r@_o-vnJi-nU_x SIL r@-spr@-str@_o-nJa-j@sJ kag d@_o gJI-@_o-gr@_o-fJi-t_SJI-skJIx jI-jo gr@_o-nJit_s SIL tak p@_ot_SJ-tJi vJI-zJdJe d@_o prJI-dJe-l@f ru-sk@-g@ n@-r@-d@-n@-sJI-lJe-nJI-j@
 49 | zJdJesJ prJeZ-dJI fsJI-vo b@_olJ-So-jI zn@_o-t_SJe-nJI-jI I-mJe-jIt pra-vJIlJ-n@_r_i @_od-bor p@-stU-pa-jU_x-SJIx n@_o v@_o-je-n_nU_x-jU_x sluZ-bU_x
 50 | mo-j@ k@_o-bi_x-l@-t@ jI-SJo zJI-mo_i d@_o t@_o-vo prJI-vi_x-kl@ s@_o-lo-mU_x v@_o-zJitJ SIL Sto z z@_o-kri_x-t@_r-mJI gl@_o-za-mJI p@_o t@_o-mu m@_or-Sru-tU_x x@_o-dJi-l@
 51 | s@_o-trudJ-nJI-kJI mJI-lJi-t_s@-I da-Z@_r v@_r-jIZ-Za-lJI n@_o strJel-kJI SIL gdJe o-t_SJInJ bi_x-str@ I d@_o-stup-n@ @_ob-j@-sJnJ@-lJI sv@_o-im @_op-p@_o-nJen-t@m nJI-@_op-x@_o-dJi-m@sJtJ @_o-sta-vJItJ f p@_o-ko-jI @_o-pJI-ka-jI-m@_rx I-lJI k@mJ-mJIr-san-t@v
 52 | lJU-bovJ SIL mJe-r@ @_o-d@-rJI-n_n@-sJtJI Zi_xzJnJ-jU_x lJU-dJe_i SIL no @_o-na SIL v@-prJI-kJi fsJI-mu SIL v o-t_SJInJ ma-l@_o_i sJtJe-pJI-nJI sJIk-sU-alJ-n@sJtJ
 53 | nU SIL vi_x xra-br@_r_i
 54 | box vJesJtJ SIL fstrJe-tJIm-sJ@ lJI jI-SJo
 55 | I-b@ kag b@_r nJI bi_xl t_SJI-l@_o-vJek mal SIL no jesJtJ k@_o-kJi-jI-t@ rJI-zUlJ-ta-t@_r jI-vo Zi_x-zJnJI
 56 | staf @_o-dJin ras vr@_o-zrJes s ma-tJIrJ-jU_x I sJo-str@-mJI SIL @_o-na nJI U-mJe-l@ s nJi-mJI s@_o_i-tJisJ sno-v@ SIL @_o @_o-nJi e-t@-v@ nJI I-ska-lJI
 57 | @_o-dJin z@_o-su-nU_xl jI-vo v n@_o-sJtJe-n_n@_r-jI t_SJI-si_x s kU-kuS-k@_o_i
 58 | t@_og-da ja SIL @_o-b@-r@_o-tJasJ SIL U-vJi-dJIl n@_o go-rJI pro-tJIf nas SIL z@_o rJet_SJ-k@_o_i SIL mno-Z@_rst-v@ k@_o-lJu-Z@_i SIL @_o svJerx t@_o-vo t_SJI-l@_o-vJeg dvat-t_s@dJ bJI-Zaf-S@_rx SIL Stop @_o-trJe-z@tJ nas tr@_o-ix @_ot na-S@_rx t@_o-va-rJI-SJI_i SIL mJeZ-dU_x tJem strJI-li_x si_x-p@-lJIsJ n@_o nas SIL kag grad
 59 | mi_x nJI @_o-st@_o-na-vlJI-v@-jIm-sJ@ n@_o d@_o-sJtJig-nU_x-t@m SIL st@_o-ra-j@sJ @_ox-v@_o-tJitJ vJesJ spJektr n@-pr@_o-vlJe-nJI_i p@d-g@_o-tof-kJI stU-dJen-t@v
 60 | @_o n@_o mnJe SIL ja znal SIL lJI-Za-l@ fsJa @_ot-vJet-stvJI-n_n@zJdJ z@_o U-spJex jI-jo
 61 | s@_o-bo-jU_x ix nJI z@-sl@_o-nJu SIL x@_o-tJa ja I af-t@r SIL vJIr-nJe-jI SIL @_od-no Is ft@-r@-sJtJI-pJe-n_n@_rx lJit_s n@_o z@_o-dax m@_o-s_sof-kJI
 62 | p@_o-ka-t_SJI-v@-j@ g@-l@_o-vo_i I zgor-bJIf-S@_rsJ SIL on v@z-vr@_o-SJa-jItJ-sJ@ k @_ok-nu SIL @_o-trJad dlJi-n_n@_o_i I nJI-ro-vn@_o_i t_sepJ-jU_x v@_r-p@_ol-za-jIt t_SJI-rJIz vo-r@-t@ plat_s-p@_o-ra-d@
 63 | no e-t@ @_o-k@_o-za-l@sJ rJI-alJ-n@sJtJ-jU_x SIL I tJI-pJerJ mJir U-Ze nJI bu-dJIt t@_o-kJim SIL k@_o-kJim on bi_xl ft_SJI-ra
 64 | p@_o SJt_SJasJtJ-jU_x SIL pa-d@-lJI dJI-rJevJ-j@ not_SJ-jU_x SIL t_SJa-s@ f t_SJI-ti_x-rJI SIL v bJI-zlJudJ-jI
 65 | sam t@_o-SJo_i SIL @_o mJe-st@ za-nJ@l SIL kag ba-b@ @_ot-kor-mlJI-n_n@-j@
 66 | nJI-vaZ-n@
 67 | k@_og-da mi_x vi_x-jI-x@-lJI SIL b@_r-la U-Ze not_SJ SIL I d@_o-ro-g@ lJI-Za-l@ t_SJI-rJIzJ dJef-stvJI-n_n@_r_i lJes s vJI-k@_o-vi_x-mJI so-sn@-mJI SIL lJist-vJI-n_nJI-t_s@_i I jelJ-jU_x gJI-gant-skJIx r@_o-zmJe-r@v
 68 | @_o-st@_o-va-l@sJ ro-vn@-j@ SIL sp@_o-ko_i-n@-j@ I nJI-z@_o-muZ-nJ@-j@
 69 | e-t@ sU-ka v @_o-t_sen-kJI mU_x-Z@_r-kof I ix dU-rat_s-kJIx ka-t_SJIstf p@_ot_SJ-tJi fsJIg-da b@_r-va-jIt pr@_o-va
 70 | im e-t@ nJI mJI-Sa-l@
 71 | no jI-SJo pi_x-r@m SIL d@_r-rJa-v@_rm b@_o-tJin-k@m on I tut U-spJel vrJe-z@tJ sJIr-Zan-tU_x mJeS nog SIL I e-t@ to-Z@_r s@_o SJI-stlJi-v@_o_i mi_xslJ-jU_x
 72 | n@-vJI-va-jIt mi_x-slJI @_o st@_o-bJilJ-n@-sJtJI I p@-st@_o-jan-stvJI
 73 | p@_o-sli_x-S@-lJIzJ Zen-skJI-jI fsxlJi-p@_r-v@-nJI-j@
 74 | @_o tak kag dJe-j@-tJIlJ-n@sJtJ jI-vo pr@-Is-x@_o-dJi-l@ srJI-dJi o-t_SJInJ m@-l@_o-di_xx lJU-dJe_i SIL prJI-nJI-maf-S@_rx jI-vo bJI-zgr@_o-nJit_SJ-nU_x-jU_x s@-m@-U-vJe-rJI-n_n@zJdJ z@_o glU_x-b@-k@_o-mi_x-slJI-jI I mu-dr@sJtJ SIL to b@lJ-S@_rn-stvo p@tJ-t_SJI-nJa-l@sJ jI-mu SIL I on @_r-mJel b@_olJ-So_i U-spJex v rJI-v@-lJU_x-t_s@_r-o-n_n@_rx krU-gax
 75 | st@_o-ra-j@sJ nJI zbJitJ-sJ@ s op-SJI-g@ Sa-g@ SIL on sno-v@ prJIt-sta-vJIl sJI-bJe s@_r-n@_o-vJe_i I mi_x-slJI-n_n@ @_o-br@_o-tJil-sJ@ k nJim s pr@-d@_ol-Ze-nJI-jIm sv@-jI-vo r@_o-ska-z@
 76 | op-SJIst-v@ SIL f k@_o-to-r@m vlast-v@-v@-l@ p@_or-tJi_i-n@-j@ n@-mJIn-kl@_o-tu-r@ SIL n@_osk-vosJ pr@_o-pJi-t@-n_n@-j@ dog-m@-mJI SIL nJI-I-zlJI-t_SJi-m@ b@_olJ-na-j@ U_x-t@_o-pJi-t_SJI-sk@_o_i I-dJI-@_o-lo-gJI-jI_i
 77 | e-t@-v@ ZI-la-jIt mo-j@ v@_r-so-k@-j@ p@-vJI-lJi-tJIlJ-nJI-t_s@
 78 | n@-st@_o-ja-SJI_i gr@_o-mJi-l@ s u-skJIm lbom SIL s l@_ox-ma-t@_r-mJI br@_o-vJa-mJI n@_od blJi-sk@ svJI-dJo-n_n@_r-mJI t_SJor-n@_r-mJI gl@_o-za-mJI SIL @_o dlJi-n_n@_r-jI rU-kJi SIL slo-vn@ klJIS-nJi SIL SIL @_od-no_i l@_o-donJ-jU_x fsJu m@_o-ju spJi-nU_x prJI-kro-jU_xt
 79 | tJe t_se-n_n@_r-jI rJI-zUlJ-ta-t@_r SIL @_o k@_o-to-r@_rx mi_x g@-v@_o-rJi-lJI SIL SIL I-tog dlJi-tJIlJ-n@-g@ I n@-prJ@-Zo-n_n@-g@ trU-da
 80 | @_os-v@-b@Z-dJo-n_n@_r_i @_ot nJI-@_op-x@_o-dJi-m@-sJtJI n@_o kaZ-d@m Sa-gU_x d@_o-ka-z@_r-v@tJ sv@_o-ju nJI-z@_o-vJi-sJI-m@sJtJ SIL fsJa-kJI_i dJe-l@l sv@-jo dJe-l@ sp@_o-ko_i-n@ SIL bJIz r@-zdr@_o-Ze-nJI-j@
 81 | sJI-godJ-nJ@ @_o-nJi bolJ-S@_r fsJI-vo b@_o-jatJ-sJ@ SIL Sto prJId-vor-n@-j@ znatJ I krup-n@_r-jI k@-pJI-t@_o-lJi-st@_r nJI z@-x@_o-tJat i_xx bratJ fsJIrJ-joz
 82 | sk@_o-rJe-jI fsJI-vo on nJI U-mJe-jIt p@_o-nJatJ pr@-tJI-v@_o-rJe-t_SJI-jI mJeZ-dU_x @_ot-vJet-stvJI-n_n@sJtJ-jU_x ix mJi-s_sJI-I I tJe-mJI jI-d@_o-vJi-t@_r-mJI x@-r@k-tJI-rJi-sJtJI-k@-mJI SIL k@_o-to-r@_r-mJI n@-dJI-lJa-jIt i_xx lJe-nJIn
 83 | nU SIL vot SIL mi_x I pr@_o-vJe-rJIm SIL mJi-l@-j@ trU-sJi-x@ SIL n@_o-skolJ-k@ @_o-pr@_ov-da-jU_xtJ-sJ@ va-S@_r stra-xJI
 84 | ZenJ-SJI-n@ @_o-bra-d@-v@-l@sJ r@-zg@_o-vo-rU_x SIL I s@_o-ma fsJe r@-sk@_o-za-l@ SIL l@_o-ska-j@ jI-vo u-sk@-jI lJI-t_so S@_rl-k@_o-vJi-st@_r-mJI sv@_o-i-mJI gl@_o-za-mJI
 85 | r@_o-sJi-I xva-tJIt n@_o fsJex
 86 | @_o gdJe-t@ SIL vot-k@ SIL gdJe-t@ SIL s@-m@_o-gon SIL gdJe-t@ SIL t_SJIr-nJi-l@ @_o-b@_rk-n@_o-vJe-n_n@_r-jI
 87 | no @_o-nJi nJI lJU_x-b@_o-pi_xt-n@_r
 88 | rJI-zU_xlJ-t@_o-tJi-vn@sJtJ
 89 | I fsJIg-da mi_xslJ box s@_o mno-jU_x
 90 | jI-fJim @_rg-n@_o-tJit_SJ tolJ-k@ mJI-ga-jIt SIL @_o d@_o g@_o-rJa-t_SJIx pJI-r@_oS-kof nJI d@_o-tra-gJI-v@-jItJ-sJ@
 91 | U tJo-tU_xS-kJI ma-rJI @_o-lJIk-sJe-jI-vn@_r @_o-na pr@-Z@_r-la nJI-dol-g@
 92 | e-t@ p@-st@_o-ja-n_n@-j@ nJI-@_o-bi_xt_SJ-n@zJdJ dlJ@_o nas SIL vJI-r@_o-jat-n@ SIL @_o-sno-v@-n@ n@_o Im-m@_o-nJent-n@_o_i I-l_lJu-zJI-I k@-U-zalJ-n@-sJtJI vrJe-mJI-n_n@_o_i @_or-g@-nJI-za-t_s@-I psJI-xJi-t_SJI-sk@-g@
 93 | si_xr SIL ma-sl@ SIL ko-Z@ SIL mJod SIL lJes I SIL d@_o-lo_i fa-brJI-kJI
 94 | @_od-no vrJe-mJ@ SIL n@-prJI-mJer SIL U_x-g@_o-va-rJI-v@-lJI el
 95 | sto-j@ U p@_od-no-Z@_r-j@ m@-jI-ka SIL na-d@ v@_r-s@_o-ko z@-dJI-radJ go-l@-vU_x SIL Sto-b@_r U-vJi-dJItJ jI-vo vJIr-Si_x-nU_x SIL I tolJ-k@ t@_og-da p@-sJtJI-ga-jIS fsJe vJI-lJi-t_SJI-jI s@-@_o-rU-Ze-nJI-j@
 96 | bolJ-S@_r nJI-t_SJI-go nJI g@-v@_o-rJit SIL g@-v@_o-rJit tolJ-k@ SIL Sto tam o-t_SJInJ plo-x@
 97 | tag Ze kak v n@_o-t_SJa-lJI pro-Sl@-g@ vJe-k@ ko-n_n@_r-jI skat_SJ-kJI SIL @_o v n@_o-t_SJa-lJI p@-z@_o-pro-Sl@-g@ SIL strJIlJ-ba Iz lu-k@
 98 | @_o-fon-sk@_o_i g@_o-ri_x
 99 | kak nJI znatJ @_on-drJe-j@ mJI-x@_o_i-li_x-t_SJ@
100 | sJtJI-p@n tJI-m@_o-xJin
101 | p@_o-slJedJ-nJI-jI m@-jo svJI-da-nJI-jI z go-g@-lJIm bi_x-l@ f pJI-tJIr-bur-gJI SIL k@_og-da on @_o-st@_o-na-vlJI-v@l-sJ@ v zJim-nJIm dv@_or-t_se SIL U ZU-ko-fsk@-g@
102 | mi_x U-Ze U-zna-lJI SIL Sto on s@-bJI-ral-sJ@ pr@_o-t_SJesJtJ nam no-v@-jI sv@-jo pr@-Iz-vJI-dJe-nJI-jI SIL no prJI-stU-pJitJ g dJe-lU_x bi_x-l@ nJI lJIx-ko
103 | go-g@lJ kak nJI f t_SJem nJI b@_r-va-l@ x@_o-dJil p@_o kom-n@-tJI SIL d@-br@_o-duS-n@ p@_ot-smJe-I-v@l-sJ@ n@_odJ nJe-k@-t@-r@_r-mJI op-SJI-mJI zn@_o-ko-m@_r-mJI SIL @_o @_o t_SJtJe-nJI-I I p@_o-mJi-nU_x nJI bi_x-l@
104 | da-Z@_r ras on n@-mJIk-nul SIL Sto moZ-n@ @_o-tl@_o-Zi_xdJ z@-sJI-da-nJI-jI
105 | on p@-d@-Sol g go-g@-lJU_x zza-dJI SIL @_o-SJu-p@l k@_or-ma-n@_r jI-vo fra-k@ SIL vi_x-t@-SJIl @_o-t_tu-d@ tJI-tratJ p@_ot_SJ-to-v@_o_i bU-ma-gJI v @_o-sJmuS-kU_x
106 | go-g@lJ sJIr-dJi-t@ vi_xx-v@-tJIl tJI-trat-kU_x SIL sJel mrat_SJ-n@ n@_o dJI-van I totJ-t_SJaz Ze n@_o-t_SJal t_SJI-tatJ prJI fsJI-op-SJIm m@_ol-t_SJa-nJI-I
107 | on t_SJI-tal bJIs pJI-rJI-ri_x-v@ d@_o tJex por SIL p@_o-ka I-st@_o-SJil-sJ@ vJesJ jI-vo go-l@s I z@-rJI-bJi-l@ v gl@_o-zax
108 | mi_x U-zna-lJI t@_o-kJim o-br@-z@m pJer-v@_r-jI t_SJI-ti_x-rJI gl@_o-vi_x mJort-v@_rx duS
109 | op-SJI_i smJex ma-l@ p@-r@_o-zJil go-g@-lJ@ SIL no Iz-jI-vlJe-nJI-jI nJI-lJI-t_sI-mJer-n@-g@ v@_o-stor-g@ SIL k@_o-to-r@-jI vJi-dJI-m@ bi_x-l@ n@_o fsJex lJi-t_s@x pot k@_o-nJet_s t_SJtJe-nJI-j@ SIL jI-vo tro-nU_x-l@
110 | on bi_xl d@_o-vo-lJIn
111 | kto-t@ sk@_o-zal SIL Sto prJI-vJet-stvJI-jI sJI-lJI-fa-n@ b@_o-so_i dJe-v@t_SJ-kJI SIL k@_o-to-rU_x-jU_x on s@_o-Za-jIt n@_o k@_o-zli_x vmJe-st@ pr@-v@dJ-nJI-ka @_ot k@_o-ro-b@t_SJ-kJI SIL nJI s@_of-sJem prJI-lJit_SJ-n@
112 | fsJe @_o-st@_olJ-ni_x-jI slu-S@-tJI-lJI v@_o-s_sta-lJI pro-tJIf e-t@-v@ z@-mJI-t_SJa-nJI-j@
113 | po-slJI t_SJtJe-nJI-j@ on z@_o-ku-t@l-sJ@ SIL p@_o @_o-b@_rk-n@_o-vJe-nJI-jU_x SIL f Su-bU_x d@_o sa-m@-v@ lba SIL sJel s@_o mno_i n@_o Iz-voSJ-t_SJI-k@ SIL I mi_x mol-t_SJ@ d@_o-je-x@-lJI d@_o zJim-nJI-g@ dv@_or-t_sa SIL gdJe ja jI-vo s_s@_o-dJil
114 | fsko-rJI p@_o-tom on @_o-pJatJ ISJ-t_SJes Is pJI-tJIr-bur-g@
115 | go-g@lJ @_o-bra-d@-v@l-sJ@ na-S@_i no-v@_o_i fstrJe-t_SJI SIL r@_o-spra-S@_r-v@l SIL k@_o-kJim pU_x-tJom prJi-b@_rl ja v @_r-ta-lJI-jU_x
116 | jI-mu k@_o-za-l@sJ SIL Sto po-slJI I-ta-lJI-I p@_o-rJiS st@_o-no-vJItJ-sJ@ sux I bJIZ-Zi_x-zJnJIn SIL @_o zn@_o-t_SJe-nJI-jI I-ta-lJI-I br@_o-sa-jItJ-sJ@ s@_o-mo s@_o-bo_i v gla-z@ po-slJI p@_o-rJiS-sk@_o_i Zi_x-zJnJI I p@_o-rJiS-skJIx @_rnJ-tJI-rJe-s@v
117 | fp@_o-slJet-stvJI-I on t_SJa-st@ r@z-vJI-val e-tU_x mi_xslJ
118 | mJeZ-dU_x tJem vrJe-mJ@ bi_x-l@ @_o-bJe-dJI-n_n@-jI
119 | on p@-vJol mJI-nJa v @_rz-vJe-snU_x-jU_x I-st@_o-rJi-t_SJI-skU_x-jU_x @_o-fsJtJe-rJI-jU_x SIL gdJe z@_o dlJi-n_n@_r-mJI st@_o-la-mJI SIL S@_o-ga-j@ p@_o grJa-zn@-mU_x po-lU_x I U-sa-Z@_r-v@-j@sJ pro-st@ n@_o sk@_o-mJe_i-k@x SIL sJtJI-ka-jItJ-sJ@ k @_o-bJe-dJI-n_n@-mU_x t_SJa-sU_x r@-zn@-@_o-bra-zJnJI_i-S@-j@ pu-blJI-k@
120 | e-t@ fsJe tod Ze rJis SIL b@_o-ra-S@_rk SIL ku-rJI-t_s@ SIL mJI-nJa-jItJ-sJ@ tolJ-k@ zJe-lJInJ p@_o vrJI-mJI-nam go-d@
121 | pr@-st@_o-ta SIL @_op-SJI-Zi_x-tJIlJ-n@sJtJ I-t@_olJ-jan-sk@-j@ fsJI-vo bo-lJI-jI kJI-da-jU_xtJ-sJ@ tut v gla-z@ SIL z@-st@_o-vlJa-j@ prJItJ-t_SJust-v@-v@tJ sJI-bJa I v@_o fsJex drU-gJix sfJe-r@x Zi_x-zJnJI
122 | go-g@lJ p@-r@_o-zJil mJI-nJa SIL @_od-na-k@ SIL k@_o-prJi-zn@_rm SIL vz@_r-ska-tJIlJ-n@_rm @_o-br@_o-SJe-nJI-jIm sv@_o-im s prJI-sluZ-nJI-k@m
123 | ra-z@ dva mJI-nJal on blJu-d@ rJi-s@ SIL n@-x@_o-dJa jI-vo to pJI-rJI-va-rJI-n_n@_rm SIL to nJI-d@_o-va-rJI-n_n@_rm SIL I fsJa-kJI_i ras prJI-sluZ-nJIk pJI-rJI-mJI-nJal blJu-d@ z d@-br@_o-duS-n@_o_i U-li_xp-k@_o_i
124 | p@-lU-t_SJif n@-k@_o-nJet_s t@_o-rJel-kU_x rJi-s@ p@_o sv@-jI-mu fku-sU_x SIL go-g@lJ prJI-stU-pJilk nJe_i s nJI-@_o-b@_r-t_SJa_i-n@-jU_x alt_SJ-n@sJtJ-jU_x SIL n@-kl@_o-nJasJ tak SIL Sto dlJi-n_n@_r-jI vo-l@-s@_r jI-vo U-pa-lJI n@_o sa-m@-jI blJu-d@ SIL I p@-gl@_o-SJa-j@ loS-kU_x z@_o loS-k@_o_i s@_o strasJtJ-jU_x I b@_r-str@_o-to_i SIL k@_o-kJi-mJI SIL g@-v@_o-rJat SIL @_o-b@_rk-n@_o-vJe-n_n@ @_o-tlJI-t_SJa-jU_xtJ-sJ@ z@_o st@_o-lom lJu-dJI SIL r@-sp@_o-lo-Z@_r-n_n@_r-jI k @_r-p@_o-xon-drJI-I
125 | f sJI-rJI-dJi-nJI @_o-bJe-d@ k nam p@_otJ-sJel d@_o-volJ-n@ plot-n@_r_i mUS-t_SJi-n@ SIL s kr@_o-sJi-v@_o_i SIL kru-gl@_o_i b@_o-rot-k@_o_i
126 | @_o-p@-r@_oZ-nJif sv@-jo blJu-d@ SIL go-g@lJ @_ot-kJi-nU_xl-sJ@ n@_o-zad SIL zJdJe-l@l-sJ@ vJe-sJIl SIL r@-zg@_o-vor-t_SJIf I n@_o-t_SJal SU-tJitJ s prJI-sluZ-nJI-k@m SIL jI-SJo tak nJI-da-vn@
127 | @_o-s@_r-pa-jI-m@_rm stro-gJI-mJI vi_x-g@-v@-r@-mJI I U_x-k@_o-rJi-zn@-mJI
128 | p@_o @_o-k@_onJ-t_SJa-nJI-I r@SJ-t_SJo-t@ z@_o @_o-bJed go-g@lJ @_o-sta-vJIl prJI-sluZ-nJI-kU_x SIL kak I fsJe drU-gJi-jI p@-sJI-tJi-tJI-lJI SIL dva b@_o_i-o-k@ SIL @_o k@_og-da ja s@_o sv@_o-je_i st@-r@_o-ni_x t_SJto-t@ pJI-rJI-l@_o-Zi_xl pro-tJIf e-t@_o_i skud-n@_o_i sum-m@_r SIL on @_o-st@-n@_o-vJil mJI-nJa z@-mJI-t_SJa-nJI-jIm
129 | Iz-vJe-sn@ SIL Sto Z@_r-tJe_i-sk@_o_i mu-dr@-sJtJI v nJem bi_x-l@ p@_ot_SJ-tJi stolJ-k@ Ze SIL skolJ-k@ I t@_o-lan-t@
130 | on bi_xl f sv@_o-je_i t@_o-rJel-kJI I mog SIL Sto jI-mu nuZ-n@ bi_x-l@ I-lJI Sto sto-I-l@ e-t@-v@ SIL pol-n@_o_i rU-ko_i SIL nJI d@_o-va-j@ sam nJI-t_SJI-go
131 | ja nJI-k@_og-da nJI x@_o-t_SJu @_o-bJe-d@tJ
132 | mnJe tak x@-r@_o-So v@_o dv@_o-rJe I-gratJ
133 | ja fsJu Zi_xzJnJ b@_r v@_o dv@_o-rJe I-gral
134 | I nJI-k@_og-da nJI @_o-bJe-d@l b@_r
135 | ja s@_of-sJem nJI lJU-blJu borSJ s k@_o-pu-st@_o_i
136 | I v@_op-SJe ja sup nJI lJU-blJu
137 | I ka-SU_x ja nJI lJU-blJu
138 | I k@_o-tlJe-t@_r to-Z@_r nJI o-t_SJInJ lJU-blJu
139 | ja lJU-blJu @_o-brJI-ko-s@_r
140 | vi_x je-lJI @_o-brJI-ko-s@_r
141 | ja tak lJU-blJu @_o-brJI-ko-s@_r
142 | no vot ma-m@ z@-vJot mJI-nJa jezJdJ borSJ SIL mnJe prJI-xo-dJItJ-sJ@ fsJo br@_o-satJ
143 | mo_i brad b@_o-ba lJu-bJId borSJ
144 | on smJI-jotJ-sJ@ SIL k@_og-da jezd borSJ SIL @_o ja mor-SJU_xsJ
145 | on v@_op-SJe fsJIg-da smJI-jotJ-sJ@ I ti_x-t_SJIt sJI-bJe loS-k@_o_i v nos vmJe-st@ rta SIL p@-t@_o-mu Sto jI-mu trJi go-d@
146 | nJet SIL borSJ ja m@_o-gu sjesJtJ
147 | I k@_o-tlJe-t@_r ja to-Z@_r sjI-da-jU_x
148 | vJI-n@_o-gra-t_t@ ja jem s U_x-d@_o-volJ-stvJI-jIm
149 | t@_og-da I s@_o-Za-jU_xt mJI-nJa z@_o r@_o-jalJ
150 | p@_o-Za-lU_i SIL ja sjel b@_r jI-SJo raz borSJ
151 | tolJ-k@ b@_r nJI I-gratJ n@_o r@_o-ja-lJI
152 | ja I-gra-jU_x SIL @_o brat sJI-dJit n@_o po-lU_x I smJI-jotJ-sJ@
153 | v rU-kax U nJI-vo z@-v@_od-na-j@ m@_o-Si_x-n@
154 | on @_o-t@_or-val @_ot m@_o-Si_x-n@_r k@-lJo-s@
155 | I k@_o-ta-jIt i_xx p@_o po-lU_x
156 | I e-t@ jI-mu o-t_SJInJ nra-vJItJ-sJ@
157 | nJIk-to jI-mu nJI mJI-Sa-jIt
158 | nJI z@-st@_o-vlJa-jIt @_r-gratJ n@_o r@_o-ja-lJI
159 | I p@-t@_o-mu jI-mu o-t_SJInJ vJe-sJI-l@
160 | pla-t_SJIt on o-t_SJInJ rJet-k@
161 | k@_og-da U nJI-vo t_SJto-nJI-butJ @_otJ-nJI-ma-jU_xt
162 | I-lJI k@_og-da jI-vo strJI-gut
163 | on s@-vJIr-Se-n_n@ nJI lJu-bJIt strJit_SJ-sJ@
164 | on tak I x@_o-dJil b@_r fsJu Zi_xzJnJ l@_ox-ma-t@_r_i
165 | n@_o e-t@ on nJI @_o-br@_o-SJa-jIt vnJI-ma-nJI-j@
166 | v op-SJIm SIL jI-mu x@-r@_o-So SIL @_o mnJe plo-x@
167 | pa-p@ s ma-m@_o_i slu-S@-jU_xt SIL kak ja I-gra-jU_x
168 | brat k@_o-ta-jIt p@_o po-lU_x k@-lJo-sJI-kJI
169 | z@_o @_ok-nom krJI-t_SJat t_SJI-ti_x-rJI bra-t@
170 | @_o-nJi krJI-t_SJat ra-zn@_r-mJI g@-l@_o-sa-mJI
171 | ja vJi-ZU_x v @_ok-no SIL @_o-nJi ma-SU_xt rU-ka-mJI
172 | @_o-nJi z@_o-vut mJI-nJa
173 | im @_odJ-nJim skut_SJ-n@
174 | 


--------------------------------------------------------------------------------