├── src ├── build │ └── .gitignore ├── yatts_util.h ├── 3rdparty │ └── utf8 │ │ ├── utf8.h │ │ ├── Makefile │ │ ├── unchecked.h │ │ ├── core.h │ │ └── checked.h ├── Makefile ├── Utf8Transducer.h ├── utf8ext.h ├── yatts_util.cpp ├── transduce.cpp └── Utf8Transducer.cpp ├── AUTHORS ├── COPYING ├── misc ├── README └── parser.yy.patch ├── grammars ├── make.sh ├── tester.sh ├── transduce.sh ├── README ├── g2p.grm ├── crossword.grm ├── definitions.grm ├── palatalization.grm ├── diphthongs.grm ├── syllabification.grm ├── vowels.grm ├── alphabets.grm ├── inflections.grm └── consonants.grm ├── test ├── README ├── rus_sentences.txt └── rus_sentences.txt.g2p └── README.md /src/build/.gitignore: -------------------------------------------------------------------------------- 1 | Utf8Transducer.d transduce.d yatts_util.d 2 | Utf8Transducer.o 3 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Principal Contacts: 2 | 3 | Alexis Wilpert 4 | Schamai Safra 5 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Licensed under the Apache License, Version 2.0 (the "License"); 2 | you may not use these files except in compliance with the License. 3 | You may obtain a copy of the License at 4 | 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | 13 | Copyright 2014 Yandex LLC 14 | -------------------------------------------------------------------------------- /misc/README: -------------------------------------------------------------------------------- 1 | Before compiling the Thrax package (http://openfst.cs.nyu.edu/twiki/bin/view/GRM/Thrax), 2 | you need to apply the patch file parser.yy.patch to the following file in the Thrax 3 | original source code distribution: 4 | 5 | /thrax-1.0.2/src/lib/main/parser.yy 6 | 7 | Like this: 8 | 9 | patch parser.yy parser.yy.patch 10 | 11 | The patch is required to interpret the strings in the grammar files as UTF-8 encoded 12 | (default is otherwise BYTE). 13 | 14 | Depending on the compiler's version or OS you are using, you might get some issues when 15 | trying to compile the program thraxcompiler. My experience is that this can be solved 16 | changing the order in which the libraries are passed to the compiler. 17 | -------------------------------------------------------------------------------- /misc/parser.yy.patch: -------------------------------------------------------------------------------- 1 | *** parser.yy.old 2012-06-23 21:00:50.000000000 +0200 2 | --- parser.yy 2014-06-23 11:12:36.601662500 +0200 3 | *************** 4 | *** 376,382 **** 5 | 6 | string_fst: 7 | quoted_fst_string 8 | ! { StringFstNode* node = new StringFstNode(StringFstNode::BYTE); 9 | node->AddArgument($1); 10 | node->SetLine($1->getline()); // Get the line from the actual text line. 11 | $$ = node; } 12 | --- 376,383 ---- 13 | 14 | string_fst: 15 | quoted_fst_string 16 | ! //{ StringFstNode* node = new StringFstNode(StringFstNode::BYTE); 17 | ! { StringFstNode* node = new StringFstNode(StringFstNode::UTF8); 18 | node->AddArgument($1); 19 | node->SetLine($1->getline()); // Get the line from the actual text line. 
20 | $$ = node; } 21 | -------------------------------------------------------------------------------- /grammars/make.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2014 Yandex LLC 16 | # All Rights Reserved. 17 | # 18 | # Author : Alexis Wilpert 19 | 20 | 21 | 22 | make clean 23 | rm -f Makefile 24 | rm -f *.far 25 | rm -f *.stackdump 26 | thraxmakedep g2p.grm 27 | make 28 | farextract g2p.far 29 | -------------------------------------------------------------------------------- /grammars/tester.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2014 Yandex LLC 16 | # All Rights Reserved. 
17 | # 18 | # Author : Alexis Wilpert 19 | 20 | 21 | 22 | thraxrewrite-tester --input_mode=utf8 --far=g2p.far --rules="\ 23 | READ,\ 24 | INFL,\ 25 | PALT,\ 26 | DIPH,\ 27 | VOWL,\ 28 | CONS,\ 29 | SYLL,\ 30 | CROS,\ 31 | WRIT\ 32 | " 33 | -------------------------------------------------------------------------------- /grammars/transduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Copyright 2014 Yandex LLC 16 | # All Rights Reserved. 17 | # 18 | # Author : Alexis Wilpert 19 | 20 | # Usage: transduce.sh INPUT_FILE -- runs INPUT_FILE through the FSTs extracted from g2p.far and writes the result to INPUT_FILE.g2p. 21 | # printf is used instead of "echo -e" because -e is not portable under /bin/sh; a missing argument exits with a non-zero status. 22 | if [ -z "$1" ] 23 | then 24 | printf '\nUsage: transduce.sh INPUT_FILE\n\n' >&2 25 | exit 1 26 | fi 27 | 28 | IN=$1 29 | OUT=$IN.g2p 30 | 31 | rm -f transduce.stackdump 32 | farextract g2p.far 33 | 34 | transduce -fst="\ 35 | READ,\ 36 | INFL,\ 37 | PALT,\ 38 | DIPH,\ 39 | VOWL,\ 40 | CONS,\ 41 | CROS,\ 42 | SYLL,\ 43 | WRIT\ 44 | " "$IN" > "$OUT" 45 | -------------------------------------------------------------------------------- /src/yatts_util.h: -------------------------------------------------------------------------------- 1 | /* Licensed under the Apache License, Version 2.0 (the "License"); 2 | * you may not use this file except in compliance with the License.
3 | * You may obtain a copy of the License at 4 | * 5 | * http://www.apache.org/licenses/LICENSE-2.0 6 | * 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 12 | * 13 | * Copyright 2014 Yandex LLC 14 | * All Rights Reserved. 15 | * 16 | * Author : Schamai Safra 17 | * 18 | * 19 | * yatts_util.h 20 | */ 21 | 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | 28 | #ifndef YATTS_UTIL_H_ 29 | #define YATTS_UTIL_H_ 30 | 31 | int to_uint(char const *s); 32 | 33 | std::vector tokenize_utf8_string( std::string* utf8_string, std::string* delimiter, int limit = 0 ); 34 | /* join: concatenates the elements of [begin, end) into a T, inserting the separator t between consecutive elements. An empty range yields a default-constructed T; a single element is returned without any separator. T must provide append() returning something that append() can be chained on. */ 35 | template 36 | T join(const A &begin, const A &end, const T &t) 37 | { 38 | T result; 39 | A it = begin; 40 | if (it != end) { 41 | result.append(*it++); 42 | } 43 | for( ; it!=end; ++it) { 44 | result.append(t).append(*it); 45 | } 46 | return result; 47 | } 48 | 49 | 50 | #endif /* YATTS_UTIL_H_ */ 51 | -------------------------------------------------------------------------------- /grammars/README: -------------------------------------------------------------------------------- 1 | To compile the rules and generate the final FSTs, just run ./make.sh. The 2 | exported FSTs are defined in the grammar file g2p.grm. The two FST files 3 | G2P1 and G2P2 are defined for being used with the Python transcriber 4 | script. They are split in two parts to minimize size on disk, but you may 5 | just export a single FST file, if you wish. 6 | 7 | After succesful compilation, you might test the rules using either: 8 | 9 | - tester.sh: a very simple interactive loop. Just write in or paste the 10 | words/sentences you want to test.
11 | 12 | - transduce.sh: which will transduce a file with words or sentences 13 | given as input. 14 | 15 | Please take the following into account: 16 | 17 | - you will only be able to test words or sentences that have previously 18 | been normalized. "Normalization" means in this case that the set of 19 | characters used in the input string must be contained in the input 20 | alphabet defined in the first FST (in_feeder in alphabets.grm). This 21 | is done already when you use scripts/tts_transcriber.py 22 | 23 | - second, the strings should contain a stress marker (the "+" char) for 24 | optimal accuracy. This information comes either from the exceptions 25 | lexicon or from the stress prediction model. 26 | 27 | Thanks: 28 | 29 | The implementation of the rules would not have been possible without the 30 | Russian language advice from my colleague at Yandex Anastasiya Polkanova. 31 | 32 | Sources: 33 | 34 | - Chew, Peter A. (2003): A Computational Phonology of Russian. Dissertation.com 35 | - Jones, Daniel & Ward, Dennis (1969): The Phonetics of Russian.
Cambridge University Press 36 | -------------------------------------------------------------------------------- /src/3rdparty/utf8/utf8.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 
25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "utf8/checked.h" 32 | #include "utf8/unchecked.h" 33 | 34 | #endif // header guard 35 | -------------------------------------------------------------------------------- /src/3rdparty/utf8/Makefile: -------------------------------------------------------------------------------- 1 | ### Makefile --- 2 | 3 | ## Author: 4 | ## Keywords: 5 | ## X-URL: 6 | 7 | TARGETS=../libs/libjson.a 8 | OBJS=json/json.o 9 | 10 | 11 | ######################################################################## 12 | # Macro definitions for "standard" C and C++ compilations 13 | ## 14 | #CPPFLAGS=-g3 -O0 -fprofile-arcs -ftest-coverage -Ijson -fPIC 15 | CPPFLAGS=-g3 -O0 -Ijson -fPIC 16 | # 17 | CFLAGS=-g 18 | # 19 | # What is the name of the program you want to create? (See below for notes 20 | # on using this makefile to generate multiple programs.) 21 | # 22 | LINK=g++ $(CPPFLAGS) 23 | #LINK=gcc $(CFLAGS) 24 | # 25 | # Define special linkage flags. Usually, these are used to include 26 | # special libraries of code, e.g., -lm to add the library of mathematical 27 | # routines such as sqrt, sin, cos, etc. 28 | LFLAGS=-lm 29 | # 30 | # 31 | # 32 | # In most cases, you should not change anything below this line. 
33 | # 34 | # The following is "boilerplate" to set up the standard compilation 35 | # commands: 36 | # 37 | .SUFFIXES: 38 | .SUFFIXES: .d .o .h .c .cc .C .cpp 39 | .c.o: ; $(CC) $(CFLAGS) -MMD -c $*.c 40 | .cc.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cc 41 | .C.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.C 42 | .cpp.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cpp -o $@ 43 | 44 | CC=gcc 45 | CPP=g++ 46 | 47 | %.d: %.c 48 | touch $@ 49 | %.d: %.cc 50 | touch $@ 51 | %.d: %.C 52 | touch $@ 53 | %.d: %.cpp 54 | touch $@ 55 | 56 | DEPENDENCIES = $(OBJS:.o=.d) 57 | 58 | # 59 | # Targets: 60 | # 61 | all: $(TARGETS) 62 | 63 | clean: 64 | -rm -f $(TARGETS) $(DEPENDENCIES) $(OBJS) make.dep 65 | 66 | ../libs/libjson.a: json/json.o 67 | ar rcs $@ $^ 68 | 69 | make.dep: $(DEPENDENCIES) 70 | -cat $(DEPENDENCIES) > make.dep 71 | 72 | include make.dep 73 | 74 | ### Makefile ends here 75 | -------------------------------------------------------------------------------- /grammars/g2p.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 
15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | 22 | import 'palatalization.grm' as palatal; 23 | import 'inflections.grm' as infl; 24 | import 'diphthongs.grm' as diphthongs; 25 | import 'vowels.grm' as vowels; 26 | import 'consonants.grm' as consonants; 27 | import 'syllabification.grm' as syll; 28 | import 'crossword.grm' as cross; 29 | 30 | 31 | export READ = Optimize[alphabets.in_feeder]; 32 | export INFL = Optimize[infl.inflections]; 33 | export PALT = Optimize[palatal.palatalization]; 34 | export DIPH = Optimize[diphthongs.diphthongs]; 35 | export VOWL = Optimize[vowels.reduced]; 36 | export CONS = Optimize[consonants.consonant_rules]; 37 | export CROS = Optimize[cross.crossword]; 38 | export SYLL = Optimize[syll.syllabified]; 39 | export WRIT = Optimize[alphabets.out_feeder]; 40 | 41 | export G2P1 = Optimize[alphabets.in_feeder @ 42 | infl.inflections @ 43 | palatal.palatalization @ 44 | diphthongs.diphthongs @ 45 | vowels.reduced @ 46 | alphabets.out_feeder 47 | ]; 48 | 49 | export G2P2 = Optimize[alphabets.in_feeder @ 50 | consonants.consonant_rules @ 51 | cross.crossword @ 52 | syll.syllabified @ 53 | alphabets.out_feeder 54 | ]; 55 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 
12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 15 | # 16 | # Author : Schamai Safra 17 | # 18 | # 19 | # Makefile 20 | 21 | 22 | 23 | TARGETS=transduce 24 | 25 | ### transduce: stand-alone transducer tool (UTF8 characters as input labels) 26 | OBJS_transduce=transduce.o yatts_util.o Utf8Transducer.o 27 | LIB_transduce = fst dl m rt 28 | 29 | LIBS=../libs /usr/local/lib 30 | 31 | TARGET_LIBS = ${LIB_${1}:%=-l%} 32 | TARGET_OBJS = ${OBJS_${1}:%=%} 33 | OBJS=${foreach target,${TARGETS:%=OBJS_%},${${target}}} 34 | 35 | ######################################################################## 36 | # Macro definitions for "standard" C and C++ compilations 37 | # 38 | CPPFLAGS= -g3 -O0 -std=c++11 -I../include -I./3rdparty -I/usr/include -fPIC 39 | # 40 | CFLAGS=-g 41 | # 42 | LINK=g++ $(CPPFLAGS) 43 | 44 | LFLAGS=$(LIBS:%=-L%) 45 | # 46 | .SUFFIXES: 47 | .SUFFIXES: .d .o .h .c .cc .C .cpp 48 | .c.o: ; $(CC) $(CFLAGS) -MMD -c $*.c 49 | .cc.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cc 50 | .C.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.C 51 | .cpp.o: ; $(CPP) $(CPPFLAGS) -MMD -c $*.cpp 52 | 53 | CC=gcc 54 | CPP=g++ 55 | 56 | %.d: %.c 57 | touch $@ 58 | %.d: %.cc 59 | touch $@ 60 | %.d: %.C 61 | touch $@ 62 | %.d: %.cpp 63 | touch $@ 64 | 65 | DEPENDENCIES = $(OBJS:.o=.d) 66 | 67 | # 68 | # Targets: 69 | # 70 | all: $(TARGETS:%=./build/%) 71 | .PHONY: all clean cleantargets 72 | .SECONDEXPANSION: 73 | ./build/%: $$(OBJS_%) 74 | echo dependencies $^ 75 | $(LINK) $(FLAGS) -o $@ $^ $(LFLAGS) $(call TARGET_LIBS,$*) 76 | # cleantargets removes the linked binaries from ./build (the same directory the link rule above writes to) together with the object files. 77 | cleantargets: 78 | -rm -f $(TARGETS:%=./build/%) $(OBJS) 79 | 80 | clean: cleantargets 81 | -rm -f $(DEPENDENCIES) 82 | 83 | ### Makefile ends here 84 | 85 | -------------------------------------------------------------------------------- /src/Utf8Transducer.h: -------------------------------------------------------------------------------- 1 | /* Licensed under the Apache License, Version 2.0 (the "License"); 2 | * you may not use this file except in compliance with the License.
3 | * You may obtain a copy of the License at 4 | * 5 | * http://www.apache.org/licenses/LICENSE-2.0 6 | * 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 12 | * 13 | * Copyright 2014 Yandex LLC 14 | * All Rights Reserved. 15 | * 16 | * Author : Schamai Safra 17 | * 18 | * 19 | * Utf8Transducer.h 20 | */ 21 | 22 | 23 | 24 | #ifndef UTF8TRANSDUCER_H_ 25 | #define UTF8TRANSDUCER_H_ 26 | #include 27 | #include 28 | 29 | namespace yatts { 30 | 31 | using namespace fst; 32 | 33 | static const int maxMsgLength = 1000; 34 | 35 | class Utf8Transducer { 36 | public: 37 | enum Status { 38 | OK, 39 | WARN, 40 | ERROR, 41 | }; 42 | Utf8Transducer(); 43 | virtual ~Utf8Transducer(); 44 | Status appendFst(VectorFst*& transducer, string id = string("")); 45 | Status appendFst(const string& file_name, string id = string("")); 46 | Status transduceText(string text, string& result); 47 | Status readOrNewSymtab(string file, string name); 48 | const char* getMessage() const; 49 | 50 | protected: 51 | template 52 | VectorFst * MakeInputFST(vector input); 53 | vector*> transducers; 54 | SymbolTable * symbolTable; 55 | vector transducer_ids; 56 | char message[maxMsgLength]; 57 | }; 58 | 59 | template 60 | VectorFst * Utf8Transducer::MakeInputFST(vector input) { 61 | typedef typename Arc::StateId StateId; 62 | typedef typename Arc::Weight Weight; 63 | typedef typename Arc::Label Label; 64 | fst::VectorFst * ifst = new fst::VectorFst(); 65 | ifst->DeleteStates(); 66 | StateId s = ifst->AddState(), nextstate = fst::kNoStateId; 67 | ifst->SetStart(s); 68 | for (size_t i = 0; i < input.size(); i++) { 69 | nextstate = ifst->AddState(); 70 | Arc arc(input[i], input[i], Weight::One(), nextstate); 71 | 
ifst->AddArc(s, arc); 72 | s = nextstate; 73 | } 74 | ifst->SetFinal(s, Weight::One()); 75 | return ifst; 76 | } 77 | 78 | 79 | } /* namespace yatts */ 80 | 81 | 82 | #endif /* UTF8TRANSDUCER_H_ */ 83 | -------------------------------------------------------------------------------- /test/README: -------------------------------------------------------------------------------- 1 | This directory contains the following files: 2 | 3 | - rus_sentences.txt: 173 Russian sentences to test the transcriber output. 4 | - rus_sentences.txt.g2p: the phonetic transcription of all 173 sentences. 5 | - rus_sentences.txt.log: transcription log for every sentence, for debugging purposes. 6 | 7 | The log file contains the following information (in this order): 8 | 9 | [SNUM]: sentence number. It should correspond to the line number in the previous file. 10 | Useful to quickly locate any sentence. 11 | [SENT]: the original sentence. 12 | [YOWR]: a word in the sentence was reconstructed as having a yo letter. 13 | [NORM]: the normalized sentence (currently only case normalization and separation 14 | of punctuation symbols). 15 | [WORD]: the word being processed. 16 | [POSP]: POS prediction output for the word if available. 17 | 18 | One of the following: 19 | 20 | [DISA]: if the word was found in the homographs list. Values here can be: 21 | * morpho-syntactic tags that were used for the disambiguation. 22 | * "SINGLETON": the word is not really a homograph (only one unique 23 | transcription found). 24 | * "LEX1": no disambiguation possible, entry marked by LEX1 was chosen. 25 | * "FREQ": no LEX information, the most frequent variant was chosen 26 | (or the first one, if all were equally frequent). 27 | [INFO]: word was not found in the homograph list. Possible values: 28 | * "entry found in user lexicon". 29 | * "entry found in lexicon". 30 | * "stress predicted". 31 | [STRS]: predicted string with stress information, if the word was not found in 32 | any dictionary.
33 | 34 | [SPHO]: phonetic transcription for the whole sentence after applying cross-word 35 | assimilations. 36 | 37 | Notes/disclaimer: 38 | 39 | - The output files (g2p and log) are kept here for information purposes only. More 40 | specifically, you should not expect to get the same results if you run the 41 | transcriber on the input sentences. The final output will heavily depend on the 42 | actual stress prediction model that you use. It can also depend on the version of 43 | the software packages on which the transcription process depends. 44 | 45 | - The transcriptions in rus_sentences.txt.g2p are not guaranteed to be correct. 46 | Actually, the contrary is the case: since the version of the transcriber uploaded 47 | to GitHub does not contain any POS prediction software, many of the words whose 48 | pronunciation depends on their function in the sentence will not be predicted 49 | correctly. 50 | 51 | Thanks: 52 | 53 | The test sentences were carefully selected by my colleague at Yandex Anastasiya 54 | Polkanova. They were used to assess the quality of the TTS transcriber. 55 | -------------------------------------------------------------------------------- /grammars/crossword.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved.
15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | import 'definitions.grm' as defs; 22 | import 'consonants.grm' as cons; 23 | 24 | # set/class definitions (local aliases for symbols exported by definitions.grm and consonants.grm) 25 | 26 | LSEP = defs.LSEP; 27 | RSEP = defs.RSEP; 28 | EOS = "[EOS]" | "[SIL]"; 29 | vowel = defs.vowel; 30 | cons_letter_hyphen = defs.cons_letter_hyphen; 31 | stress_minus_1 = defs.stress_minus_1; 32 | before_stress = defs.before_stress; 33 | all = defs.all; 34 | soft_cons = defs.soft_cons; 35 | hard_cons = defs.hard_cons; 36 | 37 | devoicing_pairs = cons.devoicing_pairs; 38 | voicing_pairs = cons.voicing_pairs; 39 | voicing_context = cons.voicing_context; 40 | 41 | #---------------------------------------------------------------------------- 42 | 43 | # devoicing: devoicing_pairs is applied with an empty left context when the right context is either EOS or a word separator followed by devoicing_context 44 | 45 | devoicing_context = alphabets.voiceless_consonants | 46 | "n" | "[nJ]" | "m" | "[mJ]" | 47 | "l" | "[lJ]" | "r" | "[rJ]" | 48 | "j" | "v" | "[vJ]" | 49 | ("+"? alphabets.nuclei) 50 | ; 51 | 52 | cross_devoicing = CDRewrite[devoicing_pairs, 53 | "", 54 | EOS | 55 | (alphabets.word_sep 56 | devoicing_context), 57 | alphabets.sigma_star 58 | ]; 59 | 60 | #---------------------------------------------------------------------------- 61 | 62 | # voicing: voicing_pairs is applied with an empty left context when the right context is a word separator followed by voicing_context 63 | 64 | cross_voicing = CDRewrite[voicing_pairs, 65 | "", 66 | (alphabets.word_sep 67 | voicing_context), 68 | alphabets.sigma_star 69 | ]; 70 | 71 | #---------------------------------------------------------------------------- 72 | 73 | # remove [WUD] trigger (marking certain function words in the lexicon) 74 | 75 | clean_wud = CDRewrite[("[WUD]":""), 76 | "", 77 | "", 78 | alphabets.sigma_star 79 | ]; 80 | 81 | #---------------------------------------------------------------------------- 82 | # exported cascade, composed in this order: cross-word devoicing, cross-word voicing, voice assimilation (from consonants.grm), [WUD] removal 83 | export crossword = Optimize[cross_devoicing @ 84 | cross_voicing @ 85 | cons.voice_assimilation @ 86 | clean_wud 87 | ]; 88 | -------------------------------------------------------------------------------- /grammars/definitions.grm:
-------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | 22 | # set/class definitions 23 | 24 | export LSEP = (alphabets.word_sep | "-"); 25 | export RSEP = (alphabets.word_sep); 26 | 27 | # phonetic soft consonants, as produced by the palatalization rules (palatalization.grm) 28 | soft_cons_phonetic = ("[bJ]" | "[vJ]" | "[gJ]" | 29 | "[dJ]" | "[zJ]" | "[kJ]" | 30 | "[lJ]" | "[mJ]" | "[nJ]" | 31 | "[pJ]" | "[rJ]" | "[sJ]" | 32 | "[tJ]" | "[fJ]" | "[xJ]" 33 | ); 34 | 35 | # stressed vowels are already phonetic 36 | export vowel = Optimize[(alphabets.vow_letter | alphabets.nuclei)]; 37 | 38 | # [j] could appear after the approximant insertion 39 | export cons_letter_hyphen = alphabets.cons_letter | 40 | soft_cons_phonetic | 41 | "j" | 42 | "-" 43 | ; 44 | # zero or more consonant symbols/hyphens immediately followed by the stress marker "+" 45 | export stress_minus_1 = Optimize[cons_letter_hyphen* "+"]; 46 | # zero or more consonant symbols, hyphens or vowels followed by the stress marker "+" 47 | export before_stress = Optimize[(cons_letter_hyphen | vowel)* "+"]; 48 | # any single consonant symbol, hyphen or vowel 49 | export all = Optimize[cons_letter_hyphen | vowel]; 50 | 51 | # Vi = <и, е, ё, я, ю> - vowels that trigger palatalization 52 | # Cvar = <б, п, в, ф, з, с, г, к, д, х, т, л, м, н, р> 53 | # 54 | # Cvar / ¬_ Vi --> hard consonant 55 | # Cvar / _ Vi --> soft consonant 56 | # Cvar / _ <ь> --> soft consonant 57 | # Cvar / _ <ъ> --> hard consonant 58 | # 59 | # always hard
letters: 60 | # <ж, ш, ц> 61 | # 62 | # always soft letters: 63 | # <ч, щ> 64 | # 65 | # "Vi-palatalization" does not exist at the words junction. 66 | 67 | # the rest of the contexts are not needed, because they are represented 68 | # by the phonetic palatal consonants that were generated previously 69 | export always_soft_cons = ("ч" | "щ" | "ь") "-"?; 70 | 71 | cd_soft_cons = (soft_cons_phonetic | "j") "-"?; 72 | 73 | export always_hard_cons = ("ж" | "ш" | "ц" | "ъ") "-"?; 74 | 75 | # no other palatalization will take place, so we can safely assume that 76 | # consonants that are not phonetically soft, are hard consonants 77 | # not a context-dependent rule, but we leave it as it is in case of changes 78 | # in the future 79 | cd_hard_cons = ("б" | "п" | "в" | 80 | "ф" | "з" | "с" | 81 | "г" | "к" | "д" | 82 | "х" | "т" | "л" | 83 | "м" | "н" | "р" 84 | ) "-"? 85 | ; 86 | 87 | export soft_cons = Optimize[(always_soft_cons | cd_soft_cons)]; 88 | 89 | export hard_cons = Optimize[(always_hard_cons | cd_hard_cons)]; 90 | 91 | -------------------------------------------------------------------------------- /src/utf8ext.h: -------------------------------------------------------------------------------- 1 | /* Licensed under the Apache License, Version 2.0 (the "License"); 2 | * you may not use this file except in compliance with the License. 3 | * You may obtain a copy of the License at 4 | * 5 | * http://www.apache.org/licenses/LICENSE-2.0 6 | * 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 12 | * 13 | * Copyright 2014 Yandex LLC 14 | * All Rights Reserved. 
15 | * 16 | * Author : Schamai Safra 17 | * 18 | * 19 | * utf8ext.h 20 | */ 21 | 22 | 23 | 24 | #ifndef UTF8EXT_H_ 25 | #define UTF8EXT_H_ 26 | 27 | #include "utf8/utf8.h" 28 | #include 29 | 30 | namespace utf8 { 31 | 32 | using namespace std; 33 | template 34 | T* begin(T(&arr)[N]) { return &arr[0]; } 35 | template 36 | T* end(T(&arr)[N]) { return &arr[0]+N; } 37 | /* is_in: true if x occurs in the array arr. The lookup uses lower_bound, so arr must be sorted in ascending order. */ 38 | template 39 | bool is_in(T x, const T(&arr)[N]) { 40 | const T* lb = lower_bound(begin(arr), end(arr), x); 41 | return (end(arr) != lb) && (*lb == x); 42 | } 43 | 44 | /* Code point tables searched through is_in(); keep every table in ascending order. */ 45 | const uint32_t WHITESPACE_CHARS[] = {32, 160, 5760, 6158, 8192, 8193, 8194, 8195,8196, 8197, 8198, 8199, 8200, 8201, 8202, 8203, 8204, 8205, 8239, 8287, 8288, 12288, 65279}; 46 | 47 | const uint32_t CR = 13, LF = 10; 48 | const uint32_t EOL_CHARS[] = {10, 11, 12, 13, 133, 8232, 8233}; 49 | 50 | // CR=13, LF=10, 11, 12, 133, 8232, 8233 51 | 52 | /* is_whitespace / is_EOL: true if cp is listed in WHITESPACE_CHARS / EOL_CHARS respectively. */ 53 | inline bool is_whitespace(uint32_t cp) { 54 | return is_in(cp, WHITESPACE_CHARS); 55 | } 56 | 57 | inline bool is_EOL(uint32_t cp) { 58 | return is_in(cp, EOL_CHARS); 59 | } 60 | 61 | 62 | /** 63 | * like utf8::next, but in case of invalid sequence, 64 | * consumes irreparable octets before throwing.
65 | * @param it in/out position in the octet range. On INVALID_LEAD it is advanced by one octet; on INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE and INVALID_CODE_POINT it is advanced by one octet and then past any following trail octets; on NOT_ENOUGH_ROOM this function does not move it. NOTE(review): the recovery code presumably relies on validate_next leaving it at the first octet of the offending sequence on failure, and on cp being written before INVALID_CODE_POINT is reported (otherwise invalid_code_point carries 0) -- confirm against the bundled utf8 library. 66 | * @param end end of the octet range 67 | * @return next code point, if valid; otherwise throws not_enough_room, invalid_utf8 (carrying the first offending octet) or invalid_code_point 68 | */ 69 | template 70 | uint32_t next_skip_invalid(octet_iterator& it, octet_iterator end) 71 | { 72 | uint32_t cp = 0; 73 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); 74 | octet_iterator bad; 75 | switch (err_code) { 76 | case internal::UTF8_OK : 77 | break; 78 | case internal::NOT_ENOUGH_ROOM : 79 | throw not_enough_room(); 80 | break; 81 | case internal::INVALID_LEAD : 82 | bad = it; 83 | it++; 84 | throw invalid_utf8(*bad); 85 | break; 86 | case internal::INCOMPLETE_SEQUENCE : 87 | case internal::OVERLONG_SEQUENCE : 88 | bad = it; 89 | it++; 90 | while (it != end && utf8::internal::is_trail(*it)) 91 | ++it; 92 | throw invalid_utf8(*bad); 93 | break; 94 | case internal::INVALID_CODE_POINT : 95 | it++; 96 | while (it != end && utf8::internal::is_trail(*it)) 97 | ++it; 98 | throw invalid_code_point(cp); 99 | } 100 | return cp; 101 | } 102 | 103 | 104 | } 105 | 106 | 107 | #endif /* UTF8EXT_H_ */ 108 | -------------------------------------------------------------------------------- /grammars/palatalization.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved.
15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | import 'definitions.grm' as defs; 22 | 23 | # approximant insertion is necessary when: 24 | # - are after another vowel 25 | # - or after <ь, ъ> 26 | # - or at the beginning of the word. 27 | 28 | BOS = "[BOS]" | "[SIL]"; 29 | SEP = alphabets.word_sep; 30 | WUD = "[WUD]" SEP; 31 | 32 | #---------------------------------------------------------------------------- 33 | 34 | insertion_context = (alphabets.vow_letter | 35 | "й" | 36 | ("ь" | "ъ") | 37 | defs.LSEP | 38 | BOS 39 | ) "+"? 40 | ; 41 | 42 | insertion_pairs = ("е":"jе") | 43 | ("ю":"jю") | 44 | ("я":"jя") | 45 | ("ё":"jё") 46 | ; 47 | 48 | initial_approximant_insertion = CDRewrite[insertion_pairs, 49 | insertion_context, 50 | "", 51 | alphabets.sigma_star 52 | ]; 53 | 54 | # correct approximant insertion after WUD 55 | 56 | approx_insertion_correction = CDRewrite[("j":""), 57 | "[WUD]" SEP, 58 | "", 59 | alphabets.sigma_star 60 | ]; 61 | 62 | approximant_insertion = Optimize[initial_approximant_insertion @ 63 | approx_insertion_correction 64 | ]; 65 | 66 | # WARNING: possible output --> + j V 67 | 68 | #---------------------------------------------------------------------------- 69 | 70 | # we do the palatalization here since later we will loose the orthographic 71 | # context triggered by <я>, <ю> and <ё> 72 | 73 | # we need to take this in consideration in the rules below 74 | 75 | palatal_cons_pairs = ("б":"[bJ]") | 76 | ("в":"[vJ]") | 77 | ("г":"[gJ]") | 78 | ("д":"[dJ]") | 79 | ("з":"[zJ]") | 80 | ("к":"[kJ]") | 81 | ("л":"[lJ]") | 82 | ("м":"[mJ]") | 83 | ("н":"[nJ]") | 84 | ("п":"[pJ]") | 85 | ("р":"[rJ]") | 86 | ("с":"[sJ]") | 87 | ("т":"[tJ]") | 88 | ("ф":"[fJ]") | 89 | ("х":"[xJ]") 90 | ; 91 | 92 | cd_palatalization = CDRewrite[palatal_cons_pairs, 93 | "", 94 | "ь" | ("-"? "+"? 
95 | alphabets.soft_vow_letter), 96 | alphabets.sigma_star 97 | ]; 98 | 99 | #---------------------------------------------------------------------------- 100 | 101 | export palatalization = Optimize[approximant_insertion @ 102 | cd_palatalization 103 | ]; 104 | 105 | -------------------------------------------------------------------------------- /src/yatts_util.cpp: -------------------------------------------------------------------------------- 1 | /* Licensed under the Apache License, Version 2.0 (the "License"); 2 | * you may not use this file except in compliance with the License. 3 | * You may obtain a copy of the License at 4 | * 5 | * http://www.apache.org/licenses/LICENSE-2.0 6 | * 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 12 | * 13 | * Copyright 2014 Yandex LLC 14 | * All Rights Reserved. 15 | * 16 | * Author : Schamai Safra 17 | * 18 | * 19 | * yatts_util.cpp 20 | */ 21 | 22 | 23 | 24 | #include "yatts_util.h" 25 | #include 26 | #include 27 | 28 | int to_uint(char const *s) { 29 | int result = 0; 30 | if (!*s) { 31 | throw std::invalid_argument("invalid input string"); 32 | } 33 | while (*s) { 34 | if (*s >= '0' && *s <= '9') { 35 | result = result * 10 + (*s - '0'); 36 | } else { 37 | throw std::invalid_argument("invalid input string"); 38 | } 39 | s++; 40 | } 41 | return result; 42 | } 43 | 44 | 45 | std::vector tokenize_utf8_string(std::string* utf8_string, std::string* delimiters, 46 | int limit) { 47 | /* 48 | Support for tokenizing a utf-8 string. Adapted to also support delimiters and a limit. 49 | Note that (unlike Joe's version) any of the utf8 characters in delimiters is a delimiter (like strtok), 50 | not the whole string. 
51 | Note that leading, trailing or multiple consecutive delimiters will result in 52 | empty vector elements. Normally should not be a problem but just in case. 53 | Also note that any tokens that cannot be found in the model symbol table will be 54 | deleted from the input word prior to grapheme-to-phoneme conversion. 55 | 56 | http://stackoverflow.com/questions/2852895/c-iterate-or-split-utf-8-string-into-array-of-symbols#2856241 57 | 58 | schsafra: adapted from http://code.google.com/p/phonetisaurus/ (phonetisaurus-0.7.8) by Josef Robert Novak 59 | 60 | */ 61 | char* str = (char*) utf8_string->c_str(); // utf-8 string 62 | char* str_i = str; // string iterator 63 | char* str_j = str; 64 | char* end = str + strlen(str) + 1; // end iterator 65 | std::vector string_vec; 66 | std::vector delim_code; 67 | if (delimiters->compare("") != 0) { 68 | string_vec.push_back(""); 69 | char* delim_i = (char*) delimiters->c_str(); 70 | char* delim_end = delim_i + strlen(delim_i) + 1; 71 | do { 72 | delim_code.push_back(utf8::next(delim_i, delim_end)); 73 | } while (delim_i < delim_end); 74 | } 75 | do { 76 | str_j = str_i; 77 | utf8::uint32_t code = utf8::next(str_i, end); // get 32 bit code of a utf-8 symbol 78 | if (code == 0) { 79 | continue; 80 | } 81 | int start = strlen(str) - strlen(str_j); 82 | int end = strlen(str) - strlen(str_i); 83 | int len = end - start; 84 | 85 | if (delimiters->compare("") == 0) { 86 | string_vec.push_back(utf8_string->substr(start, len)); 87 | } else { 88 | if ((limit == 0 || string_vec.size() < limit) && 89 | std::find(delim_code.begin(), delim_code.end(), code) != delim_code.end()) { 90 | string_vec.push_back(""); 91 | } else { 92 | string_vec[string_vec.size() - 1] += utf8_string->substr(start, 93 | len); 94 | } 95 | } 96 | } while (str_i < end); 97 | 98 | return string_vec; 99 | } 100 | 101 | 102 | /* 103 | * http://stackoverflow.com/questions/9620437/string-const-char-size-t-to-int 104 | */ 105 | int to_int(char const *s, size_t count) 
106 | { 107 | size_t i = 0 ; 108 | if ( s[0] == '+' || s[0] == '-' ) 109 | ++i; 110 | int result = 0; 111 | while(i < count) 112 | { 113 | if ( s[i] >= '0' && s[i] <= '9' ) 114 | { 115 | result = result * 10 - (s[i] - '0'); //assume negative number 116 | } 117 | else 118 | throw std::invalid_argument("invalid input string"); 119 | i++; 120 | } 121 | return s[0] == '-' ? result : -result; //-result is positive! 122 | } 123 | -------------------------------------------------------------------------------- /src/transduce.cpp: -------------------------------------------------------------------------------- 1 | /* Licensed under the Apache License, Version 2.0 (the "License"); 2 | * you may not use this file except in compliance with the License. 3 | * You may obtain a copy of the License at 4 | * 5 | * http://www.apache.org/licenses/LICENSE-2.0 6 | * 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 12 | * 13 | * Copyright 2014 Yandex LLC 14 | * All Rights Reserved. 15 | * 16 | * Author : Schamai Safra 17 | * 18 | * 19 | * transduce.cpp 20 | */ 21 | 22 | 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include "utf8ext.h" 34 | #include "yatts_util.h" 35 | #include "Utf8Transducer.h" 36 | 37 | using namespace fst; 38 | using namespace yatts; 39 | 40 | #define VERBOSE 1 41 | #if VERBOSE 42 | #define msg(x, ...) fprintf(stderr, x, ##__VA_ARGS__); 43 | #else 44 | #define msg(x, ...) 
45 | #endif 46 | #define SAVE_INTERMEDIATE 0 47 | 48 | DEFINE_string(fst, "", "rewrite FST."); 49 | DEFINE_string( symbols, "", "symbol table of the rewrite FST."); 50 | 51 | 52 | 53 | void ProcessCorpus(string corpus_filename, Utf8Transducer& transducer, FILE * fp) { 54 | istream * ifp = &cin; 55 | ifstream corpus_fp; 56 | if (corpus_filename != "-") { 57 | corpus_fp.open(corpus_filename.c_str()); 58 | if (corpus_fp.is_open()) { 59 | ifp = &corpus_fp; 60 | } else { 61 | msg("*** warning: Can't open '%s' for reading.\n", 62 | corpus_filename.c_str()); 63 | exit(1); 64 | } 65 | } 66 | string line; 67 | int lineCount = 0; 68 | while (ifp->good()) { 69 | getline(*ifp, line); 70 | if (line.compare("") == 0) { 71 | continue; 72 | } 73 | lineCount++; 74 | try { 75 | string utf8line; 76 | int status = transducer.transduceText(line, utf8line); 77 | switch (status) { 78 | case Utf8Transducer::OK: 79 | fprintf(fp, "%s\n", utf8line.c_str()); 80 | break; 81 | case Utf8Transducer::WARN: 82 | msg("*** warning: %s\n", transducer.getMessage()); 83 | fprintf(fp, "%s\n", utf8line.c_str()); 84 | break; 85 | default: 86 | msg("*** warning: %s\n", transducer.getMessage()); 87 | fprintf(fp, "\n"); 88 | break; 89 | } 90 | 91 | } catch (std::exception& e) { 92 | cerr << e.what() << endl; 93 | throw; 94 | } 95 | } 96 | corpus_fp.close(); 97 | } 98 | 99 | 100 | 101 | int main(int argc, char **argv) { 102 | string usage = 103 | "Transduce (rewrite) words according to rewrite-fst .\n\n Usage: "; 104 | usage += argv[0]; 105 | usage += " [input.utf [output.utf]]\n"; 106 | set_new_handler(FailedNewHandler); 107 | SetFlags(usage.c_str(), &argc, &argv, true); 108 | 109 | #define MANDATORY(name) \ 110 | if (FLAGS_ ## name == "") { \ 111 | fprintf(stderr,"*** Error: --" # name " is mandatory\n"); \ 112 | exit(1); \ 113 | } 114 | 115 | MANDATORY(fst); 116 | 117 | if (argc > 3) { 118 | ShowUsage(); 119 | return 1; 120 | } 121 | 122 | string in_name = (argc > 1 && (strcmp(argv[1], "-") != 0)) ? 
argv[1] : "-"; 123 | string out_name = argc > 2 ? argv[2] : "-"; 124 | 125 | yatts::Utf8Transducer transducer; 126 | 127 | // currently not used: transducer deletes symbol tables when reading fsts 128 | // later we may use one, or even read one for each fst 129 | transducer.readOrNewSymtab(FLAGS_symbols, "symbol_table"); 130 | 131 | string delim = string(","); 132 | 133 | vector rules = tokenize_utf8_string(&FLAGS_fst, &delim); 134 | for (int i = 0; i < rules.size(); i++) { 135 | Utf8Transducer::Status s = transducer.appendFst(rules[i], rules[i]); 136 | switch (s) { 137 | case Utf8Transducer::OK: 138 | msg("loaded fst '%s'\n", rules[i].c_str()); 139 | break; 140 | case Utf8Transducer::WARN: 141 | msg("*** warning: %s\n", transducer.getMessage()); 142 | break; 143 | case Utf8Transducer::ERROR: 144 | throw runtime_error(transducer.getMessage()); 145 | default: 146 | msg("*** warning: %s\n", transducer.getMessage()); 147 | break; 148 | } 149 | } 150 | 151 | FILE * fp; 152 | if (out_name == "-") { 153 | fp = stdout; 154 | } else { 155 | fp = fopen(out_name.c_str(), "w"); 156 | } 157 | ProcessCorpus(in_name, transducer, fp); 158 | } 159 | -------------------------------------------------------------------------------- /src/Utf8Transducer.cpp: -------------------------------------------------------------------------------- 1 | /* Licensed under the Apache License, Version 2.0 (the "License"); 2 | * you may not use this file except in compliance with the License. 3 | * You may obtain a copy of the License at 4 | * 5 | * http://www.apache.org/licenses/LICENSE-2.0 6 | * 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 12 | * 13 | * Copyright 2014 Yandex LLC 14 | * All Rights Reserved. 
15 | * 16 | * Author : Schamai Safra 17 | * 18 | * 19 | * Utf8Transducer.cpp 20 | */ 21 | 22 | 23 | 24 | #include "utf8ext.h" 25 | #include "Utf8Transducer.h" 26 | 27 | namespace yatts { 28 | 29 | Utf8Transducer::Utf8Transducer(): symbolTable(0) { 30 | message[0] = 0; 31 | // TODO Auto-generated constructor stub 32 | 33 | } 34 | 35 | Utf8Transducer::~Utf8Transducer() { 36 | // TODO Auto-generated destructor stub 37 | } 38 | 39 | Utf8Transducer::Status Utf8Transducer::appendFst(VectorFst*& transducer, string id) { 40 | if (id == "") { 41 | char buf[10]; 42 | snprintf(buf, 10, "%lu",transducers.size()); 43 | id = string(buf); 44 | } 45 | if (transducer) { 46 | transducers.push_back(transducer); 47 | transducer_ids.push_back(id); 48 | return OK; 49 | } else { 50 | snprintf(message, maxMsgLength, "Cannot append null FST '%s'", id.c_str()); 51 | return WARN; 52 | } 53 | } 54 | 55 | Utf8Transducer::Status Utf8Transducer::appendFst(const string& file_name, string id) { 56 | if (id == "") { 57 | char buf[10]; 58 | snprintf(buf, 10, "%lu",transducers.size()); 59 | id = string(buf); 60 | } 61 | VectorFst * fst = 0; 62 | try { 63 | fst = VectorFst::Read(file_name); 64 | } catch (exception& e) { 65 | snprintf(message, maxMsgLength, "Cannot load FST '%s' from file '%s': %s", id.c_str(), file_name.c_str(),e.what()); 66 | return WARN; 67 | } 68 | if (!fst) { 69 | snprintf(message, maxMsgLength, "Cannot load FST '%s' from file '%s'", id.c_str(), file_name.c_str()); 70 | return WARN; 71 | } 72 | // sort arcs by input label 73 | ArcSort(fst, ILabelCompare()); 74 | // ------- Delete input symbols (assume utf8 input) 75 | fst->SetInputSymbols(NULL); 76 | this->appendFst(fst, id); 77 | return OK; 78 | } 79 | 80 | Utf8Transducer::Status Utf8Transducer::transduceText(string text, string& result) { 81 | string::iterator readPos = text.begin(); 82 | utf8::uint32_t codePoint; 83 | vector input; 84 | //string utf8line; 85 | while (readPos < text.end()) { 86 | codePoint = 
utf8::next_skip_invalid(readPos, text.end()); 87 | input.push_back(codePoint); 88 | } 89 | try { 90 | bool ok = true; 91 | VectorFst * fst = MakeInputFST(input), * fst2; 92 | for (size_t i = 0; i < transducers.size(); i++) { 93 | fst2 = new VectorFst( 94 | ComposeFst(*fst, *transducers[i])); 95 | delete fst; 96 | fst = fst2; 97 | Connect(fst); 98 | if (fst->NumStates() == 0) { 99 | snprintf(message, maxMsgLength, "No transduction after applying transducer '%s'", transducer_ids[i].c_str()); 100 | break; 101 | } 102 | } 103 | if (fst && fst->Start() >= 0 && fst->NumArcs(fst->Start()) > 0) { 104 | fst::VectorFst nbest_paths; 105 | fst::ShortestPath(*fst, &nbest_paths, 2); 106 | delete fst; 107 | vector utf16line; 108 | StdArc::StateId cur_state = nbest_paths.Start(); 109 | if (cur_state < 0 || nbest_paths.NumArcs(cur_state) < 1) { 110 | ok = false; 111 | } else { 112 | if (nbest_paths.NumArcs(cur_state) != 1) { 113 | snprintf(message, maxMsgLength, 114 | "ambiguous transduction (%s)", text.c_str()); 115 | } 116 | for (; 117 | nbest_paths.Final(cur_state) 118 | == StdArc::Weight::Zero();) { 119 | fst::ArcIterator > aiter(nbest_paths, 120 | cur_state); 121 | StdArc arc = aiter.Value(); 122 | if (arc.olabel != 0) { 123 | utf16line.push_back(arc.olabel); 124 | } 125 | cur_state = arc.nextstate; 126 | } 127 | utf8::utf16to8(utf16line.begin(), utf16line.end(), 128 | back_inserter(result)); 129 | } 130 | } else { 131 | ok = false; 132 | } 133 | if (ok) { 134 | return OK; 135 | } else { 136 | //that's a temporary hack for Alexis' script to work. 
It is wrong because an empty 137 | //result line could also be a valid transduction 138 | return WARN; 139 | } 140 | } catch (std::exception& e) { 141 | cerr << e.what() << endl; 142 | throw; 143 | } 144 | } 145 | 146 | 147 | Utf8Transducer::Status Utf8Transducer::readOrNewSymtab(string file, string name) { 148 | if (!file.empty()) { 149 | ifstream st_fp; 150 | st_fp.open(file.c_str()); 151 | if (st_fp.is_open()) { 152 | symbolTable = SymbolTable::ReadText(st_fp,name); 153 | } 154 | st_fp.close(); 155 | } else { 156 | symbolTable = new SymbolTable("name"); 157 | } 158 | if (!symbolTable) { 159 | snprintf(message, maxMsgLength, "Couldn't read symbol table %s from file %s\n", name.c_str(), file.c_str()); 160 | return WARN; 161 | } 162 | return OK; 163 | } 164 | 165 | 166 | const char* Utf8Transducer::getMessage() const { 167 | return message; 168 | } 169 | 170 | } /* namespace yatts */ 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RusPhonetizer 2 | 3 | _**IMPORTANT NOTE: this repository has been archived, as I am not maintaining it actively any more**_ 4 | 5 | ## General 6 | 7 | *RusPhonetizer* is a simple script together with a set of Thrax grammar rules and dictionaries for the phonetic transcription of Russian sentences. 8 | 9 | ## Software requirements 10 | 11 | *RusPhonetizer* depends on the following software packages: 12 | 13 | - [OpenFST](http://www.openfst.org/twiki/bin/view/FST/WebHome): used by Phonetisaurus, Thrax, and the Transcribe tool (see below). Tested with version 1.5.4. 14 | - [OpenGrm Thrax Grammar Development Tools](http://openfst.cs.nyu.edu/twiki/bin/view/GRM/Thrax): needed to compile the grammar rules. Tested with version 1.2.2. 15 | - [The WFST-driven Phoneticizer Phonetisaurus](https://github.com/JosefNovak/Phonetisaurus): required to build and use the stress prediction model. 
Tested with version 0.8a (https://www.dropbox.com/s/154q9yt3xenj2gr/phonetisaurus-0.8a.tgz)
16 |
17 | The compiler needs to be C++11 compliant.
18 |
19 | A small patch must be applied to the Thrax source code before compiling it. Please refer to misc/README.
20 |
21 | ## Transcribe tool
22 |
23 | The transcription process relies on a small tool used to apply the G2P FST rules. The sources are kept in src/. A Makefile is provided for easy compilation.
24 |
25 | ## Grammars
26 |
27 | The grammars must be compiled before they can be used by the transcription tool. Please refer to grammars/README for more information.
28 |
29 | See [PhoneGroups](https://github.com/wilpert/PhoneGroups/blob/master/tables/YANDEX/map_YANDEX-ttssampa_ru-RU.dat) for the list of valid phoneme symbols and their meaning as used in the Thrax grammars.
30 |
31 | ## Dictionaries
32 |
33 | The most common type of transcription is what I call a "pseudo-transcription": basically the same Cyrillic string as the entry word, enriched with the stress information and possibly with some other lexical pronunciation exceptions. The following dictionaries are available:
34 |
35 | - **tts-dict-simple.pruned.txt**: exceptions dictionary that contains mainly words for which the stress prediction model did not predict the stress correctly. There are also a few other entries, mainly function words, with a pure phonetic transcription. Depending on the stress prediction model you build, some more entries might be needed in this file for correct phonetic transcriptions. The format of the entries in this dictionary is as follows:
36 |
37 | ```
38 | ORTHO \t PHONO(,\s*PHONO)*
39 |
40 | кредитно-расчётный кредитно-расчётный
41 | крем-брюле кр+ем-брюле, крем-брюл+е
42 | ```
43 |
44 | - **tts-dict-homographs.txt**: dictionary with multiple transcriptions and morpho-syntactic information for homograph words.
To be able to use this information, a tool for word disambiguation is required, which is not provided in the current package. 45 | 46 | ``` 47 | FREQ \t ORTHO \t POS'('FEATS*')' \t '[' PHONO ']' (LEX\d)? 48 | FEATS = FEAT'('','\s?FEAT')'* 49 | 50 | 147286 войска NN(sg) [в+ойска] 51 | 47286 войска NN(pl) [войск+а] 52 | 66172 пола NN(gen, sg, msc) [п+ола] LEX1 53 | 66172 пола NN(nom, sg, fem) [пол+а] LEX2 54 | ``` 55 | 56 | - **tts-dict-yo-list.txt**: a list of words that should be written with letter <ё> (yo), used for reconstructing those words in the case that the input does not contain it. 57 | 58 | ``` 59 | ORTHO \t ORTHO 60 | 61 | артем артём 62 | ``` 63 | 64 | ## Main script options 65 | 66 | ```AsciiDoc 67 | -h, --help show this help message and exit 68 | -i INPUT, --input=INPUT 69 | The file containing the words to transcribe 70 | -y YO_LIST, --yo_list=YO_LIST 71 | List of words that contain the letter (OPT) 72 | -l DICTIONARY, --dictionary=DICTIONARY 73 | A simple dictionary file (OPT) 74 | -u USER, --user=USER A user lexicon file in the same format as simple 75 | dictionary (OPT) 76 | -a HOMOGRAPHS, --homographs=HOMOGRAPHS 77 | A file with homographs (OPT) 78 | -m MODEL_FILE, --model_file=MODEL_FILE 79 | Read g2p model from FILE (for stress prediction) 80 | -g G2P_FST, --g2p_fst=G2P_FST 81 | Path to the G2P FST(s) 82 | 83 | python scripts/tts_transcriber.py \ 84 | -i test/rus_sentences.txt \ 85 | -y dictionaries/tts-dict-yo-list.txt \ 86 | -l dictionaries/tts-dict-simple.pruned.txt \ 87 | -a dictionaries/tts-dict-homographs.txt \ 88 | -m stress_prediction.fst \ 89 | -g "grammars/G2P1,grammars/G2P2" 90 | ``` 91 | 92 | ## Transcription flow 93 | 94 | 1. Tokenize/normalize sentence 95 | 2. Get POS analysis for the tokenized/normalized sentence 96 | 3. For every token after the POS analysis: 97 | - If no POS/features are available, give the word a generic GEN_POS. 98 | - Look up dictionaries: 99 | - First, try to find the word in the user dictionary. 
If found, retrieve its transcription.
100 |     - Second, try to find the word in the homographs dictionary. If found, retrieve its transcription as follows:
101 |       - Find the correct transcriptions in the homographs dictionary using the POS analysis (best intersection).
102 |       - If no intersection is found, get the transcription variant tagged with 'LEX1'.
103 |       - If no 'LEX' tags are available for the entry, get the most frequent one.
104 |       - If everything fails, take the first transcription found.
105 |     - Third, try to find the word in the simple dictionary.
106 |     - Finally, if the word is not found in any dictionary, predict stress with the stress prediction FST model:
107 |       - For correct G2P (information used by Thrax rules), attach POS information to the token in the cases supported in the
108 |         G2P rules (currently, only adjectives and verbs).
109 | 4. Send the result of concatenating all resulting tokens to the G2P FST chain.
110 |
111 | ## Stress prediction model
112 |
113 | Due to file size limitations in GitHub, it is not possible to include in the repository the data required for building
114 | the stress prediction model. However, I have made it accessible from the following link:
115 |
116 | https://www.mycloud.ch/s/S00DD7C0E5E1814BDE44BFBB92868EDB7E94CEA7AB7
117 |
118 | I have also included a prebuilt model in the package in case you do not succeed in building it yourself. Let me
119 | know if you have any problems accessing the data.
120 |
--------------------------------------------------------------------------------
/grammars/diphthongs.grm:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | import 'definitions.grm' as defs; 22 | 23 | # set/class definitions 24 | 25 | LSEP = defs.LSEP; 26 | RSEP = defs.RSEP; 27 | LSEP_ALL = (defs.RSEP | "[SIL]" | "[BOS]" | "-"); 28 | RSEP_ALL = (defs.RSEP | "[SIL]" | "[EOS]"); 29 | vowel = defs.vowel; 30 | cons_letter_hyphen = defs.cons_letter_hyphen; 31 | stress_minus_1 = defs.stress_minus_1; 32 | before_stress = defs.before_stress; 33 | all = defs.all; 34 | soft_cons = defs.soft_cons; 35 | hard_cons = defs.hard_cons; 36 | always_hard_cons = defs.always_hard_cons; 37 | 38 | all_cons = (alphabets.consonants - "j") | 39 | (alphabets.cons_letter - "й") 40 | ; 41 | 42 | #---------------------------------------------------------------------------- 43 | 44 | # stressed diphthongs 45 | 46 | stressed_pairs = ("и":"[i_i]") | 47 | ("е":"[e_i]") | 48 | ("э":"[e_i]") | 49 | ("а":"[a_i]") | 50 | ("я":"[a_i]") | 51 | ("о":"[o_i]") | 52 | ("ё":"[o_i]") | 53 | ("у":"[u_i]") | 54 | ("ю":"[u_i]") 55 | ; 56 | 57 | stress_diphthongs1 = CDRewrite[stressed_pairs, 58 | "+" "j"?, 59 | "й", 60 | alphabets.sigma_star 61 | ]; 62 | 63 | # only for stressed <ый>, fo example <выйдя> 64 | 65 | stress_diphthongs2a = CDRewrite[("ы":"[i_x]"), 66 | "+", 67 | "й", 68 | alphabets.sigma_star 69 | ]; 70 | stress_diphthongs2b = CDRewrite[("й":"I"), 71 | "[i_x]", 72 | "", 73 | alphabets.sigma_star 74 | ]; 75 | 76 | stress_diphthongs = 
Optimize[stress_diphthongs1 @ 77 | stress_diphthongs2a @ 78 | stress_diphthongs2b 79 | ]; 80 | 81 | #---------------------------------------------------------------------------- 82 | 83 | # unstressed diphthongs 84 | 85 | 86 | diphthongs6a = CDRewrite[("и":"[I_i]") | ("е":"[I_i]"), 87 | soft_cons, 88 | "й", 89 | alphabets.sigma_star 90 | ]; 91 | 92 | diphthongs6b = CDRewrite[("и":"[@_i]") | ("е":"[@_i]"), 93 | always_hard_cons, 94 | "й" RSEP_ALL, 95 | alphabets.sigma_star 96 | ]; 97 | 98 | diphthongs7 = CDRewrite[("ы":"[@_r_i]"), 99 | hard_cons, 100 | "й", 101 | alphabets.sigma_star 102 | ]; 103 | 104 | # left context 105 | diphthongs8 = CDRewrite[("а":"[@_o_i]") | 106 | ("о":"[@_o_i]") | 107 | ("я":"[@_o_i]") | 108 | ("ё":"[@_o_i]"), 109 | vowel | LSEP, 110 | "й", 111 | alphabets.sigma_star 112 | ]; 113 | 114 | # right context 115 | diphthongs9 = CDRewrite[("а":"[@_o_i]") | 116 | ("о":"[@_o_i]") | 117 | ("я":"[@_o_i]") | 118 | ("ё":"[@_o_i]"), 119 | cons_letter_hyphen, 120 | "й" (stress_minus_1 | RSEP), 121 | alphabets.sigma_star 122 | ]; 123 | 124 | diphthongs10 = CDRewrite[("а":"[@_i]") | 125 | ("о":"[@_i]") | 126 | ("я":"[@_i]") | 127 | ("ё":"[@_i]"), 128 | "", 129 | "й", 130 | alphabets.sigma_star 131 | ]; 132 | 133 | diphthongs11 = CDRewrite[("у":"[U_i]") | ("ю":"[U_i]"), 134 | all | LSEP, 135 | "й", 136 | alphabets.sigma_star 137 | ]; 138 | 139 | 140 | reduce_diphthongs = Optimize[diphthongs6a @ 141 | diphthongs6b @ 142 | diphthongs7 @ 143 | diphthongs8 @ 144 | diphthongs9 @ 145 | diphthongs10 @ 146 | diphthongs11 147 | ]; 148 | 149 | #---------------------------------------------------------------------------- 150 | 151 | # handling of <й> in other cases: 152 | 153 | # 1. stand-alone --> lexicon entry (<и краткое>) 154 | # 2. after consonant --> [j] 155 | # 3. at BOW --> [j] 156 | # 4. in all other cases --> DEL 157 | 158 | i_kratkoye1 = CDRewrite[("й":"j"), 159 | (LSEP_ALL | all_cons), 160 | "+"? 
vowel, 161 | alphabets.sigma_star 162 | ]; 163 | 164 | # this rule could catch also stand-alone <й>, but this is already 165 | # caught by the lexicon lookup 166 | i_kratkoye2 = CDRewrite[("й":"и")*, 167 | (LSEP_ALL | all_cons), 168 | (RSEP_ALL | all_cons), 169 | alphabets.sigma_star 170 | ]; 171 | 172 | i_kratkoye3 = CDRewrite[("й":"")*, 173 | "", 174 | "", 175 | alphabets.sigma_star 176 | ]; 177 | 178 | i_kratkoye = Optimize[i_kratkoye1 @ 179 | i_kratkoye2 @ 180 | i_kratkoye3 181 | ]; 182 | 183 | #---------------------------------------------------------------------------- 184 | 185 | export diphthongs = Optimize[stress_diphthongs @ 186 | reduce_diphthongs @ 187 | i_kratkoye 188 | ]; 189 | 190 | -------------------------------------------------------------------------------- /src/3rdparty/utf8/unchecked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 
17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | 33 | namespace utf8 34 | { 35 | namespace unchecked 36 | { 37 | template 38 | octet_iterator append(uint32_t cp, octet_iterator result) 39 | { 40 | if (cp < 0x80) // one octet 41 | *(result++) = static_cast(cp); 42 | else if (cp < 0x800) { // two octets 43 | *(result++) = static_cast((cp >> 6) | 0xc0); 44 | *(result++) = static_cast((cp & 0x3f) | 0x80); 45 | } 46 | else if (cp < 0x10000) { // three octets 47 | *(result++) = static_cast((cp >> 12) | 0xe0); 48 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 49 | *(result++) = static_cast((cp & 0x3f) | 0x80); 50 | } 51 | else { // four octets 52 | *(result++) = static_cast((cp >> 18) | 0xf0); 53 | *(result++) = static_cast(((cp >> 12) & 0x3f)| 0x80); 54 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 55 | *(result++) = static_cast((cp & 0x3f) | 0x80); 56 | } 57 | return result; 58 | } 59 | 60 | template 61 | uint32_t next(octet_iterator& it) 62 | { 63 | uint32_t cp = utf8::internal::mask8(*it); 64 | typename std::iterator_traits::difference_type length = utf8::internal::sequence_length(it); 65 | switch (length) { 66 | case 1: 67 | break; 68 | case 2: 69 | it++; 70 | cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); 71 | break; 72 | case 3: 73 | ++it; 74 | cp = ((cp << 12) & 0xffff) + 
((utf8::internal::mask8(*it) << 6) & 0xfff); 75 | ++it; 76 | cp += (*it) & 0x3f; 77 | break; 78 | case 4: 79 | ++it; 80 | cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 81 | ++it; 82 | cp += (utf8::internal::mask8(*it) << 6) & 0xfff; 83 | ++it; 84 | cp += (*it) & 0x3f; 85 | break; 86 | } 87 | ++it; 88 | return cp; 89 | } 90 | 91 | template 92 | uint32_t peek_next(octet_iterator it) 93 | { 94 | return utf8::unchecked::next(it); 95 | } 96 | 97 | template 98 | uint32_t prior(octet_iterator& it) 99 | { 100 | while (utf8::internal::is_trail(*(--it))) ; 101 | octet_iterator temp = it; 102 | return utf8::unchecked::next(temp); 103 | } 104 | 105 | // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) 106 | template 107 | inline uint32_t previous(octet_iterator& it) 108 | { 109 | return utf8::unchecked::prior(it); 110 | } 111 | 112 | template 113 | void advance (octet_iterator& it, distance_type n) 114 | { 115 | for (distance_type i = 0; i < n; ++i) 116 | utf8::unchecked::next(it); 117 | } 118 | 119 | template 120 | typename std::iterator_traits::difference_type 121 | distance (octet_iterator first, octet_iterator last) 122 | { 123 | typename std::iterator_traits::difference_type dist; 124 | for (dist = 0; first < last; ++dist) 125 | utf8::unchecked::next(first); 126 | return dist; 127 | } 128 | 129 | template 130 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 131 | { 132 | while (start != end) { 133 | uint32_t cp = utf8::internal::mask16(*start++); 134 | // Take care of surrogate pairs first 135 | if (utf8::internal::is_lead_surrogate(cp)) { 136 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 137 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 138 | } 139 | result = utf8::unchecked::append(cp, result); 140 | } 141 | return result; 142 | } 143 | 144 | template 145 | u16bit_iterator utf8to16 (octet_iterator start, 
octet_iterator end, u16bit_iterator result) 146 | { 147 | while (start < end) { 148 | uint32_t cp = utf8::unchecked::next(start); 149 | if (cp > 0xffff) { //make a surrogate pair 150 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 151 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 152 | } 153 | else 154 | *result++ = static_cast(cp); 155 | } 156 | return result; 157 | } 158 | 159 | template 160 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 161 | { 162 | while (start != end) 163 | result = utf8::unchecked::append(*(start++), result); 164 | 165 | return result; 166 | } 167 | 168 | template 169 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 170 | { 171 | while (start < end) 172 | (*result++) = utf8::unchecked::next(start); 173 | 174 | return result; 175 | } 176 | 177 | // The iterator class 178 | template 179 | class iterator : public std::iterator { 180 | octet_iterator it; 181 | public: 182 | iterator () {} 183 | explicit iterator (const octet_iterator& octet_it): it(octet_it) {} 184 | // the default "big three" are OK 185 | octet_iterator base () const { return it; } 186 | uint32_t operator * () const 187 | { 188 | octet_iterator temp = it; 189 | return utf8::unchecked::next(temp); 190 | } 191 | bool operator == (const iterator& rhs) const 192 | { 193 | return (it == rhs.it); 194 | } 195 | bool operator != (const iterator& rhs) const 196 | { 197 | return !(operator == (rhs)); 198 | } 199 | iterator& operator ++ () 200 | { 201 | ::std::advance(it, utf8::internal::sequence_length(it)); 202 | return *this; 203 | } 204 | iterator operator ++ (int) 205 | { 206 | iterator temp = *this; 207 | ::std::advance(it, utf8::internal::sequence_length(it)); 208 | return temp; 209 | } 210 | iterator& operator -- () 211 | { 212 | utf8::unchecked::prior(it); 213 | return *this; 214 | } 215 | iterator operator -- (int) 216 | { 217 | iterator temp = 
*this; 218 | utf8::unchecked::prior(it); 219 | return temp; 220 | } 221 | }; // class iterator 222 | 223 | } // namespace utf8::unchecked 224 | } // namespace utf8 225 | 226 | 227 | #endif // header guard 228 | 229 | -------------------------------------------------------------------------------- /grammars/syllabification.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 
15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | 22 | # definitions used in the syllabification rules 23 | 24 | stops = alphabets.plosives | alphabets.affricates; 25 | 26 | liquids_glides = alphabets.liquids | alphabets.approximant; 27 | 28 | consonantal = alphabets.consonants; 29 | 30 | vocalic = alphabets.nuclei | liquids_glides; 31 | 32 | sonorant = vocalic | alphabets.nasals; 33 | 34 | continuant = (alphabets.phone - stops); 35 | 36 | coronal = alphabets.liquids | 37 | alphabets.coronal_nasals | 38 | alphabets.coronal_fricatives | 39 | alphabets.coronal_plosives | 40 | alphabets.affricates 41 | ; 42 | 43 | anterior = alphabets.laterals | 44 | alphabets.nasals | 45 | alphabets.anterior_plosives | 46 | alphabets.anterior_fricatives | 47 | alphabets.anterior_affricates 48 | ; 49 | 50 | lateral = alphabets.laterals; 51 | 52 | nasal = alphabets.nasals; 53 | 54 | delayed_release = alphabets.affricates; 55 | 56 | #------------------------------------------------------------- 57 | 58 | # RULES 59 | 60 | SEP = alphabets.word_sep | "[BOS]" | "[SIL]"; 61 | 62 | # 1-consonant onset 63 | 64 | onset1 = consonantal; 65 | 66 | # 2-consonants onsets 67 | 68 | 69 | onset_33 = (consonantal - vocalic - delayed_release) 70 | (consonantal @ vocalic) 71 | ; 72 | 73 | onset_36 = ((consonantal @ continuant @ coronal @ anterior) - sonorant) 74 | (consonantal - continuant) 75 | ; 76 | 77 | onset_37 = SEP ((consonantal @ continuant @ coronal) - sonorant - anterior) 78 | (consonantal - continuant - delayed_release) 79 | ; 80 | 81 | onset_38a = SEP ((consonantal @ continuant @ anterior) - sonorant) 82 | consonantal 83 | ; 84 | 85 | onset_38b = ((consonantal @ continuant @ anterior) - sonorant) 86 | (consonantal @ sonorant) 87 | ; 88 | 89 | onset_40a = SEP (consonantal - sonorant - delayed_release) 90 | ((consonantal @ sonorant @ nasal) - vocalic) 91 | ; 92 | 93 | onset_40b = (consonantal - sonorant) 94 | ((consonantal @ sonorant) - nasal) 
95 | ; 96 | 97 | onset2 = Optimize[onset_33 | onset_36 | onset_37 | 98 | onset_38a | onset_38b | onset_40a | 99 | onset_40b 100 | ]; 101 | 102 | # 3-consonants onsets 103 | 104 | onset_42 = SEP ((consonantal @ continuant @ anterior) - sonorant - coronal) 105 | ((consonantal @ continuant @ coronal @ anterior) - sonorant) 106 | (consonantal @ sonorant) 107 | ; 108 | 109 | onset_43 = ((consonantal @ continuant @ anterior) - sonorant - coronal) 110 | ((consonantal @ continuant @ coronal @ anterior) - sonorant) 111 | (consonantal - continuant - delayed_release) 112 | ; 113 | 114 | onset_45 = ((consonantal @ continuant @ anterior) - sonorant) 115 | (consonantal - continuant - delayed_release) 116 | ((consonantal @ sonorant) - nasal) 117 | ; 118 | 119 | onset_double_r = stops 120 | ("r" | "[rJ]") 121 | ("r" | "[rJ]") 122 | ; 123 | 124 | onset3 = Optimize[onset_42 | onset_43 | onset_45 | onset_double_r]; 125 | 126 | # 4-consonants onsets 127 | 128 | onset_47 = SEP ((consonantal @ continuant @ anterior) - sonorant - coronal) 129 | ((consonantal @ continuant @ coronal @ anterior) - sonorant) 130 | (consonantal - continuant - delayed_release) 131 | (consonantal @ vocalic) 132 | ; 133 | 134 | onset4 = Optimize[onset_47]; 135 | 136 | # onset exceptions 137 | 138 | onset_except_1 = SEP "t" "[SJ]" "t"; 139 | onset_except_2 = SEP "[lJ]" "d"; 140 | onset_except_3 = SEP "g" "[dJ]"; 141 | onset_except_4 = SEP "k" "t"; 142 | onset_except_5 = SEP "l" "b"; 143 | onset_except_6 = SEP "m" "[t_SJ]"; 144 | onset_except_7 = SEP "m" "[nJ]"; 145 | onset_except_8 = SEP "m" "n"; 146 | onset_except_9 = SEP "p" "[tJ]"; 147 | onset_except_10 = SEP "r" "t"; 148 | onset_except_11 = SEP "s" "x" "v"; 149 | onset_except_12 = SEP "[t_S]" "[t_SJ]"; 150 | onset_except_13 = SEP "S" "[t_SJ]"; 151 | 152 | onset_except = Optimize[onset_except_1 | 153 | onset_except_2 | 154 | onset_except_3 | 155 | onset_except_4 | 156 | onset_except_5 | 157 | onset_except_6 | 158 | onset_except_7 | 159 | onset_except_8 | 160 | 
onset_except_9 | 161 | onset_except_10 | 162 | onset_except_11 | 163 | onset_except_12 | 164 | onset_except_13 165 | ]; 166 | 167 | onset = Optimize[onset1 | onset2 | onset3 | onset4 | onset_except]; 168 | 169 | # codas 170 | 171 | coda1 = consonantal; 172 | 173 | coda_48 = (consonantal @ vocalic) 174 | (consonantal - vocalic) 175 | ; 176 | 177 | coda_49 = (consonantal - vocalic - delayed_release) 178 | ((consonantal @ coronal @ anterior) - continuant - delayed_release) 179 | ; 180 | 181 | coda_50a = ((consonantal @ sonorant @ nasal) - vocalic) 182 | ((consonantal @ continuant @ coronal) - sonorant) 183 | ; 184 | 185 | coda_50b = ((consonantal @ sonorant @ nasal) - vocalic) 186 | (consonantal - continuant) 187 | ; 188 | 189 | coda_x1 = (consonantal - continuant - coronal) 190 | ((consonantal @ continuant @ coronal @ anterior) - sonorant) 191 | ; 192 | 193 | coda_x2 = ((consonantal @ continuant @ coronal @ anterior) - sonorant) 194 | (consonantal - continuant - coronal - anterior) 195 | ; 196 | 197 | coda_x3 = ((consonantal @ continuant @ coronal) - sonorant - anterior) 198 | ((consonantal @ coronal) - continuant - anterior) 199 | ; 200 | 201 | coda_x4 = ((consonantal @ continuant @ coronal @ anterior) - sonorant) 202 | ((consonantal @ anterior) - vocalic) 203 | ; 204 | 205 | coda_x5 = (consonantal - continuant - delayed_release) 206 | (consonantal @ vocalic @ coronal) 207 | ; 208 | 209 | # coda exceptions 210 | 211 | coda_except1 ="s" "[lJ]"; 212 | coda_except2 = "t" "[vJ]"; 213 | coda_except3 = "Z" "b"; 214 | coda_except4 = "l" "n" "t" "s"; 215 | coda_except5 = "l" "l"; 216 | coda_except6 = "r" "l"; 217 | coda_except7 = "s" "l"; 218 | coda_except8 = "m" "m"; 219 | coda_except9 = "n" "n"; 220 | coda_except10 = "p" "p"; 221 | coda_except11 = "s" "[tJ]" "r"; 222 | coda_except12 ="f" "r"; 223 | coda_except13 = "n" "t" "r"; 224 | coda_except14 = "s" "t" "r"; 225 | coda_except15 = "n" "k" "t"; 226 | coda_except16 = "[t_SJ]" "v"; 227 | coda_except17 = "k" "v"; 228 | 
coda_except18 = "s" "t" "v"; 229 | 230 | # coda_except19 = ("j" | "[lJ]" | "d" | "n" | "r" | "s" | "t" |"v") "s" "t" "v"; 231 | 232 | coda_except = Optimize[coda_except1 | 233 | coda_except2 | 234 | coda_except3 | 235 | coda_except4 | 236 | coda_except5 | 237 | coda_except6 | 238 | coda_except7 | 239 | coda_except8 | 240 | coda_except9 | 241 | coda_except10 | 242 | coda_except11 | 243 | coda_except12 | 244 | coda_except13 | 245 | coda_except14 | 246 | coda_except15 | 247 | coda_except16 | 248 | coda_except17 | 249 | coda_except18 250 | ]; 251 | 252 | coda = Optimize[coda1 | coda_48 | coda_49 | 253 | coda_50b | coda_x1 | coda_x2 | 254 | coda_x3 | coda_x4 | coda_x5 | 255 | coda_except 256 | ]; 257 | 258 | # syllable definitions 259 | 260 | syllable1 = onset alphabets.nuclei coda; 261 | syllable2 = onset alphabets.nuclei; 262 | syllable3 = alphabets.nuclei coda; 263 | syllable4 = alphabets.nuclei; 264 | 265 | syllable = syllable1 | syllable2 | syllable3 | syllable4; 266 | 267 | rewrite_alpha = (alphabets.phone | "-" | " " | 268 | "[SIL]" | "[ERROR]")*; 269 | 270 | syllabified0 = CDRewrite["":"-", 271 | syllable, # left context 272 | "j", # right context 273 | rewrite_alpha # alphabet 274 | ]; 275 | 276 | syllabified1 = CDRewrite["":"-", 277 | syllable, # left context 278 | syllable, # right context 279 | rewrite_alpha # alphabet 280 | ]; 281 | 282 | # residual rule (trying to catch unsyllabified sequences) 283 | syllabified2 = CDRewrite["":"-", 284 | onset? 
alphabets.nuclei coda, # left context 285 | consonantal* alphabets.nuclei, # right context 286 | rewrite_alpha # alphabet 287 | ]; 288 | 289 | trailing_syll_removal = CDRewrite["-":"", 290 | "", 291 | SEP, 292 | rewrite_alpha 293 | ]; 294 | 295 | export syllabified = Optimize[syllabified0 @ 296 | syllabified1 @ 297 | syllabified2 @ 298 | trailing_syll_removal 299 | ]; 300 | -------------------------------------------------------------------------------- /grammars/vowels.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | import 'definitions.grm' as defs; 22 | 23 | # set/class definitions 24 | 25 | WSEP = "[WUD]"? 
alphabets.word_sep; 26 | LSEP = WSEP | "-"; 27 | RSEP = defs.RSEP; 28 | EOS = "[EOS]" | "[SIL]"; 29 | BOS = "[BOS]" | "[SIL]"; 30 | vowel = defs.vowel; 31 | cons_letter_hyphen = defs.cons_letter_hyphen; 32 | stress_minus_1 = defs.stress_minus_1; 33 | before_stress = defs.before_stress; 34 | all = defs.all; 35 | soft_cons = defs.soft_cons; 36 | hard_cons = defs.hard_cons; 37 | 38 | # orthographic and phonetic [ao] vowels 39 | a_o_vowels =("а" | "a" | "о" | "o"); 40 | 41 | #---------------------------------------------------------------------------- 42 | 43 | # stressed vowels 44 | 45 | # special case <ё>: always stressed (no stress marker present) 46 | stress_yo = CDRewrite[("ё":"o"), 47 | "", 48 | "", 49 | alphabets.sigma_star 50 | ]; 51 | 52 | stressed_pairs = (("я":"a") | 53 | ("ю":"u") | 54 | ("и":"i") | 55 | ("ы":"[i_x]") | 56 | ("е":"e") | 57 | ("а":"a") | 58 | ("у":"u") | 59 | ("о":"o") 60 | ); 61 | 62 | stress_vowel = CDRewrite[stressed_pairs, 63 | "+" "j"?, 64 | "", 65 | alphabets.sigma_star 66 | ]; 67 | 68 | # letter <э> will be mapped later, since its phonetic representation [e] 69 | # is in conflict with the one for the letter <е>, which does trigger 70 | # palatalization of the previous consonant. 
We need this information in 71 | # vowel reduction rules below 72 | 73 | stress_vowels = Optimize[stress_yo @ 74 | stress_vowel 75 | ]; 76 | 77 | #---------------------------------------------------------------------------- 78 | 79 | # reduction of unstressed vowels 80 | 81 | 82 | # first level of reduction 83 | 84 | reduction1 = CDRewrite["э":"E", 85 | (BOS | vowel), 86 | "", 87 | alphabets.sigma_star 88 | ]; 89 | 90 | I_reduction_letters1 = ("э":"I") | ("е":"I"); 91 | 92 | I_reduction_letters2 = ("я":"I"); 93 | 94 | I_reduction_letters3 = ("а":"I"); 95 | 96 | I_reduction_left_context = vowel | BOS | soft_cons | (soft_cons WSEP); 97 | 98 | I_reduction_right_context = stress_minus_1 | EOS; 99 | 100 | 101 | reduction2a = CDRewrite[I_reduction_letters1, 102 | I_reduction_left_context, 103 | "", 104 | alphabets.sigma_star 105 | ]; 106 | 107 | reduction2b = CDRewrite[I_reduction_letters1, 108 | "", 109 | I_reduction_right_context, 110 | alphabets.sigma_star 111 | ]; 112 | 113 | 114 | reduction2c = CDRewrite[I_reduction_letters2, 115 | I_reduction_left_context, 116 | before_stress, 117 | alphabets.sigma_star 118 | ]; 119 | 120 | reduction2d = CDRewrite[I_reduction_letters3, 121 | defs.always_soft_cons, 122 | before_stress, 123 | alphabets.sigma_star 124 | ]; 125 | 126 | 127 | schwa_r_reduction_letter1 = ("ы":"[@_r]"); 128 | 129 | schwa_r_reduction_letter2 = ("э":"[@_r]"); 130 | 131 | schwa_r_reduction_letter3 = (("и":"[@_r]") | ("е":"[@_r]")); 132 | 133 | 134 | # <ы> is always reduced to [@_r] when unstressed 135 | reduction3a = CDRewrite[schwa_r_reduction_letter1, 136 | all | WSEP | BOS, 137 | "", 138 | alphabets.sigma_star 139 | ]; 140 | 141 | reduction3b = CDRewrite[schwa_r_reduction_letter2, 142 | hard_cons | 143 | (hard_cons WSEP), 144 | "", 145 | alphabets.sigma_star 146 | ]; 147 | 148 | reduction3c = CDRewrite[schwa_r_reduction_letter3, 149 | defs.always_hard_cons, 150 | "", 151 | alphabets.sigma_star 152 | ]; 153 | 154 | # <и> is always reduced to [I] when 
unstressed, except in the cases 155 | # where it should become [@_r] (done before) 156 | reduction3d = CDRewrite[("и":"I"), 157 | all | WSEP | BOS, 158 | "", 159 | alphabets.sigma_star 160 | ]; 161 | 162 | schwa_o_reduction_letters = ("а":"[@_o]") | ("о":"[@_o]"); 163 | 164 | # always unstressed (otherwise immediate l_context would be "+") 165 | schwa_o_reduction_l_context = (BOS | WSEP | a_o_vowels); 166 | 167 | schwa_o_reduction_r_context = stress_minus_1 | 168 | EOS | 169 | a_o_vowels 170 | ; 171 | 172 | # left context 173 | reduction4 = CDRewrite[schwa_o_reduction_letters, 174 | schwa_o_reduction_l_context, 175 | "", 176 | alphabets.sigma_star 177 | ]; 178 | 179 | # right context 180 | reduction5 = CDRewrite[schwa_o_reduction_letters, 181 | all, 182 | schwa_o_reduction_r_context, 183 | alphabets.sigma_star 184 | ]; 185 | 186 | U_reduction_letters = ("ю":"U") | ("у":"U"); 187 | 188 | # left context 189 | reduction6a = CDRewrite[U_reduction_letters, 190 | vowel | BOS, 191 | "", 192 | alphabets.sigma_star 193 | ]; 194 | 195 | # right context 196 | reduction6b = CDRewrite[U_reduction_letters, 197 | "", 198 | stress_minus_1 | EOS, 199 | alphabets.sigma_star 200 | ]; 201 | 202 | first_level_reduction = Optimize[reduction1 @ 203 | reduction2a @ 204 | reduction2b @ 205 | reduction2c @ 206 | reduction2d @ 207 | reduction3a @ 208 | reduction3b @ 209 | reduction3c @ 210 | reduction3d @ 211 | reduction4 @ 212 | reduction5 @ 213 | reduction6a @ 214 | reduction6b 215 | ]; 216 | 217 | #---------------------------------------------------------------------------- 218 | 219 | # second level of reduction 220 | 221 | schwa_reduction_pairs = Optimize[("е":"@") | 222 | ("я":"@") | 223 | ("а":"@") | 224 | ("о":"@") 225 | ]; 226 | 227 | second_reduction_l_context = Optimize[cons_letter_hyphen | LSEP]; 228 | 229 | second_reduction_r_context = Optimize[cons_letter_hyphen* 230 | (vowel | RSEP | EOS) 231 | ]; 232 | 233 | reduction7 = CDRewrite[schwa_reduction_pairs, 234 | 
second_reduction_l_context, 235 | second_reduction_r_context, 236 | alphabets.sigma_star 237 | ]; 238 | 239 | U_x_reduction_letters = ("ю":"[U_x]") | ("у":"[U_x]"); 240 | 241 | reduction8 = CDRewrite[U_x_reduction_letters, 242 | second_reduction_l_context, 243 | second_reduction_r_context, 244 | alphabets.sigma_star 245 | ]; 246 | 247 | second_level_reduction = Optimize[reduction7 @ 248 | reduction8 249 | ]; 250 | 251 | #---------------------------------------------------------------------------- 252 | 253 | # exceptions 254 | 255 | # after <ц> and with the second degree of reduction we should rather 256 | # have [@], not [@_r] 257 | 258 | # post-tonic unstressed <е> --> [@] after <ц> 259 | 260 | # Ex. <сердце> 261 | 262 | exception1= CDRewrite[("[@_r]":"@"), 263 | "ц", 264 | (cons_letter_hyphen | vowel)* (EOS | WSEP), 265 | alphabets.sigma_star 266 | ]; 267 | 268 | exceptions = Optimize[exception1]; 269 | 270 | #---------------------------------------------------------------------------- 271 | 272 | # reduce vowels after other vowels that were not reduced so far 273 | 274 | V_V_reduction_pairs = ("э":"E") | ("и":"I") | 275 | ("е":"I") | ("я":"I") | 276 | ("ы":"[@_r]") | ("а":"[@_o]") | 277 | ("о":"[@_o]") | ("ю":"U") | 278 | ("у":"U") 279 | ; 280 | 281 | reduce_V_V_vowels = CDRewrite[V_V_reduction_pairs, 282 | vowel "-"?, 283 | "", 284 | alphabets.sigma_star 285 | ]; 286 | 287 | #---------------------------------------------------------------------------- 288 | 289 | # cleaning 290 | 291 | stress_non_soft_e = CDRewrite[("э":"e"), 292 | "+", 293 | "", 294 | alphabets.sigma_star 295 | ]; 296 | 297 | # needed for foreign words --> TO BE CHECKED 298 | transcribe_remaining_eps = CDRewrite[("э":"E"), 299 | "", 300 | "", 301 | alphabets.sigma_star 302 | ]; 303 | 304 | 305 | clean_hyphen = CDRewrite["-":"", 306 | "", 307 | "", 308 | alphabets.sigma_star 309 | ]; 310 | 311 | clean_stress_marker = CDRewrite["+":"", 312 | "", 313 | "", 314 | alphabets.sigma_star 315 | ]; 
316 | 317 | cleaning = Optimize[stress_non_soft_e @ 318 | transcribe_remaining_eps @ 319 | clean_hyphen @ 320 | clean_stress_marker 321 | ]; 322 | 323 | #---------------------------------------------------------------------------- 324 | 325 | export reduced = Optimize[stress_vowels @ 326 | first_level_reduction @ 327 | second_level_reduction @ 328 | exceptions @ 329 | reduce_V_V_vowels @ 330 | cleaning]; 331 | -------------------------------------------------------------------------------- /src/3rdparty/utf8/core.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include 32 | 33 | namespace utf8 34 | { 35 | // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers 36 | // You may need to change them to match your system. 37 | // These typedefs have the same names as ones from cstdint, or boost/cstdint 38 | typedef unsigned char uint8_t; 39 | typedef unsigned short uint16_t; 40 | typedef unsigned int uint32_t; 41 | 42 | // Helper code - not intended to be directly called by the library users. May be changed at any time 43 | namespace internal 44 | { 45 | // Unicode constants 46 | // Leading (high) surrogates: 0xd800 - 0xdbff 47 | // Trailing (low) surrogates: 0xdc00 - 0xdfff 48 | const uint16_t LEAD_SURROGATE_MIN = 0xd800u; 49 | const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; 50 | const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; 51 | const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; 52 | const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); 53 | const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; 54 | 55 | // Maximum valid value for a Unicode code point 56 | const uint32_t CODE_POINT_MAX = 0x0010ffffu; 57 | 58 | template 59 | inline uint8_t mask8(octet_type oc) 60 | { 61 | return static_cast(0xff & oc); 62 | } 63 | template 64 | inline uint16_t mask16(u16_type oc) 65 | { 66 | return static_cast(0xffff & oc); 67 | } 68 | template 69 | inline bool is_trail(octet_type oc) 70 | { 71 | return ((utf8::internal::mask8(oc) >> 6) == 0x2); 72 | } 73 | 74 | template 75 | inline bool is_lead_surrogate(u16 cp) 76 | { 77 | return (cp >= 
LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); 78 | } 79 | 80 | template 81 | inline bool is_trail_surrogate(u16 cp) 82 | { 83 | return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 84 | } 85 | 86 | template 87 | inline bool is_surrogate(u16 cp) 88 | { 89 | return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); 90 | } 91 | 92 | template 93 | inline bool is_code_point_valid(u32 cp) 94 | { 95 | return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); 96 | } 97 | 98 | template 99 | inline typename std::iterator_traits::difference_type 100 | sequence_length(octet_iterator lead_it) 101 | { 102 | uint8_t lead = utf8::internal::mask8(*lead_it); 103 | if (lead < 0x80) 104 | return 1; 105 | else if ((lead >> 5) == 0x6) 106 | return 2; 107 | else if ((lead >> 4) == 0xe) 108 | return 3; 109 | else if ((lead >> 3) == 0x1e) 110 | return 4; 111 | else 112 | return 0; 113 | } 114 | 115 | template 116 | inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) 117 | { 118 | if (cp < 0x80) { 119 | if (length != 1) 120 | return true; 121 | } 122 | else if (cp < 0x800) { 123 | if (length != 2) 124 | return true; 125 | } 126 | else if (cp < 0x10000) { 127 | if (length != 3) 128 | return true; 129 | } 130 | 131 | return false; 132 | } 133 | 134 | enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; 135 | 136 | /// Helper for get_sequence_x 137 | template 138 | utf_error increase_safely(octet_iterator& it, octet_iterator end) 139 | { 140 | if (++it == end) 141 | return NOT_ENOUGH_ROOM; 142 | 143 | if (!utf8::internal::is_trail(*it)) 144 | return INCOMPLETE_SEQUENCE; 145 | 146 | return UTF8_OK; 147 | } 148 | 149 | #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} 150 | 151 | /// get_sequence_x functions decode utf-8 sequences of the length x 152 | template 153 | utf_error 
get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) 154 | { 155 | if (it == end) 156 | return NOT_ENOUGH_ROOM; 157 | 158 | code_point = utf8::internal::mask8(*it); 159 | 160 | return UTF8_OK; 161 | } 162 | 163 | template 164 | utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) 165 | { 166 | if (it == end) 167 | return NOT_ENOUGH_ROOM; 168 | 169 | code_point = utf8::internal::mask8(*it); 170 | 171 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 172 | 173 | code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); 174 | 175 | return UTF8_OK; 176 | } 177 | 178 | template 179 | utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) 180 | { 181 | if (it == end) 182 | return NOT_ENOUGH_ROOM; 183 | 184 | code_point = utf8::internal::mask8(*it); 185 | 186 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 187 | 188 | code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); 189 | 190 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 191 | 192 | code_point += (*it) & 0x3f; 193 | 194 | return UTF8_OK; 195 | } 196 | 197 | template 198 | utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) 199 | { 200 | if (it == end) 201 | return NOT_ENOUGH_ROOM; 202 | 203 | code_point = utf8::internal::mask8(*it); 204 | 205 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 206 | 207 | code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); 208 | 209 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 210 | 211 | code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; 212 | 213 | UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) 214 | 215 | code_point += (*it) & 0x3f; 216 | 217 | return UTF8_OK; 218 | } 219 | 220 | #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR 221 | 222 | template 223 | utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) 224 | { 225 | // Save the original 
value of it so we can go back in case of failure 226 | // Of course, it does not make much sense with i.e. stream iterators 227 | octet_iterator original_it = it; 228 | 229 | uint32_t cp = 0; 230 | // Determine the sequence length based on the lead octet 231 | typedef typename std::iterator_traits::difference_type octet_difference_type; 232 | const octet_difference_type length = utf8::internal::sequence_length(it); 233 | 234 | // Get trail octets and calculate the code point 235 | utf_error err = UTF8_OK; 236 | switch (length) { 237 | case 0: 238 | return INVALID_LEAD; 239 | case 1: 240 | err = utf8::internal::get_sequence_1(it, end, cp); 241 | break; 242 | case 2: 243 | err = utf8::internal::get_sequence_2(it, end, cp); 244 | break; 245 | case 3: 246 | err = utf8::internal::get_sequence_3(it, end, cp); 247 | break; 248 | case 4: 249 | err = utf8::internal::get_sequence_4(it, end, cp); 250 | break; 251 | } 252 | 253 | if (err == UTF8_OK) { 254 | // Decoding succeeded. Now, security checks... 255 | if (utf8::internal::is_code_point_valid(cp)) { 256 | if (!utf8::internal::is_overlong_sequence(cp, length)){ 257 | // Passed! Return here. 
258 | code_point = cp; 259 | ++it; 260 | return UTF8_OK; 261 | } 262 | else 263 | err = OVERLONG_SEQUENCE; 264 | } 265 | else 266 | err = INVALID_CODE_POINT; 267 | } 268 | 269 | // Failure branch - restore the original value of the iterator 270 | it = original_it; 271 | return err; 272 | } 273 | 274 | template 275 | inline utf_error validate_next(octet_iterator& it, octet_iterator end) { 276 | uint32_t ignored; 277 | return utf8::internal::validate_next(it, end, ignored); 278 | } 279 | 280 | } // namespace internal 281 | 282 | /// The library API - functions intended to be called by the users 283 | 284 | // Byte order mark 285 | const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 286 | 287 | template 288 | octet_iterator find_invalid(octet_iterator start, octet_iterator end) 289 | { 290 | octet_iterator result = start; 291 | while (result != end) { 292 | utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); 293 | if (err_code != internal::UTF8_OK) 294 | return result; 295 | } 296 | return result; 297 | } 298 | 299 | template 300 | inline bool is_valid(octet_iterator start, octet_iterator end) 301 | { 302 | return (utf8::find_invalid(start, end) == end); 303 | } 304 | 305 | template 306 | inline bool starts_with_bom (octet_iterator it, octet_iterator end) 307 | { 308 | return ( 309 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && 310 | ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && 311 | ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) 312 | ); 313 | } 314 | 315 | //Deprecated in release 2.3 316 | template 317 | inline bool is_bom (octet_iterator it) 318 | { 319 | return ( 320 | (utf8::internal::mask8(*it++)) == bom[0] && 321 | (utf8::internal::mask8(*it++)) == bom[1] && 322 | (utf8::internal::mask8(*it)) == bom[2] 323 | ); 324 | } 325 | } // namespace utf8 326 | 327 | #endif // header guard 328 | 329 | 330 | -------------------------------------------------------------------------------- 
/src/3rdparty/utf8/checked.h: -------------------------------------------------------------------------------- 1 | // Copyright 2006 Nemanja Trifunovic 2 | 3 | /* 4 | Permission is hereby granted, free of charge, to any person or organization 5 | obtaining a copy of the software and accompanying documentation covered by 6 | this license (the "Software") to use, reproduce, display, distribute, 7 | execute, and transmit the Software, and to prepare derivative works of the 8 | Software, and to permit third-parties to whom the Software is furnished to 9 | do so, all subject to the following: 10 | 11 | The copyright notices in the Software and this entire statement, including 12 | the above license grant, this restriction and the following disclaimer, 13 | must be included in all copies of the Software, in whole or in part, and 14 | all derivative works of the Software, unless such copies or derivative 15 | works are solely in the form of machine-executable object code generated by 16 | a source language processor. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 21 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 22 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 23 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 | DEALINGS IN THE SOFTWARE. 25 | */ 26 | 27 | 28 | #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 29 | #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 30 | 31 | #include "core.h" 32 | #include 33 | 34 | namespace utf8 35 | { 36 | // Base for the exceptions that may be thrown from the library 37 | class exception : public ::std::exception { 38 | }; 39 | 40 | // Exceptions that may be thrown from the library functions. 
41 | /* Exception that stores the offending 32-bit code point; read it back with code_point(). */ class invalid_code_point : public exception { 42 | uint32_t cp; 43 | public: 44 | invalid_code_point(uint32_t cp) : cp(cp) {} 45 | virtual const char* what() const throw() { return "Invalid code point"; } 46 | uint32_t code_point() const {return cp;} 47 | }; 48 | 49 | /* Exception that stores one offending octet; read it back with utf8_octet(). */ class invalid_utf8 : public exception { 50 | uint8_t u8; 51 | public: 52 | invalid_utf8 (uint8_t u) : u8(u) {} 53 | virtual const char* what() const throw() { return "Invalid UTF-8"; } 54 | uint8_t utf8_octet() const {return u8;} 55 | }; 56 | 57 | /* Exception that stores one offending 16-bit unit; read it back with utf16_word(). */ class invalid_utf16 : public exception { 58 | uint16_t u16; 59 | public: 60 | invalid_utf16 (uint16_t u) : u16(u) {} 61 | virtual const char* what() const throw() { return "Invalid UTF-16"; } 62 | uint16_t utf16_word() const {return u16;} 63 | }; 64 | 65 | /* Exception with no payload; only what() is overridden. */ class not_enough_room : public exception { 66 | public: 67 | virtual const char* what() const throw() { return "Not enough space"; } 68 | }; 69 | 70 | /// The library API - functions intended to be called by the users 71 | 72 | /* Writes cp through result as one to four octets and returns the advanced iterator. The length is chosen by magnitude: below 0x80 one octet, below 0x800 two, below 0x10000 three, otherwise four. Throws invalid_code_point when internal::is_code_point_valid(cp) is false; nothing is written in that case. */ template 73 | octet_iterator append(uint32_t cp, octet_iterator result) 74 | { 75 | if (!utf8::internal::is_code_point_valid(cp)) 76 | throw invalid_code_point(cp); 77 | 78 | if (cp < 0x80) // one octet 79 | *(result++) = static_cast(cp); 80 | else if (cp < 0x800) { // two octets 81 | *(result++) = static_cast((cp >> 6) | 0xc0); 82 | *(result++) = static_cast((cp & 0x3f) | 0x80); 83 | } 84 | else if (cp < 0x10000) { // three octets 85 | *(result++) = static_cast((cp >> 12) | 0xe0); 86 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 87 | *(result++) = static_cast((cp & 0x3f) | 0x80); 88 | } 89 | else { // four octets 90 | *(result++) = static_cast((cp >> 18) | 0xf0); 91 | *(result++) = static_cast(((cp >> 12) & 0x3f) | 0x80); 92 | *(result++) = static_cast(((cp >> 6) & 0x3f) | 0x80); 93 | *(result++) = static_cast((cp & 0x3f) | 0x80); 94 | } 95 | return result; 96 | } 97 | 98 | /* Copies [start, end) to out. Sequences that validate_next accepts are copied octet for octet; each rejected sequence is replaced by a single copy of replacement encoded with append. Throws not_enough_room when validate_next reports NOT_ENOUGH_ROOM. Returns the advanced out. */ template 99 | output_iterator replace_invalid(octet_iterator start, octet_iterator end,
output_iterator out, uint32_t replacement) 100 | { 101 | while (start != end) { 102 | octet_iterator sequence_start = start; 103 | internal::utf_error err_code = utf8::internal::validate_next(start, end); 104 | switch (err_code) { 105 | case internal::UTF8_OK : 106 | for (octet_iterator it = sequence_start; it != start; ++it) 107 | *out++ = *it; 108 | break; 109 | case internal::NOT_ENOUGH_ROOM: 110 | throw not_enough_room(); 111 | case internal::INVALID_LEAD: /* one marker is emitted and only this single octet is skipped */ 112 | out = utf8::append (replacement, out); 113 | ++start; 114 | break; 115 | case internal::INCOMPLETE_SEQUENCE: 116 | case internal::OVERLONG_SEQUENCE: 117 | case internal::INVALID_CODE_POINT: 118 | out = utf8::append (replacement, out); 119 | ++start; 120 | // just one replacement mark for the sequence 121 | while (start != end && utf8::internal::is_trail(*start)) 122 | ++start; 123 | break; 124 | } 125 | } 126 | return out; 127 | } 128 | 129 | /* Three-argument overload: calls the four-argument replace_invalid with 0xfffd (passed through internal::mask16) as the replacement code point. */ template 130 | inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 131 | { 132 | static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); 133 | return utf8::replace_invalid(start, end, out, replacement_marker); 134 | } 135 | 136 | /* Decodes one code point at it using internal::validate_next and returns it. it is passed by reference to validate_next. Throws not_enough_room for NOT_ENOUGH_ROOM, invalid_utf8 carrying the octet at it for INVALID_LEAD, INCOMPLETE_SEQUENCE and OVERLONG_SEQUENCE, and invalid_code_point carrying the decoded value for INVALID_CODE_POINT. */ template 137 | uint32_t next(octet_iterator& it, octet_iterator end) 138 | { 139 | uint32_t cp = 0; 140 | internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); 141 | switch (err_code) { 142 | case internal::UTF8_OK : 143 | break; 144 | case internal::NOT_ENOUGH_ROOM : 145 | throw not_enough_room(); 146 | case internal::INVALID_LEAD : 147 | case internal::INCOMPLETE_SEQUENCE : 148 | case internal::OVERLONG_SEQUENCE : 149 | throw invalid_utf8(*it); 150 | case internal::INVALID_CODE_POINT : 151 | throw invalid_code_point(cp); 152 | } 153 | return cp; 154 | } 155 | 156 | /* Same decoding as next, but it is taken by value, so the iterator held by the caller is not modified. */ template 157 | uint32_t peek_next(octet_iterator it, octet_iterator end) 158 | { 159 | return utf8::next(it, end); 160 | } 161 | 162 | /* Moves it backwards over trail octets (internal::is_trail) until a non-trail octet is found, then returns the code point decoded from there up to the original position via peek_next; it is left at that octet. Throws not_enough_room when it == start on entry, and invalid_utf8 when start is reached while still on a trail octet. */ template 163 | uint32_t prior(octet_iterator& it,
octet_iterator start) 164 | { 165 | // can't do much if it == start 166 | if (it == start) 167 | throw not_enough_room(); 168 | 169 | octet_iterator end = it; 170 | // Go back until we hit either a lead octet or start 171 | while (utf8::internal::is_trail(*(--it))) 172 | if (it == start) 173 | throw invalid_utf8(*it); // error - no lead byte in the sequence 174 | return utf8::peek_next(it, end); 175 | } 176 | 177 | /// Deprecated in versions that include "prior" 178 | /* Backward step like prior, but with no initial check that it differs from the boundary: it is decremented before the first comparison against pass_start. The sequence is decoded through a temporary copy with next, so it is left at the octet where the backward scan stopped. Throws invalid_utf8 when pass_start is reached while still on a trail octet. */ template 179 | uint32_t previous(octet_iterator& it, octet_iterator pass_start) 180 | { 181 | octet_iterator end = it; 182 | while (utf8::internal::is_trail(*(--it))) 183 | if (it == pass_start) 184 | throw invalid_utf8(*it); // error - no lead byte in the sequence 185 | octet_iterator temp = it; 186 | return utf8::next(temp, end); 187 | } 188 | 189 | /* Calls next(it, end) n times, discarding the decoded values; it is advanced by reference. Exceptions thrown by next propagate to the caller. */ template 190 | void advance (octet_iterator& it, distance_type n, octet_iterator end) 191 | { 192 | for (distance_type i = 0; i < n; ++i) 193 | utf8::next(it, end); 194 | } 195 | 196 | /* Counts how many times next(first, last) can be called while first < last and returns that count. The loop condition uses operator<, so the iterator type must support it. Exceptions thrown by next propagate. */ template 197 | typename std::iterator_traits::difference_type 198 | distance (octet_iterator first, octet_iterator last) 199 | { 200 | typename std::iterator_traits::difference_type dist; 201 | for (dist = 0; first < last; ++dist) 202 | utf8::next(first, last); 203 | return dist; 204 | } 205 | 206 | /* Converts the 16-bit units in [start, end) to UTF-8, writing through result with append and returning the advanced result. A lead surrogate is combined with the following unit when that unit is a trail surrogate. Throws invalid_utf16 for a lead surrogate at the end of input, a lead surrogate followed by a non-trail unit, or a lone trail surrogate. */ template 207 | octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 208 | { 209 | while (start != end) { 210 | uint32_t cp = utf8::internal::mask16(*start++); 211 | // Take care of surrogate pairs first 212 | if (utf8::internal::is_lead_surrogate(cp)) { 213 | if (start != end) { 214 | uint32_t trail_surrogate = utf8::internal::mask16(*start++); 215 | if (utf8::internal::is_trail_surrogate(trail_surrogate)) 216 | cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 217 | else 218 | throw invalid_utf16(static_cast(trail_surrogate)); 219 | } 220 | else 221 | throw invalid_utf16(static_cast(cp)); 222 | 223 | } 224 | // Lone trail
surrogate 225 | else if (utf8::internal::is_trail_surrogate(cp)) 226 | throw invalid_utf16(static_cast(cp)); 227 | 228 | result = utf8::append(cp, result); 229 | } 230 | return result; 231 | } 232 | 233 | /* Decodes [start, end) with next and writes 16-bit units through result, returning the advanced result. Code points above 0xffff are written as two units, built from LEAD_OFFSET and TRAIL_SURROGATE_MIN; all others are written as a single unit. Exceptions thrown by next propagate. */ template 234 | u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 235 | { 236 | while (start != end) { 237 | uint32_t cp = utf8::next(start, end); 238 | if (cp > 0xffff) { //make a surrogate pair 239 | *result++ = static_cast((cp >> 10) + internal::LEAD_OFFSET); 240 | *result++ = static_cast((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 241 | } 242 | else 243 | *result++ = static_cast(cp); 244 | } 245 | return result; 246 | } 247 | 248 | /* Encodes every element of [start, end) with append and returns the advanced result. Exceptions thrown by append propagate. */ template 249 | octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 250 | { 251 | while (start != end) 252 | result = utf8::append(*(start++), result); 253 | 254 | return result; 255 | } 256 | 257 | /* Decodes [start, end) with next, writing one code point per output element, and returns the advanced result. Exceptions thrown by next propagate. */ template 258 | u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 259 | { 260 | while (start != end) 261 | (*result++) = utf8::next(start, end); 262 | 263 | return result; 264 | } 265 | 266 | // The iterator class 267 | /* Iterator that yields code points from an octet range. It stores the current octet position together with range_start and range_end. The three-argument constructor throws std::out_of_range when the position is before range_start or after range_end. Dereferencing decodes through a temporary copy, so it does not move the stored position. */ template 268 | class iterator : public std::iterator { 269 | octet_iterator it; 270 | octet_iterator range_start; 271 | octet_iterator range_end; 272 | public: 273 | iterator () {} 274 | explicit iterator (const octet_iterator& octet_it, 275 | const octet_iterator& range_start, 276 | const octet_iterator& range_end) : 277 | it(octet_it), range_start(range_start), range_end(range_end) 278 | { 279 | if (it < range_start || it > range_end) 280 | throw std::out_of_range("Invalid utf-8 iterator position"); 281 | } 282 | // the default "big three" are OK 283 | octet_iterator base () const { return it; } 284 | uint32_t operator * () const 285 | { 286 | octet_iterator temp = it; 287 | return utf8::next(temp, range_end); 288 | } 289 | bool operator == (const iterator& rhs) const 290 | { 291 | if (range_start
!= rhs.range_start || range_end != rhs.range_end) 292 | throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 293 | return (it == rhs.it); 294 | } 295 | bool operator != (const iterator& rhs) const 296 | { 297 | return !(operator == (rhs)); 298 | } 299 | iterator& operator ++ () 300 | { 301 | utf8::next(it, range_end); 302 | return *this; 303 | } 304 | iterator operator ++ (int) 305 | { 306 | iterator temp = *this; 307 | utf8::next(it, range_end); 308 | return temp; 309 | } 310 | iterator& operator -- () 311 | { 312 | utf8::prior(it, range_start); 313 | return *this; 314 | } 315 | iterator operator -- (int) 316 | { 317 | iterator temp = *this; 318 | utf8::prior(it, range_start); 319 | return temp; 320 | } 321 | }; // class iterator 322 | 323 | } // namespace utf8 324 | 325 | #endif //header guard 326 | 327 | 328 | -------------------------------------------------------------------------------- /test/rus_sentences.txt: -------------------------------------------------------------------------------- 1 | Суворов с любопытством расспрашивал славного мятежника о его военных действияхи намерениях и повез его в Симбирск, куда должен был приехать и граф Панин . 2 | И просто задохнулся от неожиданной радости . 3 | Вскоре начал накрапывать дождь , мочивший нас до самого места лагеря экспедиции , партии Елизаветы Владимировны . 4 | Они сидели над миром , участвуя в общем процессе жизни и гибели и присоединяя к своей сущности все души , личности и чувства « Я » . 5 | Его толстая шея вздулась багровыми жилами , и широкое скуластое лицо налилось кровью . 6 | Трусость его не дозволяла ему хорошенько изъясниться ; и так солгал он раза два или три без правил , а как господин закричал ему , чтобы он пошел домой , то тем дело и кончилось . 7 | укладывается в узкий тоннель научных знаний - высмеивается или не замечается . 
8 | Комитет « Гражданское содействие » выдает им ходатайства о регистрации и обращения к сотрудникам МВД с разъяснением реального положения этих людей . 9 | Однако ДНК сама по себе несет важнейшую функцию кодирования аминокислотной последовательности белков , и мы не можем произвольно ее модифицировать , не затронув этой информации и не изменив способности ДНК к удвоению . 10 | Сейчас очень интересную женщину пишут . 11 | Заставляли молиться , готовили еду , рассказывали анекдоты . 12 | Офицерам и автоматчикам из роты охраны Поляков приказал тщательно осмотреть окрестность , а сам с немцами и занялся непосредственно участком , где располагалось ядро группы . 13 | Калинин проехал , не заинтересовался . 14 | Начнём с депутатов ! 15 | В Нюрнберге была знаменитая фамилия Фуггеров , банкиров - купцов того времени , вроде Ротшильдов ; они предложили у курфирста взять на откуп индульгенции . 16 | Из-за смут и беспорядков этого времени , судоходство в португальских водах почти прекратилось , так что нечего было опасаться даже случайного обнаружения судна в этих местах . 17 | Более того , мы считаем , что кредит в будущем может стать альтернативой долгу . 18 | Прошло по крайней мере с полчаса , пока утихли эти хватающие за сердце рыдания . 19 | На экранах - больших и малых - царят американские фильмы . 20 | Переход к простой брани был бы слишком крут и заметен без разных переливов , оттенков и мостов . 21 | Всех жильем обеспечил . 22 | Когда сотрудники КГБ выскочили из здания , то на указанном месте обнаружили лишь ключи от машины . 23 | Даже Николай запрещал об этом случае вспоминать - сразу беленился . 24 | И Данило Сазоныч завел разговор о Потемкине , который говорил ему , что переходит опять за Московскую заставу . 25 | Комната , в которой нас принимали , была , конечно , самая просторная в доме ; ее заранее мыли и чистили и перед образами затепляли лампады . 26 | Последний кайф лета перед скучищей учебы и повседневности . 
27 | Я видел теперь , что на самом деле он не узкоглазый , а просто из тех людей , которые всегда смотрят вприщур . 28 | Только извините . 29 | Вдруг поместился посреди « чистилища » , как бы никому и не мешая . 30 | такой интеллектуальный марафон . 31 | Старый туркмен , прозванный Хоробрых « царем Менелаем » , раздувал слухи , что в мергеле сидит злой дух и будет жестоко мстить каждому , кто дойдет до сердца горы . 32 | Богословие в Киеве он читал по Аквинату . 33 | И Севастьянов не удивляется , что Семка пишет письмо брату , спящему в соседней комнате , - не до того Севастьянову . 34 | Мы не могли определить , что теперь : солнечный ли день или непроницаемый туман , ибо в лесу были сумерки , как в наших широтах через час после солнечного заката . 35 | Не избежал этой участи и Н . 36 | Это же не паста , а чистая отрава ! 37 | За ним она , прекрасная как всегда , вся в белом , вся в цветах померанца , с длинным блондовым вуалем на голове , который живописно спускался назад ; с блестящим шифром на левом плече . 38 | Если бы какой-либо священник вздумал устроить религиозные чтения для народа в своей же церкви , но в часы , когда нет богослужения в ней , для этого он предварительно должен испросить себе особое разрешение епархиальной власти . 39 | Как будто раньше люди со временем молодели . 40 | Кабинка была почище . 41 | Сентябрь в Израиле - месяц новогодний и поэтому не очень перегруженный политикой . 42 | Сделала это с грустным видом маменька , толстая дама , большая курительница и специалистка в преферансе . 43 | Зосиму и Савватия . 44 | Через два месяца : вы больны оттого , что не побереглись . 45 | Расчет у него был простой : чем меньше гости пробудут в лагере , тем лучше , - лагерь всегда казался ему кипящим котлом , ежеминутно готовым взорваться . 46 | Всегда мечтала иметь много детей , а тут счастье в руки : девочка славная , самостоятельная . 47 | Или друг к другу - враждебно пристрастные . 
48 | Государственная территория , дотоле заключенная в пределах первоначального расселения великорусского племени , теперь переходит далеко за эти пределы и постепенно вбирает в себя всю русскую равнину , распространяясь как до географических ее границ , так почти везде до пределов русского народонаселения . 49 | Здесь прежде всего большое значение имеет правильный отбор поступающих на военную службу . 50 | Моя кобыла-то еще зимой до того привыкла солому возить , что с закрытыми глазами по тому маршруту ходила . 51 | Сотрудники милиции даже выезжали на « стрелки » , где очень быстро и доступно объясняли своим оппонентам необходимость оставить в покое опекаемых или коммерсантов . 52 | Любовь - мера одаренности жизнью людей , но она , вопреки всему , в очень малой степени сексуальность . 53 | Ну , вы храбрый . 54 | Бог весть , встретимся ли еще . 55 | ибо как бы ни был человек мал , но есть какие-то результаты его жизни ! 56 | Став один раз вразрез с матерью и сестрами , она не умела с ними сойтись снова , а они этого не искали . 57 | Один засунул его в настенные часы с кукушкой . 58 | Тогда я , оборотясь , увидел на горе против нас , за речкой , множество колюжей , а сверх того человек двадцать бежавших , чтоб отрезать нас троих от наших товарищей ; между тем стрелы сыпались на нас , как град . 59 | Мы не останавливаемся на достигнутом , стараясь охватить весь спектр направлений подготовки студентов . 60 | А на мне , я знал , лежала вся ответственность за успех ее . 61 | Собою их не заслоню , хотя я и автор , вернее - одно из второстепенных лиц на задах массовки . 62 | Покачивая головой и сгорбившись , он возвращается к окну ; отряд длинной и неровной цепью выползает через ворота плац-парада . 63 | Но это оказалось реальностью , и теперь мир уже не будет таким , каким он был вчера . 64 | По счастью , падали деревья ночью , часа в четыре , в безлюдье . 65 | Сам тощой , а места занял , как баба откормленная . 66 | Неважно ! 
67 | Когда мы выехали , была уже ночь , и дорога лежала через девственный лес с вековыми соснами , лиственницей и елью гигантских размеров . 68 | Оставалась ровная , спокойная и незамужняя . 69 | Эта сука в оценке мужиков и их дурацких качеств почти всегда бывает права . 70 | Им это не мешало . 71 | Но еще пыром , дырявым ботинком он и тут успел врезать сержанту меж ног - и это тоже со счастливой мыслью . 72 | Навевает мысли о стабильности и постоянстве . 73 | Послышались женские всхлипывания . 74 | А так как деятельность его происходила среди очень молодых людей , принимавших его безграничную самоуверенность за глубокомыслие и мудрость , то большинство подчинялось ему , и он имел большой успех в революционных кругах . 75 | Стараясь не сбиться с общего шага , он снова представил себе сыновей и мысленно обратился к ним с продолжением своего рассказа . 76 | Общество , в котором властвовала партийная номенклатура , насквозь пропитанная догмами , неизлечимо больная утопической идеологией . 77 | Этого желает моя высокая повелительница . 78 | Настоящий громила с узким лбом , с лохматыми бровями над близко сведенными черными глазами , а длинные руки , словно клешни , - одной ладонью всю мою спину прикроют . 79 | Те ценные результаты , о которых мы говорили , - итог длительного и напряженного труда . 80 | Освобожденный от необходимости на каждом шагу доказывать свою независимость , всякий делал свое дело спокойно , без раздражения . 81 | Сегодня они больше всего боятся , что придворная знать и крупные капиталисты не захотят их брать всерьез . 82 | Скорее всего он не умеет понять противоречие между ответственностью их миссии и теми « ядовитыми » характеристиками , которыми наделяет их Ленин . 83 | Ну , вот , мы и проверим , милая трусиха , насколько оправдаются ваши страхи . 84 | Женщина обрадовалась разговору , и сама все рассказала , лаская его узкое лицо шелковистыми своими глазами . 85 | России хватит на всех . 
86 | А где-то - водка , где-то - самогон , где-то - « чернила » обыкновенные . 87 | Но они не любопытны . 88 | результативность . 89 | И всегда мысль « Бог со мною » . 90 | Ефим Игнатьич только мигает , а до горячих пирожков не дотрагивается . 91 | У тетушки Марьи Алексеевны она прожила недолго . 92 | Эта постоянная необычность для нас , вероятно , основана на имманентной иллюзии каузальности временной организации психического . 93 | Сыр , масло , кожа , мед , лес и - долой фабрики ! 94 | Одно время , например , уговаривали Л . 95 | Стоя у подножия маяка , надо высоко задирать голову , чтобы увидеть его вершину , и только тогда постигаешь все величие сооружения . 96 | Больше ничего не говорит , говорит только , что там очень плохо . 97 | Так же как в начале прошлого века конные скачки , а в начале позапрошлого - стрельба из лука . 98 | Афонской горы . 99 | Как не знать Андрея Михайлыча ! 100 | Степан Тимохин . 101 | Последнее мое свидание с Гоголем было в Петербурге , когда он останавливался в Зимнем дворце , у Жуковского . 102 | Мы уже узнали, что он собирался прочесть нам новое свое произведение , но приступить к делу было не легко . 103 | Гоголь как ни в чем не бывало ходил по комнате , добродушно подсмеивался над некоторыми общими знакомыми , а о чтении и помину не было . 104 | Даже раз он намекнул , что можно отложить заседание . 105 | Он подошел к Гоголю сзади, ощупал карманы его фрака, вытащил оттуда тетрадь почтовой бумаги в осьмушку . 106 | Гоголь сердито выхватил тетрадку , сел мрачно на диван и тотчас же начал читать при всеобщем молчании . 107 | Он читал без перерыва до тех пор , пока истощился весь его голос и зарябило в глазах . 108 | Мы узнали таким образом первые четыре главы «Мертвых душ» . 109 | Общий смех мало поразил Гоголя , но изъявление нелицемерного восторга , которое видимо было на всех лицах под конец чтения , его тронуло . 110 | Он был доволен . 
111 | Кто-то сказал , что приветствие Селифана босой девочке , которую он сажает на козлы вместо проводника от Коробочки , не совсем прилично. 112 | Все остальные слушатели восстали против этого замечания . 113 | После чтения он закутался , по обыкновению , в шубу до самого лба , сел со мной на извозчика , и мы молча доехали до Зимнего дворца , где я его ссадил . 114 | Вскоре потом он опять исчез из Петербурга . 115 | Гоголь обрадовался нашей новой встрече, расспрашивал, каким путем прибыл я в Италию . 116 | Ему казалось , что после Италии Париж становится сух и безжизнен , а значение Италии бросается само собой в глаза после парижской жизни и парижских интересов . 117 | Впоследствии он часто развивал эту мысль . 118 | Между тем время было обеденное . 119 | Он повел меня в известную историческую австерию , где за длинными столами , шагая по грязному полу и усаживаясь просто на скамейках , стекается к обеденному часу разнообразнейшая публика . 120 | Это все тот же рис , барашек , курица - меняется только зелень по временам года . 121 | Простота, общежительность итальянская всего более кидаются тут в глаза , заставляя предчувствовать себя и во всех других сферах жизни . 122 | Гоголь поразил меня , однако, капризным , взыскательным обращением своим с прислужником . 123 | Раза два менял он блюдо риса , находя его то переваренным , то недоваренным , и всякий раз прислужник переменял блюдо с добродушной улыбкой . 124 | Получив наконец тарелку риса по своему вкусу , Гоголь приступилк ней с необычайною алчностью , наклонясь так , что длинные волосы его упали на самое блюдо , и поглощая ложку за ложкой со страстью и быстротой , какими , говорят, обыкновенно отличаются за столом люди , расположенные к ипохондрии . 125 | В середине обеда к нам подсел довольно плотный мужчина , с красивой , круглой бородкой . 
126 | Опорожнив свое блюдо , Гоголь откинулся назад, сделался весел , разговорчив и начал шутить с прислужником, еще так недавно 127 | осыпаемым строгими выговорами и укоризнами . 128 | По окончании расчета за обед Гоголь оставил прислужнику , как и все другие посетители , два байока , а когда я со своей стороны что-то переложил против этой скудной суммы , он остановил меня замечанием . 129 | Известно, что житейской мудрости в нем было почти столько же , сколько и таланта . 130 | Он был в своей тарелке и мог , что ему нужно было или что стоило этого , полной рукой , не давая сам ничего . 131 | Я никогда не хочу обедать . 132 | Мне так хорошо во дворе играть . 133 | Я всю жизнь бы во дворе играл . 134 | И никогда не обедал бы . 135 | Я совсем не люблю борщ с капустой . 136 | И вообще я суп не люблю . 137 | И кашу я не люблю . 138 | И котлеты тоже не очень люблю . 139 | Я люблю абрикосы . 140 | Вы ели абрикосы . 141 | Я так люблю абрикосы . 142 | Но вот мама зовёт меня есть борщ , мне приходится всё бросать . 143 | Мой брат Боба любит борщ . 144 | Он смеётся, когда ест борщ , а я морщусь . 145 | Он вообще всегда смеётся и тычет себе ложкой в нос вместо рта , потому что ему три года . 146 | Нет , борщ я могу съесть . 147 | И котлеты я тоже съедаю . 148 | Виноград-то я ем с удовольствием . 149 | Тогда и сажают меня за рояль . 150 | Пожалуй, я съел бы ещё раз борщ . 151 | Только бы не играть на рояле . 152 | Я играю , а брат сидит на полу и смеётся . 153 | В руках у него заводная машина . 154 | Он оторвал от машины колёса . 155 | И катает их по полу . 156 | И это ему очень нравится . 157 | Никто ему не мешает . 158 | Не заставляет играть на рояле . 159 | И потому ему очень весело . 160 | Плачет он очень редко . 161 | Когда у него что-нибудь отнимают . 162 | Или когда его стригут . 163 | Он совершенно не любит стричься . 164 | Он так и ходил бы всю жизнь лохматый . 165 | На это он не обращает внимания . 166 | В общем, ему хорошо , а мне плохо . 
167 | Папа с мамой слушают , как я играю . 168 | Брат катает по полу колёсики . 169 | За окном кричат четыре брата . 170 | Они кричат разными голосами . 171 | Я вижу в окно : они машут руками . 172 | Они зовут меня . 173 | Им одним скучно . 174 | -------------------------------------------------------------------------------- /grammars/alphabets.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 
15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | # CYRILLIC 21 | # -------- 22 | 23 | export letter = Optimize[ 24 | "а" | "б" | "в" | "г" | "д" | "е" | "ё" | "ж" | "з" | 25 | "и" | "й" | "к" | "л" | "м" | "н" | "о" | "п" | "р" | 26 | "с" | "т" | "у" | "ф" | "х" | "ц" | "ч" | "ш" | "щ" | 27 | "ъ" | "ы" | "ь" | "э" | "ю" | "я" 28 | ]; 29 | 30 | export voiceless_consonant_letters = 31 | "к" | "п" | "с" | "т" | "ф" | "х" | "ц" | "ч" | "ш" | "щ" 32 | ; 33 | 34 | export voiced_consonant_letters = 35 | "б" | "в" | "г" | "д" | "ж" | "з" | "л" | "м" | "н" | "р" 36 | ; 37 | 38 | export cons_letter = Optimize[voiceless_consonant_letters | 39 | voiced_consonant_letters | 40 | "ъ" | "ь" | "й" 41 | ]; 42 | 43 | export soft_vow_letter = "е" | "ё" | "и" | "ю" | "я"; 44 | export hard_vow_letter = "а" | "о" | "у" | "э" | "ы" ; 45 | 46 | export vow_letter = Optimize[soft_vow_letter | hard_vow_letter]; 47 | 48 | export word_sep = " "; 49 | 50 | # -------------------------------------------------------------------------------------- 51 | 52 | # PHONETIC 53 | # -------- 54 | 55 | # plosives 56 | 57 | export hard_plosives = "p" | "b" | "t" | "d" | "k" | "g"; 58 | export soft_plosives = "[pJ]" | "[bJ]" | "[tJ]" | "[dJ]" | "[kJ]" | "[gJ]"; 59 | 60 | export geminated_hard_plosives = "[t_t]" | "[d_d]"; 61 | export geminated_soft_plosives = "[t_tJ]" | "[d_dJ]"; 62 | 63 | export coronal_plosives = "t" | "d" | "[tJ]" | "[dJ]" | "[t_t]" | "[d_d]" | "[t_tJ]" | "[d_dJ]"; 64 | 65 | export anterior_plosives = "p" | "b" | "t" | "d" | 66 | "[pJ]" | "[bJ]" | "[tJ]" | "[dJ]" | 67 | "[t_t]" | "[d_d]" | 68 | "[t_tJ]" | "[d_dJ]" 69 | ; 70 | 71 | export geminated_plosives = Optimize[geminated_hard_plosives | geminated_soft_plosives]; 72 | 73 | export voiced_plosives = "b" | "d" | "g" | "[bJ]" | "[dJ]" | "[gJ]" | "[d_d]" | "[d_dJ]"; 74 | export voiceless_plosives = "p" | "t" | "k" | "[pJ]" | "[tJ]" | "[kJ]" | "[t_t]" | "[t_tJ]"; 75 | 76 | export plosives = Optimize[hard_plosives | soft_plosives | 
geminated_plosives]; 77 | 78 | # fricatives 79 | 80 | export hard_fricatives = "S" | "Z" | "f" | "s" | "v" | "x" | "z"; 81 | export soft_fricatives = "[SJ]" | "[ZJ]" | "[fJ]" | "[sJ]" | "[vJ]" | "[xJ]" | "[zJ]"; 82 | 83 | export geminated_hard_fricatives = "[s_s]"; 84 | export geminated_soft_fricatives = "[s_sJ]"; 85 | 86 | export geminated_fricatives = Optimize[geminated_hard_fricatives | 87 | geminated_soft_fricatives 88 | ]; 89 | 90 | export coronal_fricatives = "S" | "Z" | "s" | "z" | 91 | "[SJ]" | "[ZJ]" | "[sJ]" | "[zJ]" | 92 | "[S_S]" | "[Z_Z]" | "[s_s]" | "[z_z]" | 93 | "[S_SJ]" | "[Z_ZJ]" | "[s_sJ]" | "[z_zJ]" 94 | ; 95 | 96 | export anterior_fricatives = "f" | "s" | "v" | "z" | 97 | "[fJ]" | "[sJ]" | "[vJ]" | "[zJ]" | 98 | "[s_s]" | "[v_v]" | "[z_z]" | 99 | "[s_sJ]" | "[v_vJ]" | "[z_zJ]" 100 | ; 101 | 102 | export voiced_fricatives = "Z" | "v" | "z" | "[ZJ]" | "[vJ]" | "[zJ]" | "[Z_Z]" | "[v_v]" | 103 | "[z_z]" | "[Z_ZJ]" | "[z_zJ]" | "[v_vJ]" 104 | ; 105 | 106 | export voiceless_fricatives = "S" | "f" | "s" | "[SJ]" | "[fJ]" | "[sJ]" | "[S_S]" | "[s_s]" | 107 | "[s_s]" | "[S_SJ]" | "[s_sJ]" | "x" | "[xJ]" 108 | ; 109 | 110 | export fricatives = Optimize[hard_fricatives | soft_fricatives | geminated_fricatives]; 111 | 112 | # nasals 113 | 114 | export hard_nasals = "m" | "n"; 115 | export soft_nasals = "[mJ]" | "[nJ]"; 116 | export geminated_hard_nasals = "[n_n]"; 117 | export geminated_soft_nasals = "[n_nJ]"; 118 | export coronal_nasals = "n" | "[nJ]" | "[n_n]" | "[n_nJ]"; 119 | 120 | export geminated_nasals = Optimize[geminated_hard_nasals | geminated_soft_nasals]; 121 | 122 | export nasals = Optimize[hard_nasals | soft_nasals | geminated_nasals]; 123 | 124 | # liquids 125 | 126 | export hard_liquids = "l" | "r"; 127 | export soft_liquids = "[lJ]" | "[rJ]"; 128 | export geminated_hard_liquids = "[l_l]" | "[r_r]"; 129 | export geminated_soft_liquids = "[l_lJ]" | "[r_rJ]"; 130 | export laterals = "l" | "[lJ]" | "[l_l]" | "[l_lJ]"; 131 | 132 | export 
geminated_liquids = Optimize[geminated_hard_liquids | geminated_soft_liquids]; 133 | 134 | export liquids = Optimize[hard_liquids | soft_liquids | geminated_liquids]; 135 | 136 | # approximant 137 | 138 | export approximant = "j"; 139 | 140 | # affricates 141 | 142 | export hard_affricates = "[d_Z]" | "[d_z]" | "[t_s]"; 143 | export soft_affricates = "[t_SJ]" | "[d_ZJ]"; 144 | 145 | export voiced_affricates = "[d_Z]" | "[d_z]" | "[d_ZJ]"; 146 | export voiceless_affricates = "[t_s]" | "[t_SJ]"; 147 | 148 | export anterior_affricates = "[d_z]" | "[t_s]"; 149 | 150 | export affricates = Optimize[hard_affricates | soft_affricates]; 151 | 152 | # consonants 153 | 154 | export hard_cons_phono = Optimize[hard_plosives | 155 | geminated_hard_plosives | 156 | hard_fricatives | 157 | geminated_hard_fricatives | 158 | hard_nasals | 159 | geminated_hard_nasals | 160 | hard_liquids | 161 | geminated_hard_liquids | 162 | hard_affricates 163 | ]; 164 | 165 | export voiced_consonants = Optimize[voiced_plosives | 166 | voiced_fricatives | 167 | nasals | 168 | liquids | 169 | approximant | 170 | voiced_affricates 171 | ]; 172 | 173 | export voiceless_consonants = Optimize[voiceless_plosives | 174 | voiceless_fricatives | 175 | voiceless_affricates 176 | ]; 177 | 178 | export consonants = Optimize[voiced_consonants | voiceless_consonants]; 179 | 180 | # vowels 181 | 182 | export stressed_close_vowels = "i" | "[i_x]" | "e"; 183 | export stressed_other_vowels = "a" | "u" | "o"; 184 | 185 | export stressed_vowels = Optimize[stressed_close_vowels | stressed_other_vowels]; 186 | 187 | export reduced_vowels = "[@_o]" | "E" | "[@_r]" | "I" | "U" | "@" | "[U_x]"; 188 | 189 | export vowels = Optimize[stressed_vowels | reduced_vowels]; 190 | 191 | # diphthongs 192 | 193 | export stressed_front_diphthongs = "[i_i]" | "[e_i]"; 194 | export stressed_other_diphthongs = "[a_i]" | "[o_i]" | "[u_i]"; 195 | 196 | export stressed_diphthongs = Optimize[stressed_front_diphthongs | 197 | 
stressed_other_diphthongs 198 | ]; 199 | 200 | export reduced_diphthongs = "[I_i]" | "[@_o_i]" | "[@_i]" | "[U_i]" | "[@_r_i]"; 201 | 202 | export diphthongs = Optimize[stressed_diphthongs | reduced_diphthongs]; 203 | 204 | # vowels + diphthongs 205 | 206 | export stressed_front_nuclei = Optimize[stressed_close_vowels | 207 | stressed_front_diphthongs]; 208 | 209 | export soft_nuclei = "i" | "e" | "[i_i]" | "[e_i]"; 210 | 211 | export nuclei = Optimize[vowels | diphthongs]; 212 | 213 | export phone = Optimize[nuclei | consonants]; 214 | 215 | # -------------------------------------------------------------------------------------- 216 | 217 | # MIXED 218 | # ----- 219 | 220 | export voiced = Optimize[voiced_consonant_letters | voiced_consonants]; 221 | export voiceless = Optimize[voiceless_consonant_letters | voiceless_consonants]; 222 | 223 | # -------------------------------------------------------------------------------------- 224 | 225 | # COMPLETE ALPHABET 226 | # ----------------- 227 | 228 | export letter_star = Optimize[letter*]; 229 | export phone_star = Optimize[phone*]; 230 | export sigma = Optimize[letter | phone | 231 | " " | "-" | "+" | 232 | "[SIL]" | "[ERROR]" | 233 | "[ADJ]" | "[VERB]" | 234 | "[WUD]" 235 | ]; 236 | export sigma_star = Optimize[sigma*]; 237 | 238 | # -------------------------------------------------------------------------------------- 239 | 240 | # FOR TESTING 241 | # ----------- 242 | composed_char = ("I_i":"[I_i]") | 243 | ("@_o":"[@_o]") | 244 | ("ZJ":"[ZJ]") | 245 | ("t_tJ":"[t_tJ]") | 246 | ("l_l":"[l_l]") | 247 | ("rJ":"[rJ]") | 248 | ("tJ":"[tJ]") | 249 | ("s_s":"[s_s]") | 250 | ("fJ":"[fJ]") | 251 | ("t_SJ":"[t_SJ]") | 252 | ("vJ":"[vJ]") | 253 | ("i_i":"[i_i]") | 254 | ("d_d":"[d_d]") | 255 | ("dJ":"[dJ]") | 256 | ("n_n":"[n_n]") | 257 | ("U_i":"[U_i]") | 258 | ("l_lJ":"[l_lJ]") | 259 | ("d_Z":"[d_Z]") | 260 | ("d_dJ":"[d_dJ]") | 261 | ("s_sJ":"[s_sJ]") | 262 | ("gJ":"[gJ]") | 263 | ("kJ":"[kJ]") | 264 | ("t_t":"[t_t]") | 
265 | ("e_i":"[e_i]") | 266 | ("t_s":"[t_s]") | 267 | ("i_x":"[i_x]") | 268 | ("sJ":"[sJ]") | 269 | ("d_z":"[d_z]") | 270 | ("SJ":"[SJ]") | 271 | ("mJ":"[mJ]") | 272 | ("@_o_i":"[@_o_i]") | 273 | ("pJ":"[pJ]") | 274 | ("zJ":"[zJ]") | 275 | ("xJ":"[xJ]") | 276 | ("a_i":"[a_i]") | 277 | ("@_r":"[@_r]") | 278 | ("bJ":"[bJ]") | 279 | ("u_i":"[u_i]") | 280 | ("o_i":"[o_i]") | 281 | ("d_ZJ":"[d_ZJ]") | 282 | ("nJ":"[nJ]") | 283 | ("@_i":"[@_i]") | 284 | ("U_x":"[U_x]") | 285 | ("lJ":"[lJ]") | 286 | ("@_r_i":"[@_r_i]") | 287 | ("n_nJ":"[n_nJ]") | 288 | ("SIL":"[SIL]") | 289 | ("ERROR":"[ERROR]") | 290 | ("ADJ":"[ADJ]") | 291 | ("VERB":"[VERB]") | 292 | ("WUD":"[WUD]") 293 | ; 294 | 295 | inv_composed_char = ("[@_i]":"@_i") | 296 | ("[i_i]":"i_i") | 297 | ("[u_i]":"u_i") | 298 | ("[o_i]":"o_i") | 299 | ("[tJ]":"tJ") | 300 | ("[t_tJ]":"t_tJ") | 301 | ("[vJ]":"vJ") | 302 | ("[t_t]":"t_t") | 303 | ("[U_x]":"U_x") | 304 | ("[lJ]":"lJ") | 305 | ("[n_n]":"n_n") | 306 | ("[xJ]":"xJ") | 307 | ("[n_nJ]":"n_nJ") | 308 | ("[dJ]":"dJ") | 309 | ("[I_i]":"I_i") | 310 | ("[fJ]":"fJ") | 311 | ("[bJ]":"bJ") | 312 | ("[l_l]":"l_l") | 313 | ("[zJ]":"zJ") | 314 | ("[@_r]":"@_r") | 315 | ("[d_ZJ]":"d_ZJ") | 316 | ("[i_x]":"i_x") | 317 | ("[a_i]":"a_i") | 318 | ("[e_i]":"e_i") | 319 | ("[d_d]":"d_d") | 320 | ("[s_s]":"s_s") | 321 | ("[kJ]":"kJ") | 322 | ("[l_lJ]":"l_lJ") | 323 | ("[t_s]":"t_s") | 324 | ("[d_z]":"d_z") | 325 | ("[nJ]":"nJ") | 326 | ("[t_SJ]":"t_SJ") | 327 | ("[d_dJ]":"d_dJ") | 328 | ("[SJ]":"SJ") | 329 | ("[@_o_i]":"@_o_i") | 330 | ("[pJ]":"pJ") | 331 | ("[s_sJ]":"s_sJ") | 332 | ("[gJ]":"gJ") | 333 | ("[U_i]":"U_i") | 334 | ("[sJ]":"sJ") | 335 | ("[@_o]":"@_o") | 336 | ("[d_Z]":"d_Z") | 337 | ("[ZJ]":"ZJ") | 338 | ("[mJ]":"mJ") | 339 | ("[@_r_i]":"@_r_i") | 340 | ("[rJ]":"rJ") | 341 | ("[ERROR]":"ERROR") | 342 | ("[SIL]":"SIL") | 343 | ("[ADJ]":"ADJ") | 344 | ("[VERB]":"VERB") | 345 | ("[WUD]":"WUD") 346 | ; 347 | 348 | simple_char = "S" | "ш" | "a" | "щ" | "п" | "d" | "E" | "д" 
| "j" | 349 | "Z" | "u" | "ь" | "k" | "g" | "ч" | "е" | "в" | "t" | 350 | "б" | "e" | "у" | "а" | "v" | "s" | "й" | "b" | "I" | 351 | "р" | "к" | "т" | "н" | "z" | "ц" | "U" | "м" | "r" | 352 | "о" | "ж" | "з" | "x" | "и" | "ъ" | "с" | "ф" | "л" | 353 | "я" | "@" | "f" | "i" | "n" | "ё" | "m" | "l" | "p" | 354 | "г" | "х" | "э" | "ы" | "ю" | "o" | " " | "-" | "+" 355 | ; 356 | 357 | export in_feeder = Optimize[(simple_char|composed_char)*]; 358 | export out_feeder = Optimize[(simple_char|inv_composed_char)*]; 359 | -------------------------------------------------------------------------------- /grammars/inflections.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 
15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | import 'definitions.grm' as defs; 22 | 23 | # set/class definitions 24 | # (RSEP is defs.RSEP extended with "[SIL]" and "[EOS]") 25 | WSEP = alphabets.word_sep; 26 | LSEP = defs.LSEP; 27 | RSEP = (defs.RSEP | "[SIL]" | "[EOS]"); 28 | vowel = defs.vowel; 29 | cons_letter_hyphen = defs.cons_letter_hyphen; 30 | stress_minus_1 = defs.stress_minus_1; 31 | before_stress = defs.before_stress; 32 | all = alphabets.letter | alphabets.nuclei | "-"; 33 | soft_cons = defs.soft_cons; 34 | hard_cons = defs.hard_cons; 35 | hard_cons_phono = alphabets.hard_cons_phono; 36 | # any mix of <all> symbols and "+", ending in an <all> symbol 37 | unstressed_seq = (all | "+")* all; 38 | 39 | #---------------------------------------------------------------------------- 40 | 41 | # EXCEPTIONS 42 | 43 | # SEE ALSO: letter_simplification rules in consonants.grm 44 | 45 | # letter <и> after hard consonants 46 | hard_i = CDRewrite[("и":"ы"), 47 | ( (hard_cons "[WUD]"? WSEP "+"?) | 48 | ("ж" | "ш" | "ц") "+"?), 49 | "", 50 | alphabets.sigma_star 51 | ]; 52 | 53 | # letter <и> after hard consonants (for WUD words) 54 | hard_i_phono = CDRewrite[("и":"ы"), 55 | (hard_cons_phono 56 | "[WUD]" WSEP "+"?), 57 | "", 58 | alphabets.sigma_star 59 | ]; 60 | 61 | # -чувст- --> -чуст- 62 | exception1 = CDRewrite[("в":""), 63 | "ч" "+"? "у", 64 | "с" "т", 65 | alphabets.sigma_star 66 | ]; 67 | 68 | # г --> х 69 | # легк- --> лехк- 70 | # лёгк- --> лёхк- 71 | exception2 = CDRewrite[("г":"х"), 72 | "л" "+"?
("е" | "ё"), 73 | "к", 74 | alphabets.sigma_star 75 | ]; 76 | 77 | # счаст --> щаст 78 | exception3a = CDRewrite[("с":"щ"), 79 | "", 80 | "ч" "а" "с" "т", 81 | alphabets.sigma_star 82 | ]; 83 | # second step: delete <ч> between <щ> and <аст> 84 | exception3b = CDRewrite[("ч":""), 85 | "щ", 86 | "а" "с" "т", 87 | alphabets.sigma_star 88 | ]; 89 | 90 | # for <контрр-> words (like <контрреволюционная>) 91 | # these are the only words that are pronounced with a double [rr] 92 | # most of them are included in the lexicon with a pseudo transcription 93 | # [ррр]; here we will simplify double <рр> to <р>, which makes the triple 94 | # pseudo transcription an actual double <рр>, which is then later mapped 95 | # correctly to double phonetic [rr] or [rJrJ] 96 | # 4a: delete <р> before <р> followed by a letter other than <р> 97 | exception4a = CDRewrite[("р":""), 98 | "", 99 | "р" (alphabets.letter - "р"), 100 | alphabets.sigma_star 101 | ]; 102 | # 4b: delete <р> before <рр> 103 | exception4b = CDRewrite[("р":""), 104 | "", 105 | "р" "р", 106 | alphabets.sigma_star 107 | ]; 108 | 109 | exceptions = Optimize[hard_i @ 110 | hard_i_phono @ 111 | exception1 @ 112 | exception2 @ 113 | exception3a @ 114 | exception3b @ 115 | exception4a @ 116 | exception4b 117 | ]; 118 | 119 | #---------------------------------------------------------------------------- 120 | 121 | # ONLY FOR ADJECTIVES 122 | 123 | # ого ->ово, его ->ево 124 | 125 | g_to_v = CDRewrite[("г":"в"), 126 | "[ADJ]" (all | "+")* ("о" | "е"), 127 | "+"?
"о" RSEP, 128 | alphabets.sigma_star 129 | ]; 130 | 131 | #---------------------------------------------------------------------------- 132 | 133 | # ONLY FOR ADJECTIVES 134 | 135 | # <ие>/ ST --> [i I] 136 | # <ее>/ ST --> [e E] 137 | # <ое>/ ST --> [o E] 138 | # <ые>/ ST --> [i_x I] 139 | 140 | e_stressed_pairs_left = ("и":"i") | 141 | ("е":"e") | 142 | ("о":"o") | 143 | ("ы":"[i_x]") 144 | ; 145 | 146 | e_stress_infl_left = CDRewrite[e_stressed_pairs_left, 147 | "[ADJ]" all* "+", 148 | "е" RSEP, 149 | alphabets.sigma_star 150 | ]; 151 | # <е> before RSEP --> [I] after [i] or [i_x] 152 | e_stress_infl_right1 = CDRewrite[("е":"I"), 153 | ("i" | "[i_x]"), 154 | RSEP, 155 | alphabets.sigma_star 156 | ]; 157 | # <е> before RSEP --> [E] after [e] or [o] 158 | e_stress_infl_right2 = CDRewrite[("е":"E"), 159 | ("e" | "o"), 160 | RSEP, 161 | alphabets.sigma_star 162 | ]; 163 | 164 | e_stress_infl = Optimize[e_stress_infl_left @ 165 | e_stress_infl_right1 @ 166 | e_stress_infl_right2 167 | ]; 168 | 169 | # <ие>/ ¬ST --> [I I] 170 | # <ее>/ ¬ST --> [I_i @] 171 | # <ое>/ ¬ST --> [@_i I] 172 | # <ые>/ ¬ST --> [@_r I] 173 | 174 | e_unstressed_pairs_left = ("и":"I") | 175 | ("е":"[I_i]") | 176 | ("о":"[@_i]") | 177 | ("ы":"[@_r]") 178 | ; 179 | 180 | e_unstress_infl_left = CDRewrite[e_unstressed_pairs_left, 181 | "[ADJ]" unstressed_seq, 182 | "е" RSEP, 183 | alphabets.sigma_star 184 | ]; 185 | # <е> before RSEP --> [I] after [I], [@_i] or [@_r] 186 | e_unstress_infl_right1 = CDRewrite[("е":"I"), 187 | ("I" | "[@_i]" | "[@_r]"), 188 | RSEP, 189 | alphabets.sigma_star 190 | ]; 191 | # <е> before RSEP --> [@] after [I_i] 192 | e_unstress_infl_right2 = CDRewrite[("е":"@"), 193 | "[I_i]", 194 | RSEP, 195 | alphabets.sigma_star 196 | ]; 197 | 198 | e_unstress_infl = Optimize[e_unstress_infl_left @ 199 | e_unstress_infl_right1 @ 200 | e_unstress_infl_right2 201 | ]; 202 | 203 | # <ую>/ ST --> [u_i U_x] 204 | # <ою>/ ST --> [o_i U_x] 205 | 206 | yu_stressed_pairs_left = ("у":"[u_i]") | 207 | ("о":"[o_i]") 208 | ; 209 | 210 | yu_stress_infl_left = CDRewrite[yu_stressed_pairs_left, 211 | "[ADJ]" all* "+", 212 | "ю" RSEP, 213 | alphabets.sigma_star 214 | ]; 215
| 216 | yu_stress_infl_right = CDRewrite[("ю":"[U_x]"), 217 | ("[u_i]" | "[o_i]"), 218 | RSEP, 219 | alphabets.sigma_star 220 | ]; 221 | 222 | yu_stress_infl = Optimize[yu_stress_infl_left @ 223 | yu_stress_infl_right 224 | ]; 225 | 226 | # <ую>/ ¬ST --> [U_i U_x] 227 | # <ею>/ ¬ST --> [I_i U_x] 228 | # <ою>/ ¬ST --> [@_i U_x] 229 | 230 | yu_unstressed_pairs_left = ("у":"[U_i]") | 231 | ("е":"[I_i]") | 232 | ("о":"[@_i]") 233 | ; 234 | 235 | yu_unstress_infl_left = CDRewrite[yu_unstressed_pairs_left, 236 | "[ADJ]" unstressed_seq, 237 | "ю" RSEP, 238 | alphabets.sigma_star 239 | ]; 240 | 241 | yu_unstress_infl_right = CDRewrite[("ю":"[U_x]"), 242 | ("[U_i]" | "[I_i]" | "[@_i]"), 243 | RSEP, 244 | alphabets.sigma_star 245 | ]; 246 | 247 | yu_unstress_infl = Optimize[yu_unstress_infl_left @ 248 | yu_unstress_infl_right 249 | ]; 250 | 251 | # <ой>/ ST --> [o_i] 252 | 253 | i_stress_infl_left = CDRewrite[("о":"[o_i]"), 254 | "[ADJ]" all* "+", 255 | "й" RSEP, 256 | alphabets.sigma_star 257 | ]; 258 | 259 | i_stress_infl_right = CDRewrite[("й":""), 260 | "[o_i]", 261 | RSEP, 262 | alphabets.sigma_star 263 | ]; 264 | 265 | i_stress_infl = Optimize[i_stress_infl_left @ 266 | i_stress_infl_right 267 | ]; 268 | 269 | # <ой>/ ¬ST --> [@_i] 270 | # <ей>/ ¬ST --> [I_i] 271 | # <ый>/ ¬ST --> [@_r_i] 272 | # <ий>/ ¬ST --> [I_i] 273 | 274 | i_unstress_pairs_left = ("о":"[@_i]") | 275 | ("е":"[I_i]") | 276 | ("ы":"[@_r_i]") | 277 | ("и":"[I_i]") 278 | ; 279 | 280 | i_unstress_infl_left = CDRewrite[i_unstress_pairs_left, 281 | "[ADJ]" unstressed_seq, 282 | "й" RSEP, 283 | alphabets.sigma_star 284 | ]; 285 | 286 | i_unstress_infl_right = CDRewrite[("й":""), 287 | "[@_i]" | 288 | "[I_i]" | 289 | "[@_r_i]" | 290 | "[I_i]", 291 | RSEP, 292 | alphabets.sigma_star 293 | ]; 294 | 295 | i_unstress_infl = Optimize[i_unstress_infl_left @ 296 | i_unstress_infl_right 297 | ]; 298 | 299 | 300 | # <ая>/ ST --> [a_i @] 301 | 302 | ya_stress_infl_left = CDRewrite[("а":"[a_i]"), 303 | "[ADJ]" all* 
"+", 304 | "я" RSEP, 305 | alphabets.sigma_star 306 | ]; 307 | 308 | ya_stress_infl_right = CDRewrite[("я":"@"), 309 | "[a_i]", 310 | RSEP, 311 | alphabets.sigma_star 312 | ]; 313 | 314 | ya_stress_infl = Optimize[ya_stress_infl_left @ 315 | ya_stress_infl_right 316 | ]; 317 | 318 | # <ая>/ ¬ST --> [@_i @] 319 | # <яя>/ ¬ST --> [@_i @] 320 | 321 | ya_unstress_infl_left1 = CDRewrite[("а":"[@_i]"), 322 | "[ADJ]" unstressed_seq, 323 | "я" RSEP, 324 | alphabets.sigma_star 325 | ]; 326 | 327 | ya_unstress_infl_left2 = CDRewrite[("я":"[@_i]"), 328 | "[ADJ]" unstressed_seq, 329 | "я" RSEP, 330 | alphabets.sigma_star 331 | ]; 332 | 333 | ya_unstress_infl_right = CDRewrite[("я":"@"), 334 | "[@_i]", 335 | RSEP, 336 | alphabets.sigma_star 337 | ]; 338 | 339 | ya_unstress_infl = Optimize[ya_unstress_infl_left1 @ 340 | ya_unstress_infl_left2 @ 341 | ya_unstress_infl_right 342 | ]; 343 | 344 | #---------------------------------------------------------------------------- 345 | 346 | # ONLY FOR VERBS 347 | 348 | # -тся, -ться --> [t_s@] 349 | 350 | verb_infl1 = CDRewrite[("т":"ц"), 351 | "[VERB]" (all | "+")*, 352 | "ь"? 
"с" "я" RSEP, 353 | alphabets.sigma_star 354 | ]; 355 | 356 | verb_infl2 = CDRewrite[("ь":""), 357 | "[VERB]" (all | "+")* "ц", 358 | "с" "я" RSEP, 359 | alphabets.sigma_star 360 | ]; 361 | 362 | verb_infl3 = CDRewrite[("с":""), 363 | "[VERB]" (all | "+")* "ц", 364 | "я" RSEP, 365 | alphabets.sigma_star 366 | ]; 367 | 368 | verb_infl4 = CDRewrite[("я":"@"), 369 | "[VERB]" (all | "+")* "ц", 370 | RSEP, 371 | alphabets.sigma_star 372 | ]; 373 | 374 | verb_infl = Optimize[verb_infl1 @ 375 | verb_infl2 @ 376 | verb_infl3 @ 377 | verb_infl4 378 | ]; 379 | 380 | #---------------------------------------------------------------------------- 381 | 382 | # clean POS tags 383 | 384 | clean_adj_pos = CDRewrite[("[ADJ]":""), 385 | "", 386 | "", 387 | alphabets.sigma_star 388 | ]; 389 | 390 | clean_verb_pos = CDRewrite[("[VERB]":""), 391 | "", 392 | "", 393 | alphabets.sigma_star 394 | ]; 395 | 396 | clean_pos = Optimize[clean_adj_pos @ clean_verb_pos]; 397 | 398 | #---------------------------------------------------------------------------- 399 | 400 | export inflections = Optimize[exceptions @ 401 | g_to_v @ 402 | e_stress_infl @ 403 | e_unstress_infl @ 404 | yu_stress_infl @ 405 | yu_unstress_infl @ 406 | i_stress_infl @ 407 | i_unstress_infl @ 408 | ya_stress_infl @ 409 | ya_unstress_infl @ 410 | verb_infl @ 411 | clean_pos 412 | ]; 413 | 414 | -------------------------------------------------------------------------------- /grammars/consonants.grm: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 
3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | # 13 | # Copyright 2014 Yandex LLC 14 | # All Rights Reserved. 15 | # 16 | # Author : Alexis Wilpert 17 | 18 | 19 | 20 | import 'alphabets.grm' as alphabets; 21 | # WUD below is the "[WUD]" tag immediately followed by SEP 22 | SEP = alphabets.word_sep; 23 | WUD = "[WUD]" SEP; 24 | 25 | #---------------------------------------------------------------------------- 26 | # letter-level simplifications: delete <т> between <с> and <н> | <ч> | <ск> 27 | letter_simplification1 = CDRewrite[("т":"")*, 28 | "с", 29 | ("н" | "ч" | "ск"), 30 | alphabets.sigma_star 31 | ]; 32 | 33 | # delete <д> between <з> and <н> | <ч> 34 | letter_simplification2 = CDRewrite[("д":"")*, 35 | "з", 36 | ("н" | "ч"), 37 | alphabets.sigma_star 38 | ]; 39 | # delete <д> between <р> and <ч> | <ц> 40 | letter_simplification3 = CDRewrite[("д":"")*, 41 | "р", 42 | ("ч" | "ц"), 43 | alphabets.sigma_star 44 | ]; 45 | # delete <в> between <л> and <ств> 46 | letter_simplification4 = CDRewrite[("в":"")*, 47 | "л", 48 | "ств", 49 | alphabets.sigma_star 50 | ]; 51 | # delete <н> between <л> and <ц> 52 | letter_simplification5 = CDRewrite[("н":"")*, 53 | "л", 54 | "ц", 55 | alphabets.sigma_star 56 | ]; 57 | 58 | # rules 1-5 composed in order 59 | letter_simplifications = Optimize[letter_simplification1 @ 60 | letter_simplification2 @ 61 | letter_simplification3 @ 62 | letter_simplification4 @ 63 | letter_simplification5 64 | ]; 65 | 66 | #---------------------------------------------------------------------------- 67 | # consonant letter --> corresponding "J"-suffixed phone symbol 68 | soft_cons = ("б":"[bJ]") | 69 | ("в":"[vJ]") | 70 | ("г":"[gJ]") | 71 | ("д":"[dJ]") | 72 | ("ж":"[ZJ]") | 73 | ("з":"[zJ]") | 74 | ("к":"[kJ]") | 75 | ("л":"[lJ]") | 76 | ("м":"[mJ]") | 77 | ("н":"[nJ]") | 78 | ("п":"[pJ]") | 79 | ("р":"[rJ]") | 80 | ("с":"[sJ]") | 81 | ("т":"[tJ]") | 82 | ("ф":"[fJ]") | 83 | ("х":"[xJ]") | 84 | ("щ":"[SJ]") 85 | ; 86 |
87 | softening = CDRewrite[soft_cons, 88 | "", 89 | "ь", 90 | alphabets.sigma_star] 91 | ; 92 | 93 | hard_cons = ("б":"b") | 94 | ("в":"v") | 95 | ("д":"d") | 96 | ("ж":"Z") | 97 | ("з":"z") | 98 | ("к":"k") | 99 | ("н":"n") | 100 | ("р":"r") | 101 | ("с":"s") | 102 | ("т":"t") | 103 | ("х":"x") 104 | ; 105 | 106 | hardening = CDRewrite[hard_cons, 107 | "", 108 | "ъ", 109 | alphabets.sigma_star] 110 | ; 111 | 112 | hard_soft_letters = Optimize[softening @ hardening]; 113 | 114 | 115 | soft_hard_chars = ("ь":"") | 116 | ("ъ":"") 117 | ; 118 | 119 | clean_soft_hard_chars = CDRewrite[soft_hard_chars, 120 | "", 121 | "", 122 | alphabets.sigma_star 123 | ]; 124 | 125 | #---------------------------------------------------------------------------- 126 | 127 | letter_pairs = Optimize[("п":"p") | 128 | ("б":"b") | 129 | ("т":"t") | 130 | ("д":"d") | 131 | ("к":"k") | 132 | ("г":"g") | 133 | ("м":"m") | 134 | ("н":"n") | 135 | ("л":"l") | 136 | ("р":"r") | 137 | ("ф":"f") | 138 | ("в":"v") | 139 | ("с":"s") | 140 | ("з":"z") | 141 | ("ж":"Z") | 142 | ("ш":"S") | 143 | ("щ":"[SJ]") | 144 | ("х":"x") | 145 | ("ц":"[t_s]") | 146 | ("ч":"[t_SJ]") 147 | ]; 148 | 149 | letter_g2p = CDRewrite[letter_pairs, 150 | "", 151 | "", 152 | alphabets.sigma_star 153 | ]; 154 | 155 | #---------------------------------------------------------------------------- 156 | 157 | export devoicing_pairs = ("b":"p") | 158 | ("d":"t") | 159 | ("g":"k") | 160 | ("[bJ]":"[pJ]") | 161 | ("[dJ]":"[tJ]") | 162 | ("[gJ]":"[kJ]") | 163 | ("[d_d]":"[t_t]") | 164 | ("[d_dJ]":"[t_tJ]") | 165 | 166 | ("Z":"S") | 167 | ("v":"f") | 168 | ("z":"s") | 169 | ("[ZJ]":"[SJ]") | 170 | ("[vJ]":"[fJ]") | 171 | ("[zJ]":"[sJ]") | 172 | 173 | ("[d_z]":"[t_s]") | 174 | ("[d_ZJ]":"[t_SJ]") 175 | ; 176 | 177 | export voicing_pairs = ("p":"b") | 178 | ("t":"d") | 179 | ("k":"g") | 180 | ("[pJ]":"[bJ]") | 181 | ("[tJ]":"[dJ]") | 182 | ("[kJ]":"[gJ]") | 183 | ("[t_t]":"[d_d]") | 184 | ("[t_tJ]":"[d_dJ]") | 185 | 186 | ("S":"Z") | 
187 | ("f":"v") | 188 | ("s":"z") | 189 | ("[SJ]":"[ZJ]") | 190 | ("[fJ]":"[vJ]") | 191 | ("[sJ]":"[zJ]") | 192 | 193 | ("[t_s]":"[d_z]") | 194 | ("[t_SJ]":"[d_ZJ]") 195 | ; 196 | 197 | export voicing_context = Optimize[(alphabets.voiced_consonants - 198 | ("v" | "[vJ]" | "j" | 199 | alphabets.liquids | 200 | alphabets.nasals 201 | ) 202 | ) 203 | ]; 204 | 205 | devoicing_context = alphabets.voiceless_consonants; 206 | 207 | voicing = CDRewrite[voicing_pairs*, 208 | "", 209 | WUD? voicing_context, 210 | alphabets.sigma_star 211 | ]; 212 | 213 | devoicing = CDRewrite[devoicing_pairs*, 214 | "", 215 | WUD? devoicing_context, 216 | alphabets.sigma_star 217 | ]; 218 | 219 | export voice_assimilation = Optimize[voicing @ devoicing]; 220 | 221 | #---------------------------------------------------------------------------- 222 | 223 | # context dependent palatalization rules 224 | 225 | # [d] --> [dJ] / _ [dJ] | [zJ] | [nJ] 226 | phonetic_palatalization1 = CDRewrite[("d":"[dJ]"), 227 | "", 228 | WUD? ("[dJ]" | "[zJ]" | "[nJ]"), 229 | alphabets.sigma_star 230 | ]; 231 | 232 | # [d] --> [tJ] / _ [tJ] 233 | phonetic_palatalization2 = CDRewrite[("d":"[tJ]"), 234 | "", 235 | WUD? "[tJ]", 236 | alphabets.sigma_star 237 | ]; 238 | 239 | # [n] --> [nJ] / _ [tJ] | [sJ] | [t_SJ] | [nJ] | [SJ] 240 | phonetic_palatalization3 = CDRewrite[("n":"[nJ]"), 241 | "", 242 | WUD? ("[tJ]" | "[sJ]" | 243 | "[t_SJ]" | "[nJ]" | "[SJ]"), 244 | alphabets.sigma_star 245 | ]; 246 | 247 | # [s] --> [SJ] / _ [t_SJ] 248 | phonetic_palatalization4 = CDRewrite[("s":"[SJ]"), 249 | "", 250 | WUD? "[t_SJ]", 251 | alphabets.sigma_star 252 | ]; 253 | 254 | # [s] --> [S] / _ [S] 255 | phonetic_palatalization5 = CDRewrite[("s":"S"), 256 | "", 257 | WUD? "S", 258 | alphabets.sigma_star 259 | ]; 260 | 261 | # [s] --> [Z] / _ [Z] 262 | phonetic_palatalization6 = CDRewrite[("s":"Z"), 263 | "", 264 | WUD? 
"Z", 265 | alphabets.sigma_star 266 | ]; 267 | 268 | # [s] --> [sJ] / _ [sJ] | [tJ] | [nJ] 269 | phonetic_palatalization7 = CDRewrite[("s":"[sJ]"), 270 | "", 271 | WUD? ("[sJ]" | "[tJ]" | "[nJ]"), 272 | alphabets.sigma_star 273 | ]; 274 | 275 | # [s] --> [zJ] / _ [zJ] 276 | phonetic_palatalization8 = CDRewrite[("s":"[zJ]"), 277 | "", 278 | WUD? "[zJ]", 279 | alphabets.sigma_star 280 | ]; 281 | 282 | # [t] --> [dJ] / _ [dJ] 283 | phonetic_palatalization9 = CDRewrite[("t":"[dJ]"), 284 | "", 285 | WUD? "[dJ]", 286 | alphabets.sigma_star 287 | ]; 288 | 289 | # [t] --> [tJ] / _ [tJ] | [sJ] | [t_SJ] | [nJ] | [SJ] 290 | phonetic_palatalization10 = CDRewrite[("t":"[tJ]"), 291 | "", 292 | WUD? ("[tJ]" | "[sJ]" | 293 | "[t_SJ]" | "[nJ]" | "[SJ]"), 294 | alphabets.sigma_star 295 | ]; 296 | 297 | # [v] --> [vJ] / _ [vJ] 298 | # [f] --> [vJ] / _ [vJ] 299 | phonetic_palatalization11a = CDRewrite[("v":"[vJ]") | ("f":"[vJ]"), 300 | "", 301 | WUD? "[vJ]", 302 | alphabets.sigma_star 303 | ]; 304 | 305 | # [f] --> [fJ] / _ [fJ] 306 | # [v] --> [fJ] / _ [fJ] 307 | phonetic_palatalization11b = CDRewrite[("f":"[fJ]") | ("v":"[fJ]"), 308 | "", 309 | WUD? "[fJ]", 310 | alphabets.sigma_star 311 | ]; 312 | 313 | 314 | # [m] --> [mJ] / _ [mJ] 315 | phonetic_palatalization11c = CDRewrite[("m":"[mJ]"), 316 | "", 317 | WUD? "[mJ]", 318 | alphabets.sigma_star 319 | ]; 320 | 321 | # [r] --> [rJ] / _ [rJ] 322 | phonetic_palatalization11d = CDRewrite[("r":"[rJ]"), 323 | "", 324 | WUD? "[rJ]", 325 | alphabets.sigma_star 326 | ]; 327 | 328 | # [l] --> [lJ] / _ [lJ] 329 | phonetic_palatalization11e = CDRewrite[("l":"[lJ]"), 330 | "", 331 | WUD? "[lJ]", 332 | alphabets.sigma_star 333 | ]; 334 | 335 | # [z] --> [S] / _ [S] 336 | phonetic_palatalization12 = CDRewrite[("z":"S"), 337 | "", 338 | WUD? "S", 339 | alphabets.sigma_star 340 | ]; 341 | 342 | # [z] --> [Z] / _ [Z] 343 | phonetic_palatalization13 = CDRewrite[("z":"Z"), 344 | "", 345 | WUD? 
"Z", 346 | alphabets.sigma_star 347 | ]; 348 | 349 | # [z] --> [sJ] / _ [sJ] | [tJ] 350 | phonetic_palatalization14 = CDRewrite[("z":"[sJ]"), 351 | "", 352 | WUD? ("[sJ]" | "[tJ]"), 353 | alphabets.sigma_star 354 | ]; 355 | 356 | # [z] --> [zJ] / _ [zJ] | [dJ] | [nJ] 357 | phonetic_palatalization15 = CDRewrite[("z":"[zJ]"), 358 | "", 359 | WUD? ("[zJ]" | "[dJ]" | "[nJ]"), 360 | alphabets.sigma_star 361 | ]; 362 | 363 | phonetic_palatalization = Optimize[phonetic_palatalization1 @ 364 | phonetic_palatalization2 @ 365 | phonetic_palatalization3 @ 366 | phonetic_palatalization4 @ 367 | phonetic_palatalization5 @ 368 | phonetic_palatalization6 @ 369 | phonetic_palatalization7 @ 370 | phonetic_palatalization8 @ 371 | phonetic_palatalization9 @ 372 | phonetic_palatalization10 @ 373 | phonetic_palatalization11a @ 374 | phonetic_palatalization11b @ 375 | phonetic_palatalization11c @ 376 | phonetic_palatalization11d @ 377 | phonetic_palatalization11e @ 378 | phonetic_palatalization12 @ 379 | phonetic_palatalization13 @ 380 | phonetic_palatalization14 @ 381 | phonetic_palatalization15 382 | ]; 383 | 384 | #---------------------------------------------------------------------------- 385 | 386 | # devoice final consonants (before WUD) in SIL or EOS context 387 | 388 | wud_final_devoicing = CDRewrite[devoicing_pairs*, 389 | "", 390 | WUD ("[SIL]" | "[EOS]"), 391 | alphabets.sigma_star 392 | ]; 393 | 394 | #---------------------------------------------------------------------------- 395 | 396 | # ([s] | [z]) --> EPS / _ [S] [t_S] 397 | 398 | consonant_simplification_pairs = ("s":"") | ("z":""); 399 | 400 | consonant_simplification1 = CDRewrite[consonant_simplification_pairs*, 401 | "", 402 | ("S" "[t_SJ]"), 403 | alphabets.sigma_star 404 | ]; 405 | 406 | consonant_simplification2 = CDRewrite["[t_s]":"", 407 | "", 408 | "-"? 
"[t_s]", 409 | alphabets.sigma_star 410 | ]; 411 | 412 | consonant_simplification = Optimize[consonant_simplification1 @ 413 | consonant_simplification2 414 | ]; 415 | 416 | #---------------------------------------------------------------------------- 417 | 418 | # gemination 419 | 420 | # SYMBOL transcription example 421 | # --------------------------------------------- 422 | # PH: t_t @_o t_t o k отток 423 | # PH: t_tJ p @_o t_tJ e m @ подтема 424 | # PH: d_d @_o d_d a m отдам 425 | # PH: d_dJ @_o d_dJ e l отдел 426 | # PH: s_s r @_o s_s a d @ рассада 427 | # PH: s_sJ r a s_sJ e l I n @ расселина 428 | # PH: n_n v a n_n @ ванна 429 | # PH: n_nJ v a n_nJ @ ванне 430 | # PH: l_l vJ i l_l @ вилла 431 | # PH: l_lJ vJ i l_lJ @ Вилли 432 | 433 | hard_geminate_pairs = (("t":"[t_t]") "t") | 434 | (("d":"[d_d]") "d") | 435 | (("s":"[s_s]") "s") | 436 | (("n":"[n_n]") "n") | 437 | (("l":"[l_l]") "l") 438 | ; 439 | 440 | hard_gemination = CDRewrite[hard_geminate_pairs, 441 | "", 442 | "", 443 | alphabets.sigma_star 444 | ]; 445 | 446 | clean_hard_geminate_pairs = ("[t_t]" ("t":"")) | 447 | ("[d_d]" ("d":"")) | 448 | ("[s_s]" ("s":"")) | 449 | ("[n_n]" ("n":"")) | 450 | ("[l_l]" ("l":"")) 451 | ; 452 | 453 | clean_hard_gemination = CDRewrite[clean_hard_geminate_pairs, 454 | "", 455 | "", 456 | alphabets.sigma_star 457 | ]; 458 | 459 | soft_geminate_pairs1 = ("t":"[t_tJ]") "[tJ]" | 460 | ("d":"[d_dJ]") "[dJ]" | 461 | ("s":"[s_sJ]") "[sJ]" | 462 | ("n":"[n_nJ]") "[nJ]" | 463 | ("l":"[l_lJ]") "[lJ]" 464 | ; 465 | 466 | soft_geminate_pairs2 = ("[tJ]":"[t_tJ]") "[tJ]" | 467 | ("[dJ]":"[d_dJ]") "[dJ]" | 468 | ("[sJ]":"[s_sJ]") "[sJ]" | 469 | ("[nJ]":"[n_nJ]") "[nJ]" | 470 | ("[lJ]":"[l_lJ]") "[lJ]" 471 | ; 472 | 473 | soft_gemination = CDRewrite[soft_geminate_pairs1 | 474 | soft_geminate_pairs2, 475 | "", 476 | "", 477 | alphabets.sigma_star 478 | ]; 479 | 480 | clean_soft_geminate_pairs = ("[t_tJ]" ("[tJ]":"")) | 481 | ("[d_dJ]" ("[dJ]":"")) | 482 | ("[s_sJ]" ("[sJ]":"")) | 483 | 
("[n_nJ]" ("[nJ]":"")) | 484 | ("[l_lJ]" ("[lJ]":"")) 485 | ; 486 | 487 | clean_soft_gemination = CDRewrite[clean_soft_geminate_pairs, 488 | "", 489 | "", 490 | alphabets.sigma_star 491 | ]; 492 | 493 | gemination = Optimize[hard_gemination @ 494 | clean_hard_gemination @ 495 | soft_gemination @ 496 | clean_soft_gemination 497 | ]; 498 | 499 | #---------------------------------------------------------------------------- 500 | 501 | export consonant_rules = Optimize[letter_simplifications @ 502 | hard_soft_letters @ 503 | clean_soft_hard_chars @ 504 | letter_g2p @ 505 | voice_assimilation @ 506 | phonetic_palatalization @ 507 | wud_final_devoicing @ 508 | consonant_simplification @ 509 | gemination 510 | ]; 511 | 512 | -------------------------------------------------------------------------------- /test/rus_sentences.txt.g2p: -------------------------------------------------------------------------------- 1 | sU-vo-r@f s lJU_x-b@_o-pi_xt-stv@m r@_o-spra-S@_r-v@l sla-vn@-g@ mJI-tJeZ-nJI-k@ @_o jI-vo v@_o-je-n_n@_rx dJe_ist-vJI-jI-xJi n@_o-mJe-rJI-nJI-j@x I p@-vJos jI-vo f sJIm-bJirsk SIL kU-da dol-Z@_rn bi_xl prJI-je-x@tJ I graf pa-nJIn 2 | I pro-st@ z@-d@_ox-nul-sJ@ @_ot nJI-@_o-Zi_x-d@-n_n@_o_i ra-d@-sJtJI 3 | fsko-rJI n@_o-t_SJal n@_o-kra-p@_r-v@dJ doZdJ SIL m@_o-t_SJif-S@_r_i naz d@_o sa-m@-v@ mJe-st@ la-gJI-rJ@ Ek-spJI-dJi-t_s@-I SIL par-tJI-I jI-lJI-z@_o-vJe-t@_r vl@_o-dJi-mJI-r@_o-vni_x 4 | @_o-nJi sJI-dJe-lJI n@_od mJi-r@m SIL U-t_SJast-vU_x-j@ v op-SJIm pr@_o-t_se-sJI Zi_x-zJnJI I gJi-bJI-lJI I prJI-s@-jI-dJI-nJa-j@ k sv@_o-je_i suSJ-n@-sJtJI fsJe dU-Si_x SIL lJit_SJ-n@-sJtJI I t_SJust-v@ ja 5 | jI-vo tol-st@-j@ Se-j@ vzdu-l@zJ b@_o-gro-v@_r-mJI Zi_x-l@-mJI SIL I S@_r-ro-k@-jI skU-la-st@-jI lJI-t_so n@_o-lJi-l@sJ krovJ-jU_x 6 | tru-s@sJtJ jI-vo nJI d@z-v@_o-lJa-l@ jI-mu x@-r@_o-SenJ-k@ Iz-jI-sJnJitJ-sJ@ SIL I tak s@_ol-gal on ra-z@ dva I-lJI trJi bJIs pra-vJIl SIL @_o kag g@-sp@_o-dJin z@-krJI-t_SJal jI-mu SIL Sto-b@_r on p@-Sol d@_o-mo_i SIL to tJem dJe-l@ 
I konJ-t_SJI-l@sJ 7 | U-kla-d@_r-v@-jItJ-sJ@ v u-skJI_i t@_o-nelJ n@_o-ut_SJ-n@_rx zna-nJI_i SIL v@_r-smJe-I-v@-jItJ-sJ@ I-lJI nJI z@-mJI-t_SJa-jItJ-sJ@ 8 | k@-mJI-tJed gr@_oZ-dan-sk@-jI s@_o-dJe_ist-vJI-jI v@_r-d@-jot i_xm x@_o-da-t@_ist-v@ @_o rJI-gJI-stra-t_s@-I I @_o-br@_o-SJe-nJI-j@ k s@_o-trudJ-nJI-k@m mft s r@z-jI-sJnJe-nJI-jIm rJI-alJ-n@-g@ p@-l@_o-Ze-nJI-j@ e-tJIx lJU-dJe_i 9 | @_od-na-k@ dnk s@_o-ma p@_o sJI-bJe nJI-sJot v@_oZ-nJe_i-SU_x-jU_x funk-t_s@-jU_x k@_o-dJi-r@-v@-nJI-j@ @_o-mJI-n@-kJI-slot-n@_o_i p@_o-slJe-d@-v@-tJIlJ-n@-sJtJI bJIl-kov SIL I mi_x nJI mo-Z@_rm pr@-Iz-volJ-n@ jI-jo m@-dJI-fJI-t_si_x-r@-v@tJ SIL nJI z@_o-tro-nU_xf e-t@_o_i In-f@_or-ma-t_s@-I I nJI I-zmJI-nJif sp@_o-sob-n@-sJtJI dnk k U_xd-v@_o-je-nJI-jU_x 10 | sJI-t_SJas o-t_SJInJ InJ-tJI-rJe-snU_x-jU_x ZenJ-SJI-nU_x pJi-SU_xt 11 | z@-st@_o-vlJa-lJI m@_o-lJitJ-sJ@ SIL g@_o-to-vJI-lJI je-dU_x SIL r@_o-ska-z@_r-v@-lJI @_o-nJIg-do-t@_r 12 | @_o-fJI-t_se-r@m I @_of-t@_o-matJ-t_SJI-k@m Iz ro-t@_r @_o-xra-n@_r p@_o-lJa-k@f prJI-k@_o-zal t_SJSJa-tJIlJ-n@ @_o-sm@_o-trJetJ @_o-krJe-sn@sJtJ SIL @_o sam s nJem-t_s@-mJI I z@-nJIl-sJa nJI-p@_o-srJet-stvJI-n_n@ U-t_SJast-k@m SIL gdJe r@-sp@-l@_o-ga-l@sJ jI-dro grup-p@_r 13 | k@-lJI-nJIn pr@_o-je-x@l SIL nJI z@-InJ-tJI-rJI-s@_o-val-sJ@ 14 | n@t_SJ-nJom z dJI-pU-ta-t@v 15 | v nJUrn-bJer-gJI b@_r-la zn@-mJI-nJi-t@-j@ f@_o-mJi-lJI-j@ fU_xg-gJI-r@v SIL b@_on-kJi-r@v SIL kUp-t_sof t@_o-vo vrJe-mJI-nJI SIL vro-dJI rot-S@_rlJ-d@v SIL @_o-nJi prJI-dl@_o-Zi_x-lJI U kUr-fJir-st@ vzJatJ n@_o ot-kU_xp @_rn-dUlJ-gJen-t_s@-I 16 | iz-za smut I bJI-sp@_o-rJat-k@f e-t@-v@ vrJe-mJI-nJI SIL sU_x-d@_o-xot-stv@ f p@r-tU-galJ-skJIx vo-d@x p@_ot_SJ-tJi prJI-kr@_o-tJi-l@sJ SIL tak Sto nJe-t_SJI-v@ bi_x-l@ @_o-p@_o-satJ-sJ@ da-Z@_r slU-t_SJa_i-n@-g@ @_ob-n@-rU-Ze-nJI-j@ sud-n@ v e-tJIx mJI-stax 17 | bo-lJI-jI t@_o-vo SIL mi_x SJt_SJI-ta-jIm SIL Sto krJI-dJit v bu-dU_x-SJIm mo-Z@_rt statJ @_olJ-t@_rr-n@_o-tJi-v@_o_i dol-gU_x 18 | pr@_o-Slo p@_o kra_i-nJI_i mJe-rJI s 
p@l-t_SJI-sa SIL p@_o-ka U-tJi-xlJI e-tJI xv@_o-ta-jU_x-SJI-jI z@_o sJer-t_s@ r@_r-da-nJI-j@ 19 | n@_o I-kra-n@x SIL b@_olJ-Si_xx I ma-l@_rx SIL t_s@_o-rJat @_o-mJI-rJI-kan-skJI-jI fJilJ-m@_r 20 | pJI-rJI-xot k pr@_o-sto_i br@_o-nJi bi_xl b@_r slJiS-k@m krut I z@-mJo-tJIn bJIz ra-zn@_rx pJI-rJI-lJi-v@v SIL @_o-t_tJen-k@f I m@_o-stov 21 | fsJex Z@_rlJ-jom @_o-bJI-spJe-t_SJIl 22 | k@_og-da s@_o-trudJ-nJI-kJI kkp vi_x-sk@-t_SJI-lJI Iz zda-nJI-j@ SIL to n@_o U-ka-z@-n_n@m mJe-sJtJI @_ob-n@_o-ru-Z@_r-lJI lJiS klJU-t_SJi @_ot m@_o-Si_x-n@_r 23 | da-Z@_r nJI-k@_o-la_i z@-prJI-SJal @_ob e-t@m slu-t_SJ@-jI fsp@-mJI-natJ SIL sra-zU_x bJI-lJI-nJil-sJ@ 24 | I d@_o-nJi-l@ s@_o-zo-n@_rd_ZJ z@-vJol r@-zg@_o-vor @_o p@-tJom-kJI-nJI SIL k@_o-to-r@_r_i g@-v@_o-rJil jI-mu SIL Sto pJI-rJI-xo-dJIt @_o-pJadJ z@_o m@_o-sko-fskU_x-jU_x z@_o-sta-vU_x 25 | kom-n@-t@ SIL f k@_o-to-r@_o_i nas prJI-nJI-ma-lJI SIL b@_r-la SIL k@_o-nJet_SJ-n@ SIL sa-m@-j@ pr@_o-stor-n@-j@ v do-mJI SIL jI-jo z@_o-ra-nJI-jI mi_x-lJI I t_SJi-sJtJI-lJI I pJI-rJId o-br@-z@-mJI z@_o-tJe-plJ@-lJI l@_om-pa-d@_r 26 | p@_o-slJedJ-nJI_i ka_if lJe-t@ pJI-rJIt skU-t_SJi-SJI_i U_x-t_SJo-b@_r I p@f-sJIdJ-nJe-vn@-sJtJI 27 | ja vJi-dJIl tJI-pJerJ SIL Sto n@_o sa-m@m dJe-lJI on nJI U_x-sk@_o-gla-z@_r_i SIL @_o pro-st@ IsJ tJex lJU-dJe_i SIL k@_o-to-r@_r-jI fsJIg-da smo-trJ@t fprJI-SJur 28 | tolJ-k@ Iz-vJI-nJi-tJI 29 | vdruk p@-mJI-sJtJil-sJ@ p@-srJI-dJi t_SJI-sJtJi-lJI-SJ@ SIL kag b@_r nJI-k@_o-mu I nJI mJI-Sa-j@ 30 | t@_o-ko_i InJ-tJI-l_lJIk-tU-alJ-n@_r_i m@-r@_o-fon 31 | sta-r@_r_i tUrk-mJen SIL proz-v@-n_n@_r_i x@_o-ro-br@_rx t_s@-rJom mJI-nJI-l@-jIm SIL r@-zdU-val slu-xJI SIL Sto v mJer-gJI-lJI sJI-dJid zlo_i dux I bu-dJId ZI-sto-k@ msJtJitJ kaZ-d@-mU_x SIL kto d@_i-dJod d@_o sJer-t_s@ g@_o-ri_x 32 | b@-g@_o-slo-vJI-jI f kJi-jI-vJI on t_SJI-tal p@_o @_ok-vJI-na-tU_x 33 | I sJI-v@_osJtJ-ja-n@f nJI U_x-dJI-vlJa-jItJ-sJ@ SIL Sto sJem-k@ pJi-S@_rt pJI-sJmo bra-tU_x SIL spJa-SJI-mU_x f s@_o-sJedJ-nJI_i kom-n@-tJI SIL SIL nJI d@_o 
t@_o-vo sJI-v@_osJtJ-ja-n@-vU_x 34 | mi_x nJI m@_o-glJi @_o-prJI-dJI-lJitJ SIL Sto tJI-pJerJ SIL sol-nJIt_SJ-n@_r_i lJI dJenJ I-lJI nJI-pr@-nJI-t_sa-jI-m@_r_i tU-man SIL I-b@ v lJe-sU_x bi_x-lJI su-mJIr-kJI SIL kak v na-S@_rx S@_r-ro-t@x t_SJI-rJISJ t_SJas po-slJI sol-nJIt_SJ-n@-g@ z@_o-ka-t@ 35 | nJI I-zbJI-Zal e-t@_o_i u-t_SJ@-sJtJI I en 36 | e-t@ Ze nJI pa-st@ SIL @_o t_SJi-st@-j@ @_o-tra-v@ 37 | z@_o nJim @_o-na SIL prJI-kra-sn@-j@ kak fsJIg-da SIL fsJa v bJe-l@m SIL fsJa f t_svJI-tax p@-mJI-ran-t_s@ SIL z dlJi-n_n@_rm blon-d@-v@_rm vU-a-lJIm n@_o g@-l@_o-vJe SIL k@_o-to-r@_r_i Z@_r-v@_o-pJi-sn@ spU-skal-sJ@ n@_o-zad SIL z blJI-sJtJa-SJIm Si_x-fr@m n@_o lJe-v@m plJI-t_SJe 38 | je-slJI b@_r k@_o-ko_i-lJi-b@ svJI-SJe-n_nJIk vzdu-m@l U-stro-ItJ rJI-lJI-gJI-o-zn@_r-jI t_SJtJe-nJI-j@ dlJ@_o n@_o-ro-d@ f sv@_o-je_i Ze t_serk-vJI SIL no f t_SJI-si_x SIL k@_og-da nJed b@-g@-slU-Ze-nJI-j@ v nJe_i SIL dlJ@_o e-t@-v@ on prJId-v@_o-rJi-tJIlJ-n@ dol-Z@_rn @_r-spr@_o-sJitJ sJI-bJe @_o-so-b@-jI r@-zrJI-Se-nJI-jI jI-p@r-xJI-alJ-n@_o_i vla-sJtJI 39 | kag bu-t_t@ ranJ-S@_r lJu-dJI s@_o vrJe-mJI-nJIm m@-l@_o-dJe-lJI 40 | k@_o-bJin-k@ b@_r-la p@_o-t_SJi-SJI 41 | sJInJ-tJabrJ v @_r-zra-I-lJI SIL mJe-sJ@t_s n@-v@_o-godJ-nJI_i I p@_o-e-t@-mU_x nJI o-t_SJInJ pJI-rJI-gru-Z@_r-n_n@_r_i p@_o-lJi-tJI-k@_o_i 42 | zJdJe-l@-l@ e-t@ z gru-sn@_rm vJi-d@m ma-mJInJ-k@ SIL tol-st@-j@ da-m@ SIL b@_olJ-Sa-j@ kU-rJi-tJIlJ-nJI-t_s@ I spJI-t_s@_r-@_o-lJist-k@ f prJI-fJI-ranJ-sJI 43 | z@_o-sJi-mU_x I sav-v@_o-tJi-j@ 44 | t_SJI-rJIz dva mJe-sJ@-t_s@ SIL vi_x b@_olJ-ni_x @_o-t_t@_o-vo SIL Sto nJI p@-bJI-rJI-glJisJ 45 | r@-SJot U nJI-vo bi_xl pr@_o-sto_i SIL t_SJem mJenJ-S@_r go-sJtJI pr@_o-bu-dU_xt v la-gJI-rJI SIL tJem lut_SJ-S@_r SIL SIL la-gJIrJ fsJIg-da k@_o-zal-sJ@ jI-mu kJI-pJa-SJIm k@_o-tlom SIL jI-Z@_r-mJI-nut-n@ g@_o-to-v@_rm vz@_or-vatJ-sJ@ 46 | fsJIg-da mJIt_SJ-ta-l@ I-mJetJ mno-g@ dJI-tJe_i SIL @_o tut SJt_SJasJtJ-jI v rU-kJi SIL dJe-v@t_SJ-k@ sla-vn@-j@ SIL s@-m@-st@_o-ja-tJIlJ-n@-j@ 47 | 
I-lJI druk g dru-gU_x SIL vr@_oZ-dJeb-n@ prJI-stra-sn@_r-jI 48 | g@-sU-dar-stvJI-n_n@-j@ tJI-rJI-to-rJI-j@ SIL d@_o-to-lJI z@-klJU_x-t_SJo-n_n@-j@ f prJI-dJe-l@x pJIr-v@-n@_o-t_SJalJ-n@-g@ r@-s_sJI-lJe-nJI-j@ vJI-lJI-k@_o-ru-sk@-g@ plJe-mJI-nJI SIL tJI-pJerJ pJI-rJI-xo-dJId d@-lJI-ko z@_o e-tJI prJI-dJe-l@_r I p@-sJtJI-pJe-n_n@ vbJI-ra-jIt f sJI-bJa fsJu ru-skU_x-jU_x r@_o-vnJi-nU_x SIL r@-spr@-str@_o-nJa-j@sJ kag d@_o gJI-@_o-gr@_o-fJi-t_SJI-skJIx jI-jo gr@_o-nJit_s SIL tak p@_ot_SJ-tJi vJI-zJdJe d@_o prJI-dJe-l@f ru-sk@-g@ n@-r@-d@-n@-sJI-lJe-nJI-j@ 49 | zJdJesJ prJeZ-dJI fsJI-vo b@_olJ-So-jI zn@_o-t_SJe-nJI-jI I-mJe-jIt pra-vJIlJ-n@_r_i @_od-bor p@-stU-pa-jU_x-SJIx n@_o v@_o-je-n_nU_x-jU_x sluZ-bU_x 50 | mo-j@ k@_o-bi_x-l@-t@ jI-SJo zJI-mo_i d@_o t@_o-vo prJI-vi_x-kl@ s@_o-lo-mU_x v@_o-zJitJ SIL Sto z z@_o-kri_x-t@_r-mJI gl@_o-za-mJI p@_o t@_o-mu m@_or-Sru-tU_x x@_o-dJi-l@ 51 | s@_o-trudJ-nJI-kJI mJI-lJi-t_s@-I da-Z@_r v@_r-jIZ-Za-lJI n@_o strJel-kJI SIL gdJe o-t_SJInJ bi_x-str@ I d@_o-stup-n@ @_ob-j@-sJnJ@-lJI sv@_o-im @_op-p@_o-nJen-t@m nJI-@_op-x@_o-dJi-m@sJtJ @_o-sta-vJItJ f p@_o-ko-jI @_o-pJI-ka-jI-m@_rx I-lJI k@mJ-mJIr-san-t@v 52 | lJU-bovJ SIL mJe-r@ @_o-d@-rJI-n_n@-sJtJI Zi_xzJnJ-jU_x lJU-dJe_i SIL no @_o-na SIL v@-prJI-kJi fsJI-mu SIL v o-t_SJInJ ma-l@_o_i sJtJe-pJI-nJI sJIk-sU-alJ-n@sJtJ 53 | nU SIL vi_x xra-br@_r_i 54 | box vJesJtJ SIL fstrJe-tJIm-sJ@ lJI jI-SJo 55 | I-b@ kag b@_r nJI bi_xl t_SJI-l@_o-vJek mal SIL no jesJtJ k@_o-kJi-jI-t@ rJI-zUlJ-ta-t@_r jI-vo Zi_x-zJnJI 56 | staf @_o-dJin ras vr@_o-zrJes s ma-tJIrJ-jU_x I sJo-str@-mJI SIL @_o-na nJI U-mJe-l@ s nJi-mJI s@_o_i-tJisJ sno-v@ SIL @_o @_o-nJi e-t@-v@ nJI I-ska-lJI 57 | @_o-dJin z@_o-su-nU_xl jI-vo v n@_o-sJtJe-n_n@_r-jI t_SJI-si_x s kU-kuS-k@_o_i 58 | t@_og-da ja SIL @_o-b@-r@_o-tJasJ SIL U-vJi-dJIl n@_o go-rJI pro-tJIf nas SIL z@_o rJet_SJ-k@_o_i SIL mno-Z@_rst-v@ k@_o-lJu-Z@_i SIL @_o svJerx t@_o-vo t_SJI-l@_o-vJeg dvat-t_s@dJ bJI-Zaf-S@_rx SIL Stop @_o-trJe-z@tJ nas tr@_o-ix @_ot 
na-S@_rx t@_o-va-rJI-SJI_i SIL mJeZ-dU_x tJem strJI-li_x si_x-p@-lJIsJ n@_o nas SIL kag grad 59 | mi_x nJI @_o-st@_o-na-vlJI-v@-jIm-sJ@ n@_o d@_o-sJtJig-nU_x-t@m SIL st@_o-ra-j@sJ @_ox-v@_o-tJitJ vJesJ spJektr n@-pr@_o-vlJe-nJI_i p@d-g@_o-tof-kJI stU-dJen-t@v 60 | @_o n@_o mnJe SIL ja znal SIL lJI-Za-l@ fsJa @_ot-vJet-stvJI-n_n@zJdJ z@_o U-spJex jI-jo 61 | s@_o-bo-jU_x ix nJI z@-sl@_o-nJu SIL x@_o-tJa ja I af-t@r SIL vJIr-nJe-jI SIL @_od-no Is ft@-r@-sJtJI-pJe-n_n@_rx lJit_s n@_o z@_o-dax m@_o-s_sof-kJI 62 | p@_o-ka-t_SJI-v@-j@ g@-l@_o-vo_i I zgor-bJIf-S@_rsJ SIL on v@z-vr@_o-SJa-jItJ-sJ@ k @_ok-nu SIL @_o-trJad dlJi-n_n@_o_i I nJI-ro-vn@_o_i t_sepJ-jU_x v@_r-p@_ol-za-jIt t_SJI-rJIz vo-r@-t@ plat_s-p@_o-ra-d@ 63 | no e-t@ @_o-k@_o-za-l@sJ rJI-alJ-n@sJtJ-jU_x SIL I tJI-pJerJ mJir U-Ze nJI bu-dJIt t@_o-kJim SIL k@_o-kJim on bi_xl ft_SJI-ra 64 | p@_o SJt_SJasJtJ-jU_x SIL pa-d@-lJI dJI-rJevJ-j@ not_SJ-jU_x SIL t_SJa-s@ f t_SJI-ti_x-rJI SIL v bJI-zlJudJ-jI 65 | sam t@_o-SJo_i SIL @_o mJe-st@ za-nJ@l SIL kag ba-b@ @_ot-kor-mlJI-n_n@-j@ 66 | nJI-vaZ-n@ 67 | k@_og-da mi_x vi_x-jI-x@-lJI SIL b@_r-la U-Ze not_SJ SIL I d@_o-ro-g@ lJI-Za-l@ t_SJI-rJIzJ dJef-stvJI-n_n@_r_i lJes s vJI-k@_o-vi_x-mJI so-sn@-mJI SIL lJist-vJI-n_nJI-t_s@_i I jelJ-jU_x gJI-gant-skJIx r@_o-zmJe-r@v 68 | @_o-st@_o-va-l@sJ ro-vn@-j@ SIL sp@_o-ko_i-n@-j@ I nJI-z@_o-muZ-nJ@-j@ 69 | e-t@ sU-ka v @_o-t_sen-kJI mU_x-Z@_r-kof I ix dU-rat_s-kJIx ka-t_SJIstf p@_ot_SJ-tJi fsJIg-da b@_r-va-jIt pr@_o-va 70 | im e-t@ nJI mJI-Sa-l@ 71 | no jI-SJo pi_x-r@m SIL d@_r-rJa-v@_rm b@_o-tJin-k@m on I tut U-spJel vrJe-z@tJ sJIr-Zan-tU_x mJeS nog SIL I e-t@ to-Z@_r s@_o SJI-stlJi-v@_o_i mi_xslJ-jU_x 72 | n@-vJI-va-jIt mi_x-slJI @_o st@_o-bJilJ-n@-sJtJI I p@-st@_o-jan-stvJI 73 | p@_o-sli_x-S@-lJIzJ Zen-skJI-jI fsxlJi-p@_r-v@-nJI-j@ 74 | @_o tak kag dJe-j@-tJIlJ-n@sJtJ jI-vo pr@-Is-x@_o-dJi-l@ srJI-dJi o-t_SJInJ m@-l@_o-di_xx lJU-dJe_i SIL prJI-nJI-maf-S@_rx jI-vo bJI-zgr@_o-nJit_SJ-nU_x-jU_x s@-m@-U-vJe-rJI-n_n@zJdJ z@_o 
glU_x-b@-k@_o-mi_x-slJI-jI I mu-dr@sJtJ SIL to b@lJ-S@_rn-stvo p@tJ-t_SJI-nJa-l@sJ jI-mu SIL I on @_r-mJel b@_olJ-So_i U-spJex v rJI-v@-lJU_x-t_s@_r-o-n_n@_rx krU-gax 75 | st@_o-ra-j@sJ nJI zbJitJ-sJ@ s op-SJI-g@ Sa-g@ SIL on sno-v@ prJIt-sta-vJIl sJI-bJe s@_r-n@_o-vJe_i I mi_x-slJI-n_n@ @_o-br@_o-tJil-sJ@ k nJim s pr@-d@_ol-Ze-nJI-jIm sv@-jI-vo r@_o-ska-z@ 76 | op-SJIst-v@ SIL f k@_o-to-r@m vlast-v@-v@-l@ p@_or-tJi_i-n@-j@ n@-mJIn-kl@_o-tu-r@ SIL n@_osk-vosJ pr@_o-pJi-t@-n_n@-j@ dog-m@-mJI SIL nJI-I-zlJI-t_SJi-m@ b@_olJ-na-j@ U_x-t@_o-pJi-t_SJI-sk@_o_i I-dJI-@_o-lo-gJI-jI_i 77 | e-t@-v@ ZI-la-jIt mo-j@ v@_r-so-k@-j@ p@-vJI-lJi-tJIlJ-nJI-t_s@ 78 | n@-st@_o-ja-SJI_i gr@_o-mJi-l@ s u-skJIm lbom SIL s l@_ox-ma-t@_r-mJI br@_o-vJa-mJI n@_od blJi-sk@ svJI-dJo-n_n@_r-mJI t_SJor-n@_r-mJI gl@_o-za-mJI SIL @_o dlJi-n_n@_r-jI rU-kJi SIL slo-vn@ klJIS-nJi SIL SIL @_od-no_i l@_o-donJ-jU_x fsJu m@_o-ju spJi-nU_x prJI-kro-jU_xt 79 | tJe t_se-n_n@_r-jI rJI-zUlJ-ta-t@_r SIL @_o k@_o-to-r@_rx mi_x g@-v@_o-rJi-lJI SIL SIL I-tog dlJi-tJIlJ-n@-g@ I n@-prJ@-Zo-n_n@-g@ trU-da 80 | @_os-v@-b@Z-dJo-n_n@_r_i @_ot nJI-@_op-x@_o-dJi-m@-sJtJI n@_o kaZ-d@m Sa-gU_x d@_o-ka-z@_r-v@tJ sv@_o-ju nJI-z@_o-vJi-sJI-m@sJtJ SIL fsJa-kJI_i dJe-l@l sv@-jo dJe-l@ sp@_o-ko_i-n@ SIL bJIz r@-zdr@_o-Ze-nJI-j@ 81 | sJI-godJ-nJ@ @_o-nJi bolJ-S@_r fsJI-vo b@_o-jatJ-sJ@ SIL Sto prJId-vor-n@-j@ znatJ I krup-n@_r-jI k@-pJI-t@_o-lJi-st@_r nJI z@-x@_o-tJat i_xx bratJ fsJIrJ-joz 82 | sk@_o-rJe-jI fsJI-vo on nJI U-mJe-jIt p@_o-nJatJ pr@-tJI-v@_o-rJe-t_SJI-jI mJeZ-dU_x @_ot-vJet-stvJI-n_n@sJtJ-jU_x ix mJi-s_sJI-I I tJe-mJI jI-d@_o-vJi-t@_r-mJI x@-r@k-tJI-rJi-sJtJI-k@-mJI SIL k@_o-to-r@_r-mJI n@-dJI-lJa-jIt i_xx lJe-nJIn 83 | nU SIL vot SIL mi_x I pr@_o-vJe-rJIm SIL mJi-l@-j@ trU-sJi-x@ SIL n@_o-skolJ-k@ @_o-pr@_ov-da-jU_xtJ-sJ@ va-S@_r stra-xJI 84 | ZenJ-SJI-n@ @_o-bra-d@-v@-l@sJ r@-zg@_o-vo-rU_x SIL I s@_o-ma fsJe r@-sk@_o-za-l@ SIL l@_o-ska-j@ jI-vo u-sk@-jI lJI-t_so S@_rl-k@_o-vJi-st@_r-mJI sv@_o-i-mJI gl@_o-za-mJI 85 
| r@_o-sJi-I xva-tJIt n@_o fsJex 86 | @_o gdJe-t@ SIL vot-k@ SIL gdJe-t@ SIL s@-m@_o-gon SIL gdJe-t@ SIL t_SJIr-nJi-l@ @_o-b@_rk-n@_o-vJe-n_n@_r-jI 87 | no @_o-nJi nJI lJU_x-b@_o-pi_xt-n@_r 88 | rJI-zU_xlJ-t@_o-tJi-vn@sJtJ 89 | I fsJIg-da mi_xslJ box s@_o mno-jU_x 90 | jI-fJim @_rg-n@_o-tJit_SJ tolJ-k@ mJI-ga-jIt SIL @_o d@_o g@_o-rJa-t_SJIx pJI-r@_oS-kof nJI d@_o-tra-gJI-v@-jItJ-sJ@ 91 | U tJo-tU_xS-kJI ma-rJI @_o-lJIk-sJe-jI-vn@_r @_o-na pr@-Z@_r-la nJI-dol-g@ 92 | e-t@ p@-st@_o-ja-n_n@-j@ nJI-@_o-bi_xt_SJ-n@zJdJ dlJ@_o nas SIL vJI-r@_o-jat-n@ SIL @_o-sno-v@-n@ n@_o Im-m@_o-nJent-n@_o_i I-l_lJu-zJI-I k@-U-zalJ-n@-sJtJI vrJe-mJI-n_n@_o_i @_or-g@-nJI-za-t_s@-I psJI-xJi-t_SJI-sk@-g@ 93 | si_xr SIL ma-sl@ SIL ko-Z@ SIL mJod SIL lJes I SIL d@_o-lo_i fa-brJI-kJI 94 | @_od-no vrJe-mJ@ SIL n@-prJI-mJer SIL U_x-g@_o-va-rJI-v@-lJI el 95 | sto-j@ U p@_od-no-Z@_r-j@ m@-jI-ka SIL na-d@ v@_r-s@_o-ko z@-dJI-radJ go-l@-vU_x SIL Sto-b@_r U-vJi-dJItJ jI-vo vJIr-Si_x-nU_x SIL I tolJ-k@ t@_og-da p@-sJtJI-ga-jIS fsJe vJI-lJi-t_SJI-jI s@-@_o-rU-Ze-nJI-j@ 96 | bolJ-S@_r nJI-t_SJI-go nJI g@-v@_o-rJit SIL g@-v@_o-rJit tolJ-k@ SIL Sto tam o-t_SJInJ plo-x@ 97 | tag Ze kak v n@_o-t_SJa-lJI pro-Sl@-g@ vJe-k@ ko-n_n@_r-jI skat_SJ-kJI SIL @_o v n@_o-t_SJa-lJI p@-z@_o-pro-Sl@-g@ SIL strJIlJ-ba Iz lu-k@ 98 | @_o-fon-sk@_o_i g@_o-ri_x 99 | kak nJI znatJ @_on-drJe-j@ mJI-x@_o_i-li_x-t_SJ@ 100 | sJtJI-p@n tJI-m@_o-xJin 101 | p@_o-slJedJ-nJI-jI m@-jo svJI-da-nJI-jI z go-g@-lJIm bi_x-l@ f pJI-tJIr-bur-gJI SIL k@_og-da on @_o-st@_o-na-vlJI-v@l-sJ@ v zJim-nJIm dv@_or-t_se SIL U ZU-ko-fsk@-g@ 102 | mi_x U-Ze U-zna-lJI SIL Sto on s@-bJI-ral-sJ@ pr@_o-t_SJesJtJ nam no-v@-jI sv@-jo pr@-Iz-vJI-dJe-nJI-jI SIL no prJI-stU-pJitJ g dJe-lU_x bi_x-l@ nJI lJIx-ko 103 | go-g@lJ kak nJI f t_SJem nJI b@_r-va-l@ x@_o-dJil p@_o kom-n@-tJI SIL d@-br@_o-duS-n@ p@_ot-smJe-I-v@l-sJ@ n@_odJ nJe-k@-t@-r@_r-mJI op-SJI-mJI zn@_o-ko-m@_r-mJI SIL @_o @_o t_SJtJe-nJI-I I p@_o-mJi-nU_x nJI bi_x-l@ 104 | da-Z@_r ras on n@-mJIk-nul 
SIL Sto moZ-n@ @_o-tl@_o-Zi_xdJ z@-sJI-da-nJI-jI 105 | on p@-d@-Sol g go-g@-lJU_x zza-dJI SIL @_o-SJu-p@l k@_or-ma-n@_r jI-vo fra-k@ SIL vi_x-t@-SJIl @_o-t_tu-d@ tJI-tratJ p@_ot_SJ-to-v@_o_i bU-ma-gJI v @_o-sJmuS-kU_x 106 | go-g@lJ sJIr-dJi-t@ vi_xx-v@-tJIl tJI-trat-kU_x SIL sJel mrat_SJ-n@ n@_o dJI-van I totJ-t_SJaz Ze n@_o-t_SJal t_SJI-tatJ prJI fsJI-op-SJIm m@_ol-t_SJa-nJI-I 107 | on t_SJI-tal bJIs pJI-rJI-ri_x-v@ d@_o tJex por SIL p@_o-ka I-st@_o-SJil-sJ@ vJesJ jI-vo go-l@s I z@-rJI-bJi-l@ v gl@_o-zax 108 | mi_x U-zna-lJI t@_o-kJim o-br@-z@m pJer-v@_r-jI t_SJI-ti_x-rJI gl@_o-vi_x mJort-v@_rx duS 109 | op-SJI_i smJex ma-l@ p@-r@_o-zJil go-g@-lJ@ SIL no Iz-jI-vlJe-nJI-jI nJI-lJI-t_sI-mJer-n@-g@ v@_o-stor-g@ SIL k@_o-to-r@-jI vJi-dJI-m@ bi_x-l@ n@_o fsJex lJi-t_s@x pot k@_o-nJet_s t_SJtJe-nJI-j@ SIL jI-vo tro-nU_x-l@ 110 | on bi_xl d@_o-vo-lJIn 111 | kto-t@ sk@_o-zal SIL Sto prJI-vJet-stvJI-jI sJI-lJI-fa-n@ b@_o-so_i dJe-v@t_SJ-kJI SIL k@_o-to-rU_x-jU_x on s@_o-Za-jIt n@_o k@_o-zli_x vmJe-st@ pr@-v@dJ-nJI-ka @_ot k@_o-ro-b@t_SJ-kJI SIL nJI s@_of-sJem prJI-lJit_SJ-n@ 112 | fsJe @_o-st@_olJ-ni_x-jI slu-S@-tJI-lJI v@_o-s_sta-lJI pro-tJIf e-t@-v@ z@-mJI-t_SJa-nJI-j@ 113 | po-slJI t_SJtJe-nJI-j@ on z@_o-ku-t@l-sJ@ SIL p@_o @_o-b@_rk-n@_o-vJe-nJI-jU_x SIL f Su-bU_x d@_o sa-m@-v@ lba SIL sJel s@_o mno_i n@_o Iz-voSJ-t_SJI-k@ SIL I mi_x mol-t_SJ@ d@_o-je-x@-lJI d@_o zJim-nJI-g@ dv@_or-t_sa SIL gdJe ja jI-vo s_s@_o-dJil 114 | fsko-rJI p@_o-tom on @_o-pJatJ ISJ-t_SJes Is pJI-tJIr-bur-g@ 115 | go-g@lJ @_o-bra-d@-v@l-sJ@ na-S@_i no-v@_o_i fstrJe-t_SJI SIL r@_o-spra-S@_r-v@l SIL k@_o-kJim pU_x-tJom prJi-b@_rl ja v @_r-ta-lJI-jU_x 116 | jI-mu k@_o-za-l@sJ SIL Sto po-slJI I-ta-lJI-I p@_o-rJiS st@_o-no-vJItJ-sJ@ sux I bJIZ-Zi_x-zJnJIn SIL @_o zn@_o-t_SJe-nJI-jI I-ta-lJI-I br@_o-sa-jItJ-sJ@ s@_o-mo s@_o-bo_i v gla-z@ po-slJI p@_o-rJiS-sk@_o_i Zi_x-zJnJI I p@_o-rJiS-skJIx @_rnJ-tJI-rJe-s@v 117 | fp@_o-slJet-stvJI-I on t_SJa-st@ r@z-vJI-val e-tU_x mi_xslJ 118 | mJeZ-dU_x tJem 
vrJe-mJ@ bi_x-l@ @_o-bJe-dJI-n_n@-jI 119 | on p@-vJol mJI-nJa v @_rz-vJe-snU_x-jU_x I-st@_o-rJi-t_SJI-skU_x-jU_x @_o-fsJtJe-rJI-jU_x SIL gdJe z@_o dlJi-n_n@_r-mJI st@_o-la-mJI SIL S@_o-ga-j@ p@_o grJa-zn@-mU_x po-lU_x I U-sa-Z@_r-v@-j@sJ pro-st@ n@_o sk@_o-mJe_i-k@x SIL sJtJI-ka-jItJ-sJ@ k @_o-bJe-dJI-n_n@-mU_x t_SJa-sU_x r@-zn@-@_o-bra-zJnJI_i-S@-j@ pu-blJI-k@ 120 | e-t@ fsJe tod Ze rJis SIL b@_o-ra-S@_rk SIL ku-rJI-t_s@ SIL mJI-nJa-jItJ-sJ@ tolJ-k@ zJe-lJInJ p@_o vrJI-mJI-nam go-d@ 121 | pr@-st@_o-ta SIL @_op-SJI-Zi_x-tJIlJ-n@sJtJ I-t@_olJ-jan-sk@-j@ fsJI-vo bo-lJI-jI kJI-da-jU_xtJ-sJ@ tut v gla-z@ SIL z@-st@_o-vlJa-j@ prJItJ-t_SJust-v@-v@tJ sJI-bJa I v@_o fsJex drU-gJix sfJe-r@x Zi_x-zJnJI 122 | go-g@lJ p@-r@_o-zJil mJI-nJa SIL @_od-na-k@ SIL k@_o-prJi-zn@_rm SIL vz@_r-ska-tJIlJ-n@_rm @_o-br@_o-SJe-nJI-jIm sv@_o-im s prJI-sluZ-nJI-k@m 123 | ra-z@ dva mJI-nJal on blJu-d@ rJi-s@ SIL n@-x@_o-dJa jI-vo to pJI-rJI-va-rJI-n_n@_rm SIL to nJI-d@_o-va-rJI-n_n@_rm SIL I fsJa-kJI_i ras prJI-sluZ-nJIk pJI-rJI-mJI-nJal blJu-d@ z d@-br@_o-duS-n@_o_i U-li_xp-k@_o_i 124 | p@-lU-t_SJif n@-k@_o-nJet_s t@_o-rJel-kU_x rJi-s@ p@_o sv@-jI-mu fku-sU_x SIL go-g@lJ prJI-stU-pJilk nJe_i s nJI-@_o-b@_r-t_SJa_i-n@-jU_x alt_SJ-n@sJtJ-jU_x SIL n@-kl@_o-nJasJ tak SIL Sto dlJi-n_n@_r-jI vo-l@-s@_r jI-vo U-pa-lJI n@_o sa-m@-jI blJu-d@ SIL I p@-gl@_o-SJa-j@ loS-kU_x z@_o loS-k@_o_i s@_o strasJtJ-jU_x I b@_r-str@_o-to_i SIL k@_o-kJi-mJI SIL g@-v@_o-rJat SIL @_o-b@_rk-n@_o-vJe-n_n@ @_o-tlJI-t_SJa-jU_xtJ-sJ@ z@_o st@_o-lom lJu-dJI SIL r@-sp@_o-lo-Z@_r-n_n@_r-jI k @_r-p@_o-xon-drJI-I 125 | f sJI-rJI-dJi-nJI @_o-bJe-d@ k nam p@_otJ-sJel d@_o-volJ-n@ plot-n@_r_i mUS-t_SJi-n@ SIL s kr@_o-sJi-v@_o_i SIL kru-gl@_o_i b@_o-rot-k@_o_i 126 | @_o-p@-r@_oZ-nJif sv@-jo blJu-d@ SIL go-g@lJ @_ot-kJi-nU_xl-sJ@ n@_o-zad SIL zJdJe-l@l-sJ@ vJe-sJIl SIL r@-zg@_o-vor-t_SJIf I n@_o-t_SJal SU-tJitJ s prJI-sluZ-nJI-k@m SIL jI-SJo tak nJI-da-vn@ 127 | @_o-s@_r-pa-jI-m@_rm stro-gJI-mJI vi_x-g@-v@-r@-mJI I 
U_x-k@_o-rJi-zn@-mJI 128 | p@_o @_o-k@_onJ-t_SJa-nJI-I r@SJ-t_SJo-t@ z@_o @_o-bJed go-g@lJ @_o-sta-vJIl prJI-sluZ-nJI-kU_x SIL kak I fsJe drU-gJi-jI p@-sJI-tJi-tJI-lJI SIL dva b@_o_i-o-k@ SIL @_o k@_og-da ja s@_o sv@_o-je_i st@-r@_o-ni_x t_SJto-t@ pJI-rJI-l@_o-Zi_xl pro-tJIf e-t@_o_i skud-n@_o_i sum-m@_r SIL on @_o-st@-n@_o-vJil mJI-nJa z@-mJI-t_SJa-nJI-jIm 129 | Iz-vJe-sn@ SIL Sto Z@_r-tJe_i-sk@_o_i mu-dr@-sJtJI v nJem bi_x-l@ p@_ot_SJ-tJi stolJ-k@ Ze SIL skolJ-k@ I t@_o-lan-t@ 130 | on bi_xl f sv@_o-je_i t@_o-rJel-kJI I mog SIL Sto jI-mu nuZ-n@ bi_x-l@ I-lJI Sto sto-I-l@ e-t@-v@ SIL pol-n@_o_i rU-ko_i SIL nJI d@_o-va-j@ sam nJI-t_SJI-go 131 | ja nJI-k@_og-da nJI x@_o-t_SJu @_o-bJe-d@tJ 132 | mnJe tak x@-r@_o-So v@_o dv@_o-rJe I-gratJ 133 | ja fsJu Zi_xzJnJ b@_r v@_o dv@_o-rJe I-gral 134 | I nJI-k@_og-da nJI @_o-bJe-d@l b@_r 135 | ja s@_of-sJem nJI lJU-blJu borSJ s k@_o-pu-st@_o_i 136 | I v@_op-SJe ja sup nJI lJU-blJu 137 | I ka-SU_x ja nJI lJU-blJu 138 | I k@_o-tlJe-t@_r to-Z@_r nJI o-t_SJInJ lJU-blJu 139 | ja lJU-blJu @_o-brJI-ko-s@_r 140 | vi_x je-lJI @_o-brJI-ko-s@_r 141 | ja tak lJU-blJu @_o-brJI-ko-s@_r 142 | no vot ma-m@ z@-vJot mJI-nJa jezJdJ borSJ SIL mnJe prJI-xo-dJItJ-sJ@ fsJo br@_o-satJ 143 | mo_i brad b@_o-ba lJu-bJId borSJ 144 | on smJI-jotJ-sJ@ SIL k@_og-da jezd borSJ SIL @_o ja mor-SJU_xsJ 145 | on v@_op-SJe fsJIg-da smJI-jotJ-sJ@ I ti_x-t_SJIt sJI-bJe loS-k@_o_i v nos vmJe-st@ rta SIL p@-t@_o-mu Sto jI-mu trJi go-d@ 146 | nJet SIL borSJ ja m@_o-gu sjesJtJ 147 | I k@_o-tlJe-t@_r ja to-Z@_r sjI-da-jU_x 148 | vJI-n@_o-gra-t_t@ ja jem s U_x-d@_o-volJ-stvJI-jIm 149 | t@_og-da I s@_o-Za-jU_xt mJI-nJa z@_o r@_o-jalJ 150 | p@_o-Za-lU_i SIL ja sjel b@_r jI-SJo raz borSJ 151 | tolJ-k@ b@_r nJI I-gratJ n@_o r@_o-ja-lJI 152 | ja I-gra-jU_x SIL @_o brat sJI-dJit n@_o po-lU_x I smJI-jotJ-sJ@ 153 | v rU-kax U nJI-vo z@-v@_od-na-j@ m@_o-Si_x-n@ 154 | on @_o-t@_or-val @_ot m@_o-Si_x-n@_r k@-lJo-s@ 155 | I k@_o-ta-jIt i_xx p@_o po-lU_x 156 | I e-t@ jI-mu o-t_SJInJ 
nra-vJItJ-sJ@ 157 | nJIk-to jI-mu nJI mJI-Sa-jIt 158 | nJI z@-st@_o-vlJa-jIt @_r-gratJ n@_o r@_o-ja-lJI 159 | I p@-t@_o-mu jI-mu o-t_SJInJ vJe-sJI-l@ 160 | pla-t_SJIt on o-t_SJInJ rJet-k@ 161 | k@_og-da U nJI-vo t_SJto-nJI-butJ @_otJ-nJI-ma-jU_xt 162 | I-lJI k@_og-da jI-vo strJI-gut 163 | on s@-vJIr-Se-n_n@ nJI lJu-bJIt strJit_SJ-sJ@ 164 | on tak I x@_o-dJil b@_r fsJu Zi_xzJnJ l@_ox-ma-t@_r_i 165 | n@_o e-t@ on nJI @_o-br@_o-SJa-jIt vnJI-ma-nJI-j@ 166 | v op-SJIm SIL jI-mu x@-r@_o-So SIL @_o mnJe plo-x@ 167 | pa-p@ s ma-m@_o_i slu-S@-jU_xt SIL kak ja I-gra-jU_x 168 | brat k@_o-ta-jIt p@_o po-lU_x k@-lJo-sJI-kJI 169 | z@_o @_ok-nom krJI-t_SJat t_SJI-ti_x-rJI bra-t@ 170 | @_o-nJi krJI-t_SJat ra-zn@_r-mJI g@-l@_o-sa-mJI 171 | ja vJi-ZU_x v @_ok-no SIL @_o-nJi ma-SU_xt rU-ka-mJI 172 | @_o-nJi z@_o-vut mJI-nJa 173 | im @_odJ-nJim skut_SJ-n@ 174 | --------------------------------------------------------------------------------