├── src ├── tapkomet_stem │ ├── ukr.h │ ├── README.txt │ ├── Makefile │ ├── api.h │ ├── api.c │ ├── header.h │ ├── ukr_stem.c │ ├── stem_ukr.sbl │ ├── utilities.c │ └── ukr.c ├── vgrichina_stem.py └── tochytskyi_stem.py └── README.md /src/tapkomet_stem/ukr.h: -------------------------------------------------------------------------------- 1 | /* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ 2 | 3 | #ifdef __cplusplus 4 | extern "C" { 5 | #endif 6 | 7 | extern struct SN_env * ukr_create_env(void); 8 | extern void ukr_close_env(struct SN_env * z); 9 | 10 | extern int ukr_stem(struct SN_env * z); 11 | 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | 16 | -------------------------------------------------------------------------------- /src/tapkomet_stem/README.txt: -------------------------------------------------------------------------------- 1 | Porter stemmer for the Ukrainian language 2 | ----------------------------------------- 3 | 4 | Author: 5 | Roman Kobzar, 2017 6 | Source: 7 | https://github.com/Tapkomet/UAStemming 8 | License: 9 | BSD 3-Clause 10 | Build: 11 | make 12 | 13 | Converted from Snowball to C by Andrii Makukha. 
14 | 15 | Comparison of stemmers for Ukrainian: 16 | https://github.com/amakukha/stemmers_ukrainian 17 | 18 | -------------------------------------------------------------------------------- /src/tapkomet_stem/Makefile: -------------------------------------------------------------------------------- 1 | EXE=ukr_stem 2 | OBJ=api.o utilities.o ukr_stem.o 3 | 4 | all: $(EXE) $(OBJ) 5 | 6 | clean: 7 | @rm -f $(OBJ) ukr.o 8 | 9 | clean_all: clean 10 | @rm -f $(EXE) 11 | 12 | api.o: api.c api.h header.h Makefile 13 | @$(CC) -c api.c 14 | 15 | utilities.o: utilities.c header.h Makefile 16 | @$(CC) -c utilities.c 17 | 18 | ukr_stem.o: ukr_stem.c ukr.c ukr.h header.h Makefile 19 | @$(CC) -c ukr_stem.c 20 | 21 | $(EXE): $(OBJ) Makefile 22 | @echo Linking $(EXE) 23 | @$(CC) -o $(EXE) $(OBJ) 24 | -------------------------------------------------------------------------------- /src/tapkomet_stem/api.h: -------------------------------------------------------------------------------- 1 | 2 | typedef unsigned char symbol; 3 | 4 | /* Or replace 'char' above with 'short' for 16 bit characters. 5 | 6 | More precisely, replace 'char' with whatever type guarantees the 7 | character width you need. Note however that sizeof(symbol) should divide 8 | HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise 9 | there is an alignment problem. In the unlikely event of a problem here, 10 | consult Martin Porter. 
11 | 12 | */ 13 | 14 | struct SN_env { 15 | symbol * p; 16 | int c; int l; int lb; int bra; int ket; 17 | symbol * * S; 18 | int * I; 19 | }; 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | struct SN_env * SN_create_env(int S_size, int I_size); 26 | void SN_close_env(struct SN_env * z, int S_size); 27 | 28 | int SN_set_current(struct SN_env * z, int size, const symbol * s); 29 | 30 | #ifdef __cplusplus 31 | } 32 | #endif 33 | -------------------------------------------------------------------------------- /src/tapkomet_stem/api.c: -------------------------------------------------------------------------------- 1 | 2 | #include /* for calloc, free */ 3 | #include "header.h" 4 | 5 | struct SN_env * SN_create_env(int S_size, int I_size) 6 | { 7 | struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); 8 | if (z == NULL) return NULL; 9 | z->p = create_s(); 10 | if (z->p == NULL) goto error; 11 | if (S_size) 12 | { 13 | int i; 14 | z->S = (symbol * *) calloc(S_size, sizeof(symbol *)); 15 | if (z->S == NULL) goto error; 16 | 17 | for (i = 0; i < S_size; i++) 18 | { 19 | z->S[i] = create_s(); 20 | if (z->S[i] == NULL) goto error; 21 | } 22 | } 23 | 24 | if (I_size) 25 | { 26 | z->I = (int *) calloc(I_size, sizeof(int)); 27 | if (z->I == NULL) goto error; 28 | } 29 | 30 | return z; 31 | error: 32 | SN_close_env(z, S_size); 33 | return NULL; 34 | } 35 | 36 | void SN_close_env(struct SN_env * z, int S_size) 37 | { 38 | if (z == NULL) return; 39 | if (S_size) 40 | { 41 | int i; 42 | for (i = 0; i < S_size; i++) 43 | { 44 | lose_s(z->S[i]); 45 | } 46 | free(z->S); 47 | } 48 | free(z->I); 49 | if (z->p) lose_s(z->p); 50 | free(z); 51 | } 52 | 53 | int SN_set_current(struct SN_env * z, int size, const symbol * s) 54 | { 55 | int err = replace_s(z, 0, z->l, size, s, NULL); 56 | z->c = 0; 57 | return err; 58 | } 59 | -------------------------------------------------------------------------------- /src/tapkomet_stem/header.h: 
-------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "api.h" 5 | 6 | #define MAXINT INT_MAX 7 | #define MININT INT_MIN 8 | 9 | #define HEAD 2*sizeof(int) 10 | 11 | #define SIZE(p) ((int *)(p))[-1] 12 | #define SET_SIZE(p, n) ((int *)(p))[-1] = n 13 | #define CAPACITY(p) ((int *)(p))[-2] 14 | 15 | struct among 16 | { int s_size; /* number of chars in string */ 17 | const symbol * s; /* search string */ 18 | int substring_i;/* index to longest matching substring */ 19 | int result; /* result of the lookup */ 20 | int (* function)(struct SN_env *); 21 | }; 22 | 23 | symbol * create_s(void); 24 | void lose_s(symbol * p); 25 | 26 | int skip_utf8(const symbol * p, int c, int lb, int l, int n); 27 | 28 | int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 29 | int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 30 | int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 31 | int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 32 | 33 | int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 34 | int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 35 | int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 36 | int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); 37 | 38 | int eq_s(struct SN_env * z, int s_size, const symbol * s); 39 | int eq_s_b(struct SN_env * z, int s_size, const symbol * s); 40 | int eq_v(struct SN_env * z, const symbol * p); 41 | int eq_v_b(struct SN_env * z, const symbol * p); 42 | 43 | int find_among(struct SN_env * z, const struct among * v, int v_size); 44 | int find_among_b(struct SN_env * z, const struct among * v, int v_size); 45 | 46 | int replace_s(struct SN_env * z, 
int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); 47 | int slice_from_s(struct SN_env * z, int s_size, const symbol * s); 48 | int slice_from_v(struct SN_env * z, const symbol * p); 49 | int slice_del(struct SN_env * z); 50 | 51 | int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); 52 | int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); 53 | 54 | symbol * slice_to(struct SN_env * z, symbol * p); 55 | symbol * assign_to(struct SN_env * z, symbol * p); 56 | 57 | int len_utf8(const symbol * p); 58 | 59 | void debug(struct SN_env * z, int number, int line_count); 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stemmers for Ukrainian 2 | 3 | This repository introduces a new stemmer for the Ukrainian language (*tree_stem*) created via machine learning. It outperforms all other stemmers available to date as well as some lemmatizers by the error rate relative to truncation (ERRT) (Paice 1994). It also has the lowest percentage of understemming errors compared to the available stemming algorithms. 4 | 5 | The proposed algorithm does not use dictionary lookups while maintaining a reasonably small size (48 KB of Python bytecode). It works faster than lemmatization approach by a factor of x24, and outperforms other stemming algorithms in speed as well. 6 | 7 | In addition to the new algorithm, this repository also contains Python ports of some of the previously published stemmers. 
8 | 9 | Comparison of stemmers for the Ukrainian language 10 | -- 11 | 12 | | Stemmer | Languages | UI | OI | ERRT | 13 | | --- | :---: | :---: | :---: | :---: | 14 | | *Dictionary-based (reference)* | – | 0.0172 | 3.59e-06 | **0.0244** | 15 | | tree\_stem | [Python](https://github.com/amakukha/stemmers_ukrainian/blob/master/src/tree_stem.py) | **0.0907** | 2.71e-06 | **0.125** | 16 | | pymorphy2 ([Paper](https://link.springer.com/chapter/10.1007%2F978-3-319-26123-2_31)) | [Python](https://github.com/kmike/pymorphy2) | 0.324 | **2.01e-07** | **0.391** | 17 | | stemka | [C++](https://web.archive.org/web/20200630070845/http://www.keva.ru/stemka/stemka.html) | 0.329 | 2.34e-06 | **0.447** | 18 | | tapkomet | [Snowball](https://github.com/Tapkomet/UAStemming), [C](https://github.com/amakukha/stemmers_ukrainian/tree/master/src/tapkomet_stem), [Java](https://github.com/Tapkomet/UAStemming) | 0.447 | 2.73e-06 | **0.603** | 19 | | vgrichina | [Groovy](https://github.com/vgrichina/ukrainian-stemmer), [Python](https://github.com/amakukha/stemmers_ukrainian/blob/master/src/vgrichina_stem.py) | 0.497 | 1.05e-06 | **0.651** | 20 | | drupal | [JS](https://github.com/titarenko/ukrstemmer), [Python](https://github.com/Desklop/Uk_Stemmer) | 0.511 | 7.54e-07 | **0.666** | 21 | | tochytskyi ([Paper](http://ekmair.ukma.edu.ua/bitstream/handle/123456789/12541/Hlybovets_Tochytskyi_Alhorytm_tokenizatsii.pdf?sequence=1&isAllowed=y)) | [PHP](https://github.com/tochytskyi/ukrstemmer), [Python](https://github.com/amakukha/stemmers_ukrainian/blob/master/src/tochytskyi_stem.py) | 0.623 | 5.72e-07 | **0.795** | 22 | | *No stemming* | – | 1.00 | 1.69e-08 | – | 23 | 24 | where: 25 | 26 | - *UI* – understemming index 27 | - *OI* – overstemming index 28 | - *ERRT* – error rate relative to truncation 29 | 30 | Notes: 31 | 32 | - *pymorphy2* is a dictionary-assisted lemmatizer and morphological analyzer which was included into this comparison for reference. 
The most probable normal form is used as a stem. 33 | - training and testing was performed on a dictionary of word forms. 34 | 35 | References 36 | -- 37 | 38 | - Paice, C. (1994). [An Evaluation Method for Stemming Algorithms](https://web.archive.org/web/20060705163430id_/http://widit.slis.indiana.edu/irpub/SIGIR/1994/pdf5.pdf). *Proceedings of the 17th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval*, 42-50. 39 | 40 | -------------------------------------------------------------------------------- /src/vgrichina_stem.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Stemmer for the Ukrainian language. 4 | # Author: Vladimir Grichina (vgrichina), 2013, MIT licence. 5 | # https://github.com/vgrichina/ukrainian-stemmer 6 | # Ported from Groovy to Python by Andrii Makukha. 7 | 8 | STRESS = ' ́'.lstrip() 9 | 10 | wends = """ 11 | а ам ами ах та 12 | в вав вавсь вався вала валась валася вали вались валися вало валось валося вати ватись ватися всь вся 13 | е еві ем ею 14 | є ємо ємось ємося ється єте єтесь єтеся єш єшся єю 15 | и ив ий ила или ило илося им ими имо имось имося ите итесь итеся ити ить иться их иш ишся 16 | й ймо ймось ймося йсь йся йте йтесь йтеся 17 | і ів ій ім імо ість істю іть 18 | ї 19 | ла лась лася ло лось лося ли лись лися 20 | о ові овував овувала овувати ого ої ок ом ому осте ості очка очкам очками очках очки очків очкові очком очку очок ою 21 | ти тись тися 22 | у ував увала увати 23 | ь 24 | ці 25 | ю юст юсь юся ють ються 26 | я ям ями ях 27 | """.strip().split() 28 | 29 | # WAT ? 
# к ка кам ками ках ки кою ку
# ні ню ня ням нями нях

# Longest endings first, so that the "simple trim" step of stem() tries the
# longest ending before any of its shorter suffixes.
wends.sort(key=lambda x: -len(x))

# endings in unchangeable words

skip_ends = """
ер
ск
""".strip().split()

skip_ends.sort(key=lambda x: -len(x))

# endings are changing
change_endings = {
    "аче" : "ак",
    "іче" : "ік",
    "йовував" : "йов", "йовувала" : "йов", "йовувати" : "йов",
    "ьовував" : "ьов", "ьовувала" : "ьов", "ьовувати" : "ьов",
    "цьовував" : "ц", "цьовувала" : "ц", "цьовувати" : "ц",
    "ядер" : "ядр",
}

# (ending, replacement) pairs of change_endings, longest ending first.
# Some endings are suffixes of others ("ьовував" of "цьовував"), and the
# longer one has to win. Sorting once here, by ending length, makes that
# explicit instead of re-sorting by replacement text on every stem() call.
_CHANGE_ENDINGS_LONGEST_FIRST = sorted(
    change_endings.items(), key=lambda item: -len(item[0]))

# words to skip
stable_exclusions = set("""
баядер беатріче
віче
наче неначе
одначе
паче
""".strip().split())

# words to replace
exclusions = {
    "відер" : "відр",
    "був" : "бува",
}


def stem(word):
    """Return the stem of a single word.

    The rules are applied in this order, and the first one that fires wins:

    1. the stress mark (STRESS) is removed;
    2. words of one or two characters are returned unchanged;
    3. words listed in ``stable_exclusions`` are returned unchanged;
    4. words listed in ``exclusions`` are replaced by their mapped stem;
    5. an ending from ``change_endings`` is replaced by its substitute;
    6. words with an ending from ``skip_ends`` are returned unchanged;
    7. the longest ending from ``wends`` whose removal leaves more than two
       characters is cut off.

    If nothing applies, the (stress-free) word is returned as is.
    """
    # normalize word
    word = word.replace(STRESS, '')   # remove stress symbol

    # don't change short words
    if len(word) <= 2:
        return word

    # check for unchanged exclusions
    if word in stable_exclusions:
        return word

    # check for replace exclusions
    if word in exclusions:
        return exclusions[word]

    # changing endings (longest ending first)
    for eow, rep in _CHANGE_ENDINGS_LONGEST_FIRST:
        if eow and word.endswith(eow):
            return word[:-len(eow)] + rep

    # match for stable endings
    for eow in skip_ends:
        if word.endswith(eow):
            return word

    # try simple trim; an ending that would leave too short a stem is
    # skipped and the next (shorter) ending is tried
    for eow in wends:
        if eow and word.endswith(eow):
            trimmed = word[:-len(eow)]
            if len(trimmed) > 2:
                return trimmed

    return word

if __name__=='__main__':
    # Filter: each stdin line holds ';'-separated words; print their stems
    # in the same layout.
    import sys
    for line in sys.stdin:
        arr = line.rstrip().split(';')
        arr = map(stem, arr)
        print (';'.join(arr))
#!/usr/bin/env python3

# src/tochytskyi_stem.py
# Stemmer for the Ukrainian language.
# Authors: Andrii Glybovets, Volodymyr Tochytskyi (2016)
# https://github.com/tochytskyi/ukrstemmer
# Ported from PHP to Python by Andrii Makukha.

import re

class UkrStemmer:
    """Rule-based stemmer: strips suffixes from the part of the word that
    follows its first vowel (see RVRE)."""

    # http://uk.wikipedia.org/wiki/Голосний_звук */
    #VOWEL = re.compile('[аеиоуюяіїє]')
    INFINITIVE = re.compile(r'(ти|учи|ячи|вши|ши|ати|яти|ючи)$')
    PERFECTIVEGROUND = re.compile(r'((ив|ивши|ившись))$')
    # static $PERFECTIVEGROUND = '/((ив|ивши|ившись|ыв|ывши|ывшись((?<=[ая])(в|вши|вшись)))$/u';
    # http://uk.wikipedia.org/wiki/Рефлексивне_дієслово
    REFLEXIVE = re.compile(r'(с[яьи])$')
    #http://uk.wikipedia.org/wiki/Прикметник + http://wapedia.mobi/uk/Прикметник
    ADJECTIVE = re.compile(r'(ими|ій|ий|а|е|ова|ове|ів|є|їй|єє|еє|я|ім|ем|им|ім|их|іх|ою|йми|іми|у|ю|ого|ому|ої)$')
    #http://uk.wikipedia.org/wiki/Дієприкметник
    PARTICIPLE = re.compile(r'(ий|ого|ому|им|ім|а|ій|у|ою|ій|і|их|йми|их)$')
    #http://uk.wikipedia.org/wiki/Дієслово
    VERB = re.compile(r'(сь|ся|ив|ать|ять|у|ю|ав|али|учи|ячи|вши|ши|е|ме|ати|яти|є)$')
    #http://uk.wikipedia.org/wiki/Іменник
    NOUN = re.compile(r'(а|ев|ов|е|ями|ами|еи|и|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я|і|ові|ї|ею|єю|ою|є|еві|ем|єм|ів|їв|\'ю)$')
    # group 1: everything up to and including the first vowel; group 2: the rest
    RVRE = re.compile(r'^(.*?[аеиоуюяіїє])(.*)$')
    DERIVATIONAL = re.compile(r'[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+[аеиоуюяіїє].*сть?$')

    @staticmethod
    def stemWord(word: str) -> str:
        """Return the lower-cased stem of ``word``.

        Words with an infinitive ending (INFINITIVE), words without a vowel,
        and words whose first vowel is their last character are returned
        lower-cased but otherwise unchanged.
        """
        stem = word.lower()

        # check if infinitive
        # All patterns are lower-case, so the check has to run on the
        # normalised form; the normalised form is also what every other
        # path of this function returns.
        if UkrStemmer.INFINITIVE.search(stem):
            return stem

        # init
        p = UkrStemmer.RVRE.search(stem)
        if not p:
            return stem
        start = p.group(1)
        RV = p.group(2)
        if not start or not RV:
            return stem

        # STEP 1
        # NOTE(review): the indentation of this step was reconstructed from a
        # whitespace-collapsed listing. The VERB/NOUN attempt is assumed to
        # belong to the "adjective ending was removed" branch - confirm
        # against the upstream file.
        m = UkrStemmer.PERFECTIVEGROUND.sub('', RV)
        if m == RV:
            RV = UkrStemmer.REFLEXIVE.sub('', RV)
            m = UkrStemmer.ADJECTIVE.sub('', RV)
            if m == RV:
                RV = UkrStemmer.PARTICIPLE.sub('', RV)
            else:
                RV = m
                m = UkrStemmer.VERB.sub('', RV)
                if m == RV:
                    RV = UkrStemmer.NOUN.sub('', RV)
                else:
                    RV = m
        else:
            RV = m

        # STEP 2
        if RV.endswith('і'):
            RV = RV[:-1]

        # STEP 3
        if UkrStemmer.DERIVATIONAL.search(RV):
            RV = re.sub(r'ість?$', '', RV)

        # STEP 4
        m = re.sub(r'ь?$', '', RV)
        if m == RV:
            RV = re.sub(r'ейше?', '', RV)
            RV = re.sub(r'нн$', 'н', RV)
        else:
            RV = m

        stem = start + RV

        return stem

    @staticmethod
    def stemArray(tokens: list) -> list:
        """Stem every token and return the stems as a list, in input order."""
        return [UkrStemmer.stemWord(token) for token in tokens]

if __name__=='__main__':
    # Filter: each stdin line holds ';'-separated words; print their stems
    # in the same layout.
    import sys
    stem = UkrStemmer()
    for line in sys.stdin:
        arr = line.rstrip().split(';')
        arr = stem.stemArray(arr)
        print (';'.join(arr))
27 | 28 | */ 29 | 30 | static void stem_file(struct SN_env * z, FILE * f_in, FILE * f_out) { 31 | #define INC 10 32 | int lim = INC; 33 | symbol * b = (symbol *) malloc(lim * sizeof(symbol)); 34 | 35 | while(1) { 36 | int ch = getc(f_in); 37 | if (ch == EOF) { 38 | free(b); return; 39 | } 40 | { 41 | int i = 0; 42 | while(1) { 43 | if (ch == '\n' || ch == EOF) break; 44 | if (i == lim) { /* make b bigger */ 45 | symbol * q = (symbol *) malloc((lim + INC) * sizeof(symbol)); 46 | memmove(q, b, lim * sizeof(symbol)); 47 | free(b); b = q; 48 | lim = lim + INC; 49 | } 50 | b[i] = ch; i++; 51 | ch = getc(f_in); 52 | } 53 | 54 | SN_set_current(z, i, b); 55 | ukr_stem(z); 56 | { 57 | int j; 58 | for (j = 0; j < z->l; j++) fprintf(f_out, "%c", z->p[j]); 59 | fprintf(f_out, "\n"); 60 | } 61 | } 62 | } 63 | } 64 | 65 | static int eq(char * s1, char * s2) { 66 | int s1_len = strlen(s1); 67 | int s2_len = strlen(s2); 68 | return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0; 69 | } 70 | 71 | static void show_options(int n) { 72 | printf("options are: file [-o[utput] file] [-h[elp]]\n"); 73 | exit(n); 74 | } 75 | 76 | int main(int argc, char * argv[]) 77 | { char * in = 0; 78 | char * out = 0; 79 | { char * s; 80 | int i = 1; 81 | while(1) { 82 | if (i >= argc) break; 83 | s = argv[i++]; 84 | if (s[0] == '-') { 85 | 86 | if (eq(s, "-output") || eq(s, "-o")) { 87 | if (i >= argc) { 88 | fprintf(stderr, "%s requires an argument\n", s); 89 | exit(1); 90 | } 91 | out = argv[i++]; 92 | } else if (eq(s, "-help") || eq(s, "-h")) { 93 | show_options(0); 94 | } else { 95 | fprintf(stderr, "%s unknown\n", s); 96 | show_options(1); 97 | } 98 | } 99 | else in = s; 100 | } 101 | } 102 | 103 | /* initialise the stemming process: */ 104 | 105 | { 106 | struct SN_env * z = ukr_create_env(); 107 | FILE * f_in; 108 | FILE * f_out; 109 | f_in = in == 0 ? stdin : fopen(in, "r"); 110 | if (f_in == 0) { 111 | fprintf(stderr, "file %s not found\n", in); exit(1); 112 | } 113 | f_out = out == 0 ? 
stdout : fopen(out, "w"); 114 | if (f_out == 0) { 115 | fprintf(stderr, "file %s cannot be opened\n", out); exit(1); 116 | } 117 | stem_file(z, f_in, f_out); 118 | ukr_close_env(z); 119 | } 120 | 121 | return 0; 122 | } 123 | -------------------------------------------------------------------------------- /src/tapkomet_stem/stem_ukr.sbl: -------------------------------------------------------------------------------- 1 | /* Stemmer for the Ukrainian language 2 | * 3 | * Author: Roman Kobzar, 2017 4 | * https://github.com/Tapkomet/UAStemming 5 | * 3-Clause BSD licence 6 | */ 7 | 8 | stringescapes {} 9 | 10 | /* the 33 Ukrainian letters and apostrophe represented by single quote*/ 11 | 12 | stringdef a hex '430' 13 | stringdef b hex '431' 14 | stringdef v hex '432' 15 | stringdef gh hex '433' 16 | stringdef g hex '491' 17 | stringdef d hex '434' 18 | stringdef e hex '435' 19 | stringdef ye hex '454' 20 | stringdef zh hex '436' 21 | stringdef z hex '437' 22 | stringdef y hex '438' 23 | stringdef i hex '456' 24 | stringdef yi hex '457' 25 | stringdef i` hex '439' 26 | stringdef k hex '43A' 27 | stringdef l hex '43B' 28 | stringdef m hex '43C' 29 | stringdef n hex '43D' 30 | stringdef o hex '43E' 31 | stringdef p hex '43F' 32 | stringdef r hex '440' 33 | stringdef s hex '441' 34 | stringdef t hex '442' 35 | stringdef u hex '443' 36 | stringdef f hex '444' 37 | stringdef kh hex '445' 38 | stringdef ts hex '446' 39 | stringdef ch hex '447' 40 | stringdef sh hex '448' 41 | stringdef shch hex '449' 42 | stringdef soft hex '44C' 43 | stringdef iu hex '44E' 44 | stringdef ia hex '44F' 45 | stringdef apostrophe hex '27' 46 | 47 | routines ( exception1 48 | adjective 49 | postfix 50 | verb 51 | noun 52 | tidy_up 53 | ) 54 | 55 | externals ( stem ) 56 | 57 | define exception1 as ( 58 | 59 | [substring] atlimit among( 60 | 61 | '{z}{d}{o}{r}{o}{v}{apostrophe}{ia}' (<-'{z}{d}{o}{r}') 62 | /* invariant forms: */ 63 | '{k}{r}{i}{m}' 64 | '{a}{d}{zh}{e}' 65 | '{a}{t}{o}{m}' 66 | 
'{k}{r}{o}{k}' 67 | '{d}{e}{s}{soft}' 68 | '{v}{i}{s}{soft}' 69 | // ... extensions possible here ... 70 | ) 71 | ) 72 | 73 | 74 | backwardmode ( 75 | 76 | 77 | define adjective as ( 78 | 79 | [substring] among ( 80 | 81 | '{o}{v}{e}' '{o}{v}{a}' '{o}{v}{o}' '{o}{v}{y}{i`}' 82 | '{o}{v}{y}{m}' '{o}{v}{y}{kh}' '{o}{v}{o}{gh}{o}' 83 | '{o}{v}{o}{m}{u}' '{o}{v}{o}{iu}' '{o}{v}{o}{yi}' 84 | '{o}{v}{i}{i`}' '{y}{m}{y}' 85 | '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{yi}' '{e}{m}' '{i}{m}' 86 | '{y}{m}' '{o}{m}' '{o}{gh}{o}' '{e}{m}{u}' 87 | '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' 88 | '{o}{iu}' '{e}{iu}' 89 | (delete) 90 | ) 91 | ) 92 | 93 | 94 | define postfix as ( 95 | [substring] among ( 96 | '{s}{ia}' 97 | '{s}{soft}' 98 | (delete) 99 | ) 100 | ) 101 | 102 | define verb as ( 103 | [substring] among ( 104 | '{sh}{o}{v}' '{sh}{l}{a}' '{sh}{l}{y}' '{sh}{l}{o}' (<-'{t}') 105 | '{u}{iu}{t}{soft}' '{a}{iu}{t}{soft}' '{i}{iu}{t}{soft}' '{y}{v}{sh}{y}' 106 | '{a}{n}{y}{m}{y}' '{a}{n}{y}{kh}' '{a}{n}{i}' '{y}{v}' 107 | '{i}{l}{a}' '{y}{l}{a}' '{y}{l}{y}' '{y}{t}{a}' '{e}{n}{a}' 108 | '{u}{i`}{t}{e}' '{u}{v}{a}{l}{y}' '{u}{v}{a}{l}{o}' 109 | '{u}{v}{a}{l}{a}' '{u}{v}{a}{v}' '{u}{v}{a}{t}{y}' '{u}{v}{a}{n}{n}{ia}' 110 | '{a}{l}{y}' '{a}{l}{o}' '{a}{l}{a}' '{a}{v}' '{y}{l}{i}' 111 | '{u}{i`}' '{a}{ye}' '{u}{ye}' '{i}{ye}' '{y}{ye}' '{ia}{ye}' 112 | '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{iu}{ye}' 113 | '{i}{t}{soft}' '{y}{t}{soft}' 114 | 115 | (delete) 116 | ) 117 | ) 118 | 119 | define noun as ( 120 | [substring] among ( 121 | '{ia}{t}{a}' '{ia}{t}{a}{m}' '{ia}{t}{a}{m}{y}' 122 | ('{n}' delete) 123 | '{o}{k}' (<-'{k}') 124 | '{o}{yi}{v}' '{o}{ia}{kh}' '{o}{ye}{m}' '{o}{ia}' 125 | '{i}{ia}{kh}' 126 | '{a}' '{yi}{v}' '{o}{v}{i}' '{o}{v}' '{i}{v}' '{e}' 127 | '{ia}{t}' '{i}{l}' '{y}{l}' '{e}{n}' '{u}{iu}' 128 | '{i}{ia}{m}{y}' '{ia}{m}{y}' '{a}{m}{y}' 129 | '{i}{yi}' '{i}{ye}{iu}' 130 | '{i}' '{e}{i`}' '{e}{yi}' '{o}{i`}' '{i}{i`}' '{i`}' 131 | '{i}{ia}{m}' '{ia}{m}' 
'{e}{m}' '{a}{m}' '{o}{m}' 132 | '{o}' '{u}' '{a}{kh}' '{ia}{kh}' '{y}' '{soft}' 133 | '{i}{iu}' '{iu}' '{i}{ia}' '{ia}' 134 | (delete) 135 | ) 136 | ) 137 | 138 | 139 | define tidy_up as ( 140 | [substring] among ( 141 | '{n}' 142 | ('{n}' delete) 143 | '{t}' 144 | ('{t}' delete) //double suffixes 145 | '{soft}' 146 | '{i}{ch}{n}' 147 | '{i}{i`}{s}{soft}{k}' 148 | '{s}{soft}{k}' 149 | '{soft}{n}' 150 | '{e}{n}' '{a}{n}' '{y}{t}' '{iu}{iu}{t}' 151 | '{u}{ye}{t}' '{a}{ye}{t}' '{iu}{ye}{t}' '{ia}{ye}{t}' 152 | '{apostrophe}' 153 | (delete) 154 | '{o}{s}{t}' (<-'{i}{s}{t}') 155 | ) 156 | ) 157 | ) 158 | 159 | define stem as ( 160 | not hop 4 or ( 161 | exception1 or 162 | backwards( 163 | do ( 164 | try postfix 165 | adjective or verb or noun 166 | ) 167 | do tidy_up 168 | ) 169 | ) 170 | ) 171 | -------------------------------------------------------------------------------- /src/tapkomet_stem/utilities.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | #include "header.h" 7 | 8 | #define CREATE_SIZE 1 9 | 10 | symbol * create_s(void) { 11 | symbol * p; 12 | void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol)); 13 | if (mem == NULL) return NULL; 14 | p = (symbol *) (HEAD + (char *) mem); 15 | CAPACITY(p) = CREATE_SIZE; 16 | SET_SIZE(p, 0); 17 | return p; 18 | } 19 | 20 | void lose_s(symbol * p) { 21 | if (p == NULL) return; 22 | free((char *) p - HEAD); 23 | } 24 | 25 | /* 26 | new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c 27 | if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new 28 | position, or -1 on failure. 29 | 30 | -- used to implement hop and next in the utf8 case. 
31 | */ 32 | 33 | int skip_utf8(const symbol * p, int c, int lb, int l, int n) { 34 | int b; 35 | if (n >= 0) { 36 | for (; n > 0; n--) { 37 | if (c >= l) return -1; 38 | b = p[c++]; 39 | if (b >= 0xC0) { /* 1100 0000 */ 40 | while (c < l) { 41 | b = p[c]; 42 | if (b >= 0xC0 || b < 0x80) break; 43 | /* break unless b is 10------ */ 44 | c++; 45 | } 46 | } 47 | } 48 | } else { 49 | for (; n < 0; n++) { 50 | if (c <= lb) return -1; 51 | b = p[--c]; 52 | if (b >= 0x80) { /* 1000 0000 */ 53 | while (c > lb) { 54 | b = p[c]; 55 | if (b >= 0xC0) break; /* 1100 0000 */ 56 | c--; 57 | } 58 | } 59 | } 60 | } 61 | return c; 62 | } 63 | 64 | /* Code for character groupings: utf8 cases */ 65 | 66 | static int get_utf8(const symbol * p, int c, int l, int * slot) { 67 | int b0, b1, b2; 68 | if (c >= l) return 0; 69 | b0 = p[c++]; 70 | if (b0 < 0xC0 || c == l) { /* 1100 0000 */ 71 | *slot = b0; 72 | return 1; 73 | } 74 | b1 = p[c++] & 0x3F; 75 | if (b0 < 0xE0 || c == l) { /* 1110 0000 */ 76 | *slot = (b0 & 0x1F) << 6 | b1; 77 | return 2; 78 | } 79 | b2 = p[c++] & 0x3F; 80 | if (b0 < 0xF0 || c == l) { /* 1111 0000 */ 81 | *slot = (b0 & 0xF) << 12 | b1 << 6 | b2; 82 | return 3; 83 | } 84 | *slot = (b0 & 0xE) << 18 | b1 << 12 | b2 << 6 | (p[c] & 0x3F); 85 | return 4; 86 | } 87 | 88 | static int get_b_utf8(const symbol * p, int c, int lb, int * slot) { 89 | int a, b; 90 | if (c <= lb) return 0; 91 | b = p[--c]; 92 | if (b < 0x80 || c == lb) { /* 1000 0000 */ 93 | *slot = b; 94 | return 1; 95 | } 96 | a = b & 0x3F; 97 | b = p[--c]; 98 | if (b >= 0xC0 || c == lb) { /* 1100 0000 */ 99 | *slot = (b & 0x1F) << 6 | a; 100 | return 2; 101 | } 102 | a |= (b & 0x3F) << 6; 103 | b = p[--c]; 104 | if (b >= 0xE0 || c == lb) { /* 1110 0000 */ 105 | *slot = (b & 0xF) << 12 | a; 106 | return 3; 107 | } 108 | *slot = (p[--c] & 0xE) << 18 | (b & 0x3F) << 12 | a; 109 | return 4; 110 | } 111 | 112 | int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 113 | do { 
114 | int ch; 115 | int w = get_utf8(z->p, z->c, z->l, & ch); 116 | if (!w) return -1; 117 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 118 | return w; 119 | z->c += w; 120 | } while (repeat); 121 | return 0; 122 | } 123 | 124 | int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 125 | do { 126 | int ch; 127 | int w = get_b_utf8(z->p, z->c, z->lb, & ch); 128 | if (!w) return -1; 129 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 130 | return w; 131 | z->c -= w; 132 | } while (repeat); 133 | return 0; 134 | } 135 | 136 | int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 137 | do { 138 | int ch; 139 | int w = get_utf8(z->p, z->c, z->l, & ch); 140 | if (!w) return -1; 141 | if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) 142 | return w; 143 | z->c += w; 144 | } while (repeat); 145 | return 0; 146 | } 147 | 148 | int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 149 | do { 150 | int ch; 151 | int w = get_b_utf8(z->p, z->c, z->lb, & ch); 152 | if (!w) return -1; 153 | if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) 154 | return w; 155 | z->c -= w; 156 | } while (repeat); 157 | return 0; 158 | } 159 | 160 | /* Code for character groupings: non-utf8 cases */ 161 | 162 | int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 163 | do { 164 | int ch; 165 | if (z->c >= z->l) return -1; 166 | ch = z->p[z->c]; 167 | if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 168 | return 1; 169 | z->c++; 170 | } while (repeat); 171 | return 0; 172 | } 173 | 174 | int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 175 | do { 176 | int ch; 177 | if (z->c <= z->lb) return -1; 178 | ch = z->p[z->c - 1]; 179 | if (ch > max || (ch 
-= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) 180 | return 1; 181 | z->c--; 182 | } while (repeat); 183 | return 0; 184 | } 185 | 186 | /* out_grouping: forward test of the symbol at the cursor against the bitmap s, in which bit (ch - min) marks ch as a member for min <= ch <= max. Returns -1 if the cursor is already at z->l and 1 (cursor left in place) if the symbol is a member; otherwise the cursor steps over the symbol and 0 is returned, or, when repeat is non-zero, scanning continues until one of the first two outcomes occurs. */ int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 187 | do { 188 | int ch; 189 | if (z->c >= z->l) return -1; 190 | ch = z->p[z->c]; 191 | if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) 192 | return 1; 193 | z->c++; 194 | } while (repeat); 195 | return 0; 196 | } 197 | 198 | /* out_grouping_b: backward counterpart of out_grouping. It examines the symbol just before the cursor, uses z->lb as the bound, and moves the cursor downwards. Return values are the same. */ int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) { 199 | do { 200 | int ch; 201 | if (z->c <= z->lb) return -1; 202 | ch = z->p[z->c - 1]; 203 | if (!(ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)) 204 | return 1; 205 | z->c--; 206 | } while (repeat); 207 | return 0; 208 | } 209 | 210 | /* eq_s: returns 1 and advances z->c by s_size when the s_size symbols at the cursor equal s; returns 0 with the cursor unchanged when fewer than s_size symbols remain before z->l or the symbols differ. */ int eq_s(struct SN_env * z, int s_size, const symbol * s) { 211 | if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0; 212 | z->c += s_size; return 1; 213 | } 214 | 215 | /* eq_s_b: backward form of eq_s. Compares the s_size symbols that end at the cursor (bounded by z->lb) and, on a match, moves z->c back by s_size and returns 1; otherwise returns 0 with the cursor unchanged. */ int eq_s_b(struct SN_env * z, int s_size, const symbol * s) { 216 | if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0; 217 | z->c -= s_size; return 1; 218 | } 219 | 220 | /* eq_v: eq_s for a buffer p whose length is read with SIZE(p). */ int eq_v(struct SN_env * z, const symbol * p) { 221 | return eq_s(z, SIZE(p), p); 222 | } 223 | 224 | /* eq_v_b: eq_s_b for a buffer p whose length is read with SIZE(p). */ int eq_v_b(struct SN_env * z, const symbol * p) { 225 | return eq_s_b(z, SIZE(p), p); 226 | } 227 | 228 | /* find_among: searches the table v (v_size entries) for a key matching the symbols that start at the cursor. Phase 1 is a binary search in which common_i / common_j record how many leading symbols are already known to match at the low / high end, so each probe resumes from the smaller of the two; this presumes v is ordered by key - NOTE(review): the ordering is established where the tables are built, not here. Phase 2 starts at the entry found and follows substring_i links: the first entry whose whole key matched and whose function is null or returns non-zero yields its result, with z->c set just past the key. Returns 0 once the chain ends (substring_i < 0). */ int find_among(struct SN_env * z, const struct among * v, int v_size) { 229 | 230 | int i = 0; 231 | int j = v_size; 232 | 233 | int c = z->c; int l = z->l; 234 | const symbol * q = z->p + c; 235 | 236 | const struct among * w; 237 | 238 | int common_i = 0; 239 | int common_j = 0; 240 | 241 | int first_key_inspected = 0; 242 | 243 | while (1) { 244 | int k = i + ((j - i) >> 1); /* probe the midpoint between i and j */ 245 | int diff = 0; 246 | int common = common_i < common_j ? 
common_i : common_j; /* smaller */ 247 | w = v + k; 248 | { 249 | int i2; for (i2 = common; i2 < w->s_size; i2++) { 250 | if (c + common == l) { diff = -1; break; } /* input exhausted before the key: rank the text below this key */ 251 | diff = q[common] - w->s[i2]; 252 | if (diff != 0) break; 253 | common++; 254 | } 255 | } 256 | if (diff < 0) { 257 | j = k; 258 | common_j = common; 259 | } else { 260 | i = k; 261 | common_i = common; 262 | } 263 | if (j - i <= 1) { 264 | if (i > 0) break; /* v->s has been inspected */ 265 | if (j == i) break; /* only one item in v */ 266 | 267 | /* - but now we need to go round once more to get 268 | v->s inspected. This looks messy, but is actually 269 | the optimal approach. */ 270 | 271 | if (first_key_inspected) break; 272 | first_key_inspected = 1; 273 | } 274 | } 275 | /* second phase: try entry i, then each entry reached through substring_i */ while (1) { 276 | w = v + i; 277 | if (common_i >= w->s_size) { 278 | z->c = c + w->s_size; 279 | if (w->function == 0) return w->result; 280 | { 281 | int res = w->function(z); 282 | z->c = c + w->s_size; /* set the cursor again after the routine has run */ 283 | if (res) return w->result; 284 | } 285 | } 286 | i = w->substring_i; 287 | if (i < 0) return 0; 288 | } 289 | } 290 | 291 | /* find_among_b is for backwards processing. Same comments apply */ 292 | 293 | /* Keys are compared from their last symbol towards their first against the symbols that precede the cursor (q starts at z->p + c - 1 and is indexed downwards), z->lb is the bound, and a successful entry leaves z->c = c - s_size. */ int find_among_b(struct SN_env * z, const struct among * v, int v_size) { 294 | 295 | int i = 0; 296 | int j = v_size; 297 | 298 | int c = z->c; int lb = z->lb; 299 | const symbol * q = z->p + c - 1; 300 | 301 | const struct among * w; 302 | 303 | int common_i = 0; 304 | int common_j = 0; 305 | 306 | int first_key_inspected = 0; 307 | 308 | while (1) { 309 | int k = i + ((j - i) >> 1); 310 | int diff = 0; 311 | int common = common_i < common_j ? 
common_i : common_j; 312 | w = v + k; 313 | { 314 | int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) { 315 | if (c - common == lb) { diff = -1; break; } /* reached the backward limit before the key was exhausted */ 316 | diff = q[- common] - w->s[i2]; 317 | if (diff != 0) break; 318 | common++; 319 | } 320 | } 321 | if (diff < 0) { j = k; common_j = common; } 322 | else { i = k; common_i = common; } 323 | if (j - i <= 1) { 324 | if (i > 0) break; 325 | if (j == i) break; 326 | if (first_key_inspected) break; 327 | first_key_inspected = 1; 328 | } 329 | } 330 | /* second phase: try entry i, then each entry reached through substring_i */ while (1) { 331 | w = v + i; 332 | if (common_i >= w->s_size) { 333 | z->c = c - w->s_size; 334 | if (w->function == 0) return w->result; 335 | { 336 | int res = w->function(z); 337 | z->c = c - w->s_size; /* set the cursor again after the routine has run */ 338 | if (res) return w->result; 339 | } 340 | } 341 | i = w->substring_i; 342 | if (i < 0) return 0; 343 | } 344 | } 345 | 346 | 347 | /* Increase the size of the buffer pointed to by p to at least n symbols. 348 | * If insufficient memory, returns NULL and frees the old buffer. 349 | */ 350 | static symbol * increase_size(symbol * p, int n) { 351 | symbol * q; 352 | int new_size = n + 20; /* the request plus 20 symbols of headroom */ 353 | /* the allocation starts HEAD bytes before p, so the pointer is shifted back for realloc and forward again below */ void * mem = realloc((char *) p - HEAD, 354 | HEAD + (new_size + 1) * sizeof(symbol)); 355 | if (mem == NULL) { 356 | lose_s(p); 357 | return NULL; 358 | } 359 | q = (symbol *) (HEAD + (char *)mem); 360 | CAPACITY(q) = new_size; /* store the new capacity for the resized buffer */ 361 | return q; 362 | } 363 | 364 | /* to replace symbols between c_bra and c_ket in z->p by the 365 | s_size symbols at s. 366 | Returns 0 on success, -1 on error. 367 | Also, frees z->p (and sets it to NULL) on error. 
368 | */ 369 | int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr) 370 | { 371 | int adjustment; 372 | int len; 373 | if (z->p == NULL) { 374 | z->p = create_s(); /* no buffer yet: obtain one from create_s */ 375 | if (z->p == NULL) return -1; 376 | } 377 | adjustment = s_size - (c_ket - c_bra); /* change in total length caused by the replacement */ 378 | len = SIZE(z->p); 379 | if (adjustment != 0) { 380 | if (adjustment + len > CAPACITY(z->p)) { 381 | z->p = increase_size(z->p, adjustment + len); 382 | if (z->p == NULL) return -1; 383 | } 384 | /* slide everything from c_ket onwards to its new position */ memmove(z->p + c_ket + adjustment, 385 | z->p + c_ket, 386 | (len - c_ket) * sizeof(symbol)); 387 | SET_SIZE(z->p, adjustment + len); 388 | z->l += adjustment; 389 | /* a cursor at or beyond c_ket moves with the tail; one strictly inside the replaced range is pulled back to c_bra */ if (z->c >= c_ket) 390 | z->c += adjustment; 391 | else if (z->c > c_bra) 392 | z->c = c_bra; 393 | } 394 | if (s_size) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); /* copy the replacement into place */ 395 | if (adjptr != NULL) 396 | *adjptr = adjustment; /* report the length change to callers that asked for it */ 397 | return 0; 398 | } 399 | 400 | /* slice_check: validates the slice markers. Returns -1 unless 0 <= bra <= ket <= l, z->p is non-NULL and l does not exceed SIZE(z->p); returns 0 when all of these hold. */ static int slice_check(struct SN_env * z) { 401 | 402 | if (z->bra < 0 || 403 | z->bra > z->ket || 404 | z->ket > z->l || 405 | z->p == NULL || 406 | z->l > SIZE(z->p)) /* this line could be removed */ 407 | { 408 | #if 0 409 | fprintf(stderr, "faulty slice operation:\n"); 410 | debug(z, -1, 0); 411 | #endif 412 | return -1; 413 | } 414 | return 0; 415 | } 416 | 417 | /* slice_from_s: replaces the slice between z->bra and z->ket with the s_size symbols at s. Returns -1 if slice_check rejects the markers, otherwise whatever replace_s returns (0 on success, -1 on error). */ int slice_from_s(struct SN_env * z, int s_size, const symbol * s) { 418 | if (slice_check(z)) return -1; 419 | return replace_s(z, z->bra, z->ket, s_size, s, NULL); 420 | } 421 | 422 | /* slice_from_v: slice_from_s for a buffer p whose length is read with SIZE(p). */ int slice_from_v(struct SN_env * z, const symbol * p) { 423 | return slice_from_s(z, SIZE(p), p); 424 | } 425 | 426 | /* slice_del: deletes the current slice by replacing it with zero symbols. */ int slice_del(struct SN_env * z) { 427 | return slice_from_s(z, 0, 0); 428 | } 429 | 430 | /* insert_s: replaces the symbols between bra and ket with the s_size symbols at s via replace_s, then shifts z->bra and z->ket by the resulting length change when they lie at or after bra. Returns -1 if replace_s fails, 0 otherwise. */ int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) { 431 | int adjustment; 432 | if (replace_s(z, bra, ket, s_size, s, &adjustment)) 433 | return -1; 434 | if (bra <= z->bra) z->bra += adjustment; 435 | if (bra <= z->ket) z->ket += adjustment; 436 | return 0; 437 | } 438 | 439 | 
int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { /* insert_s for a buffer p whose length is read with SIZE(p) */ 440 | return insert_s(z, bra, ket, SIZE(p), p); 441 | } 442 | 443 | /* slice_to: copies the slice between z->bra and z->ket into p, growing p first when CAPACITY(p) is smaller than the slice, and sets p's size to the slice length. Returns the (possibly reallocated) buffer. Returns NULL after passing p to lose_s when slice_check rejects the markers (lose_s presumably releases it - defined outside this excerpt), and NULL when increase_size fails. */ symbol * slice_to(struct SN_env * z, symbol * p) { 444 | if (slice_check(z)) { 445 | lose_s(p); 446 | return NULL; 447 | } 448 | { 449 | int len = z->ket - z->bra; 450 | if (CAPACITY(p) < len) { 451 | p = increase_size(p, len); 452 | if (p == NULL) 453 | return NULL; 454 | } 455 | memmove(p, z->p + z->bra, len * sizeof(symbol)); 456 | SET_SIZE(p, len); 457 | } 458 | return p; 459 | } 460 | 461 | /* assign_to: copies the first z->l symbols of z->p into p, growing p first when CAPACITY(p) is too small, and sets p's size to z->l. Returns the (possibly reallocated) buffer, or NULL when increase_size fails. No slice_check is performed here. */ symbol * assign_to(struct SN_env * z, symbol * p) { 462 | int len = z->l; 463 | if (CAPACITY(p) < len) { 464 | p = increase_size(p, len); 465 | if (p == NULL) 466 | return NULL; 467 | } 468 | memmove(p, z->p, len * sizeof(symbol)); 469 | SET_SIZE(p, len); 470 | return p; 471 | } 472 | 473 | /* len_utf8: counts the characters in the UTF-8 buffer p by walking its SIZE(p) bytes and counting every byte outside 0x80..0xBF, i.e. every byte that is not a continuation byte. */ int len_utf8(const symbol * p) { 474 | int size = SIZE(p); 475 | int len = 0; 476 | while (size--) { 477 | symbol b = *p++; 478 | if (b >= 0xC0 || b < 0x80) ++len; /* single-byte character or lead byte of a multi-byte one */ 479 | } 480 | return len; 481 | } 482 | 483 | #if 0 484 | void debug(struct SN_env * z, int number, int line_count) { 485 | int i; 486 | int limit = SIZE(z->p); 487 | /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/ 488 | if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); 489 | for (i = 0; i <= limit; i++) { 490 | if (z->lb == i) printf("{"); 491 | if (z->bra == i) printf("["); 492 | if (z->c == i) printf("|"); 493 | if (z->ket == i) printf("]"); 494 | if (z->l == i) printf("}"); 495 | if (i < limit) 496 | { int ch = z->p[i]; 497 | if (ch == 0) ch = '#'; 498 | printf("%c", ch); 499 | } 500 | } 501 | printf("'\n"); 502 | } 503 | #endif 504 | -------------------------------------------------------------------------------- /src/tapkomet_stem/ukr.c: -------------------------------------------------------------------------------- 1 | /* Generated by Snowball 2.0.0 - https://snowballstem.org/ */ 2 | 3 | #include "header.h" 4 | 5 | #ifdef __cplusplus 6 
| extern "C" { 7 | #endif 8 | extern int ukr_stem(struct SN_env * z); 9 | #ifdef __cplusplus 10 | } 11 | #endif 12 | static int r_tidy_up(struct SN_env * z); 13 | static int r_noun(struct SN_env * z); 14 | static int r_verb(struct SN_env * z); 15 | static int r_postfix(struct SN_env * z); 16 | static int r_adjective(struct SN_env * z); 17 | static int r_exception1(struct SN_env * z); 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | 23 | extern struct SN_env * ukr_create_env(void); 24 | extern void ukr_close_env(struct SN_env * z); 25 | 26 | 27 | #ifdef __cplusplus 28 | } 29 | #endif 30 | static const symbol s_0_0[8] = { 0xD0, 0xB0, 0xD0, 0xB4, 0xD0, 0xB6, 0xD0, 0xB5 }; 31 | static const symbol s_0_1[8] = { 0xD0, 0xB0, 0xD1, 0x82, 0xD0, 0xBE, 0xD0, 0xBC }; 32 | static const symbol s_0_2[8] = { 0xD0, 0xB2, 0xD1, 0x96, 0xD1, 0x81, 0xD1, 0x8C }; 33 | static const symbol s_0_3[8] = { 0xD0, 0xB4, 0xD0, 0xB5, 0xD1, 0x81, 0xD1, 0x8C }; 34 | static const symbol s_0_4[15] = { 0xD0, 0xB7, 0xD0, 0xB4, 0xD0, 0xBE, 0xD1, 0x80, 0xD0, 0xBE, 0xD0, 0xB2, '\'', 0xD1, 0x8F }; 35 | static const symbol s_0_5[8] = { 0xD0, 0xBA, 0xD1, 0x80, 0xD0, 0xBE, 0xD0, 0xBA }; 36 | static const symbol s_0_6[8] = { 0xD0, 0xBA, 0xD1, 0x80, 0xD1, 0x96, 0xD0, 0xBC }; 37 | 38 | static const struct among a_0[7] = 39 | { 40 | { 8, s_0_0, -1, -1, 0}, 41 | { 8, s_0_1, -1, -1, 0}, 42 | { 8, s_0_2, -1, -1, 0}, 43 | { 8, s_0_3, -1, -1, 0}, 44 | { 15, s_0_4, -1, 1, 0}, 45 | { 8, s_0_5, -1, -1, 0}, 46 | { 8, s_0_6, -1, -1, 0} 47 | }; 48 | 49 | static const symbol s_1_0[6] = { 0xD0, 0xB5, 0xD0, 0xBC, 0xD1, 0x83 }; 50 | static const symbol s_1_1[6] = { 0xD0, 0xBE, 0xD0, 0xBC, 0xD1, 0x83 }; 51 | static const symbol s_1_2[10] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xBE, 0xD0, 0xBC, 0xD1, 0x83 }; 52 | static const symbol s_1_3[4] = { 0xD1, 0x96, 0xD1, 0x85 }; 53 | static const symbol s_1_4[4] = { 0xD0, 0xB8, 0xD1, 0x85 }; 54 | static const symbol s_1_5[8] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xB8, 0xD1, 0x85 }; 
55 | static const symbol s_1_6[4] = { 0xD1, 0x83, 0xD1, 0x8E }; 56 | static const symbol s_1_7[4] = { 0xD1, 0x8E, 0xD1, 0x8E }; 57 | static const symbol s_1_8[4] = { 0xD0, 0xB5, 0xD1, 0x8E }; 58 | static const symbol s_1_9[4] = { 0xD0, 0xBE, 0xD1, 0x8E }; 59 | static const symbol s_1_10[8] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xBE, 0xD1, 0x8E }; 60 | static const symbol s_1_11[4] = { 0xD0, 0xB0, 0xD1, 0x8F }; 61 | static const symbol s_1_12[4] = { 0xD0, 0xBE, 0xD1, 0x97 }; 62 | static const symbol s_1_13[8] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xBE, 0xD1, 0x97 }; 63 | static const symbol s_1_14[6] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xB0 }; 64 | static const symbol s_1_15[6] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xB5 }; 65 | static const symbol s_1_16[6] = { 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB8 }; 66 | static const symbol s_1_17[4] = { 0xD1, 0x96, 0xD0, 0xB9 }; 67 | static const symbol s_1_18[8] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD1, 0x96, 0xD0, 0xB9 }; 68 | static const symbol s_1_19[4] = { 0xD0, 0xB5, 0xD0, 0xB9 }; 69 | static const symbol s_1_20[4] = { 0xD0, 0xB8, 0xD0, 0xB9 }; 70 | static const symbol s_1_21[8] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xB8, 0xD0, 0xB9 }; 71 | static const symbol s_1_22[4] = { 0xD1, 0x96, 0xD0, 0xBC }; 72 | static const symbol s_1_23[4] = { 0xD0, 0xB5, 0xD0, 0xBC }; 73 | static const symbol s_1_24[4] = { 0xD0, 0xB8, 0xD0, 0xBC }; 74 | static const symbol s_1_25[8] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xB8, 0xD0, 0xBC }; 75 | static const symbol s_1_26[4] = { 0xD0, 0xBE, 0xD0, 0xBC }; 76 | static const symbol s_1_27[6] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xBE }; 77 | static const symbol s_1_28[6] = { 0xD0, 0xBE, 0xD0, 0xB3, 0xD0, 0xBE }; 78 | static const symbol s_1_29[10] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD0, 0xBE, 0xD0, 0xB3, 0xD0, 0xBE }; 79 | 80 | static const struct among a_1[30] = 81 | { 82 | { 6, s_1_0, -1, 1, 0}, 83 | { 6, s_1_1, -1, 1, 0}, 84 | { 10, s_1_2, 1, 1, 0}, 85 | { 4, s_1_3, -1, 1, 0}, 86 | { 4, s_1_4, -1, 1, 0}, 87 | { 8, s_1_5, 4, 1, 0}, 88 | { 
4, s_1_6, -1, 1, 0}, 89 | { 4, s_1_7, -1, 1, 0}, 90 | { 4, s_1_8, -1, 1, 0}, 91 | { 4, s_1_9, -1, 1, 0}, 92 | { 8, s_1_10, 9, 1, 0}, 93 | { 4, s_1_11, -1, 1, 0}, 94 | { 4, s_1_12, -1, 1, 0}, 95 | { 8, s_1_13, 12, 1, 0}, 96 | { 6, s_1_14, -1, 1, 0}, 97 | { 6, s_1_15, -1, 1, 0}, 98 | { 6, s_1_16, -1, 1, 0}, 99 | { 4, s_1_17, -1, 1, 0}, 100 | { 8, s_1_18, 17, 1, 0}, 101 | { 4, s_1_19, -1, 1, 0}, 102 | { 4, s_1_20, -1, 1, 0}, 103 | { 8, s_1_21, 20, 1, 0}, 104 | { 4, s_1_22, -1, 1, 0}, 105 | { 4, s_1_23, -1, 1, 0}, 106 | { 4, s_1_24, -1, 1, 0}, 107 | { 8, s_1_25, 24, 1, 0}, 108 | { 4, s_1_26, -1, 1, 0}, 109 | { 6, s_1_27, -1, 1, 0}, 110 | { 6, s_1_28, -1, 1, 0}, 111 | { 10, s_1_29, 28, 1, 0} 112 | }; 113 | 114 | static const symbol s_2_0[4] = { 0xD1, 0x81, 0xD1, 0x8C }; 115 | static const symbol s_2_1[4] = { 0xD1, 0x81, 0xD1, 0x8F }; 116 | 117 | static const struct among a_2[2] = 118 | { 119 | { 4, s_2_0, -1, 1, 0}, 120 | { 4, s_2_1, -1, 1, 0} 121 | }; 122 | 123 | static const symbol s_3_0[8] = { 0xD0, 0xB0, 0xD0, 0xBD, 0xD0, 0xB8, 0xD1, 0x85 }; 124 | static const symbol s_3_1[8] = { 0xD1, 0x83, 0xD1, 0x8E, 0xD1, 0x82, 0xD1, 0x8C }; 125 | static const symbol s_3_2[8] = { 0xD1, 0x96, 0xD1, 0x8E, 0xD1, 0x82, 0xD1, 0x8C }; 126 | static const symbol s_3_3[8] = { 0xD0, 0xB0, 0xD1, 0x8E, 0xD1, 0x82, 0xD1, 0x8C }; 127 | static const symbol s_3_4[6] = { 0xD1, 0x96, 0xD1, 0x82, 0xD1, 0x8C }; 128 | static const symbol s_3_5[6] = { 0xD0, 0xB8, 0xD1, 0x82, 0xD1, 0x8C }; 129 | static const symbol s_3_6[12] = { 0xD1, 0x83, 0xD0, 0xB2, 0xD0, 0xB0, 0xD0, 0xBD, 0xD0, 0xBD, 0xD1, 0x8F }; 130 | static const symbol s_3_7[4] = { 0xD1, 0x83, 0xD1, 0x94 }; 131 | static const symbol s_3_8[4] = { 0xD1, 0x8E, 0xD1, 0x94 }; 132 | static const symbol s_3_9[4] = { 0xD1, 0x8F, 0xD1, 0x94 }; 133 | static const symbol s_3_10[4] = { 0xD1, 0x96, 0xD1, 0x94 }; 134 | static const symbol s_3_11[4] = { 0xD0, 0xB0, 0xD1, 0x94 }; 135 | static const symbol s_3_12[4] = { 0xD0, 0xB8, 0xD1, 0x94 }; 136 | static 
const symbol s_3_13[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD1, 0x96 }; 137 | static const symbol s_3_14[6] = { 0xD0, 0xB0, 0xD0, 0xBD, 0xD1, 0x96 }; 138 | static const symbol s_3_15[6] = { 0xD0, 0xB8, 0xD1, 0x82, 0xD0, 0xB0 }; 139 | static const symbol s_3_16[6] = { 0xD1, 0x88, 0xD0, 0xBB, 0xD0, 0xB0 }; 140 | static const symbol s_3_17[6] = { 0xD1, 0x96, 0xD0, 0xBB, 0xD0, 0xB0 }; 141 | static const symbol s_3_18[6] = { 0xD0, 0xB0, 0xD0, 0xBB, 0xD0, 0xB0 }; 142 | static const symbol s_3_19[10] = { 0xD1, 0x83, 0xD0, 0xB2, 0xD0, 0xB0, 0xD0, 0xBB, 0xD0, 0xB0 }; 143 | static const symbol s_3_20[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD0, 0xB0 }; 144 | static const symbol s_3_21[6] = { 0xD0, 0xB5, 0xD0, 0xBD, 0xD0, 0xB0 }; 145 | static const symbol s_3_22[4] = { 0xD0, 0xB0, 0xD0, 0xB2 }; 146 | static const symbol s_3_23[8] = { 0xD1, 0x83, 0xD0, 0xB2, 0xD0, 0xB0, 0xD0, 0xB2 }; 147 | static const symbol s_3_24[4] = { 0xD0, 0xB8, 0xD0, 0xB2 }; 148 | static const symbol s_3_25[6] = { 0xD1, 0x88, 0xD0, 0xBE, 0xD0, 0xB2 }; 149 | static const symbol s_3_26[8] = { 0xD1, 0x83, 0xD0, 0xB9, 0xD1, 0x82, 0xD0, 0xB5 }; 150 | static const symbol s_3_27[10] = { 0xD1, 0x83, 0xD0, 0xB2, 0xD0, 0xB0, 0xD1, 0x82, 0xD0, 0xB8 }; 151 | static const symbol s_3_28[8] = { 0xD0, 0xB8, 0xD0, 0xB2, 0xD1, 0x88, 0xD0, 0xB8 }; 152 | static const symbol s_3_29[6] = { 0xD1, 0x88, 0xD0, 0xBB, 0xD0, 0xB8 }; 153 | static const symbol s_3_30[6] = { 0xD0, 0xB0, 0xD0, 0xBB, 0xD0, 0xB8 }; 154 | static const symbol s_3_31[10] = { 0xD1, 0x83, 0xD0, 0xB2, 0xD0, 0xB0, 0xD0, 0xBB, 0xD0, 0xB8 }; 155 | static const symbol s_3_32[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD0, 0xB8 }; 156 | static const symbol s_3_33[10] = { 0xD0, 0xB0, 0xD0, 0xBD, 0xD0, 0xB8, 0xD0, 0xBC, 0xD0, 0xB8 }; 157 | static const symbol s_3_34[4] = { 0xD1, 0x83, 0xD0, 0xB9 }; 158 | static const symbol s_3_35[6] = { 0xD1, 0x88, 0xD0, 0xBB, 0xD0, 0xBE }; 159 | static const symbol s_3_36[6] = { 0xD1, 0x96, 0xD0, 0xBB, 0xD0, 0xBE }; 160 | static const symbol s_3_37[6] = { 
0xD0, 0xB0, 0xD0, 0xBB, 0xD0, 0xBE }; 161 | static const symbol s_3_38[10] = { 0xD1, 0x83, 0xD0, 0xB2, 0xD0, 0xB0, 0xD0, 0xBB, 0xD0, 0xBE }; 162 | static const symbol s_3_39[6] = { 0xD0, 0xB8, 0xD0, 0xBB, 0xD0, 0xBE }; 163 | static const symbol s_3_40[6] = { 0xD0, 0xB5, 0xD0, 0xBD, 0xD0, 0xBE }; 164 | 165 | static const struct among a_3[41] = 166 | { 167 | { 8, s_3_0, -1, 2, 0}, 168 | { 8, s_3_1, -1, 2, 0}, 169 | { 8, s_3_2, -1, 2, 0}, 170 | { 8, s_3_3, -1, 2, 0}, 171 | { 6, s_3_4, -1, 2, 0}, 172 | { 6, s_3_5, -1, 2, 0}, 173 | { 12, s_3_6, -1, 2, 0}, 174 | { 4, s_3_7, -1, 2, 0}, 175 | { 4, s_3_8, -1, 2, 0}, 176 | { 4, s_3_9, -1, 2, 0}, 177 | { 4, s_3_10, -1, 2, 0}, 178 | { 4, s_3_11, -1, 2, 0}, 179 | { 4, s_3_12, -1, 2, 0}, 180 | { 6, s_3_13, -1, 2, 0}, 181 | { 6, s_3_14, -1, 2, 0}, 182 | { 6, s_3_15, -1, 2, 0}, 183 | { 6, s_3_16, -1, 1, 0}, 184 | { 6, s_3_17, -1, 2, 0}, 185 | { 6, s_3_18, -1, 2, 0}, 186 | { 10, s_3_19, 18, 2, 0}, 187 | { 6, s_3_20, -1, 2, 0}, 188 | { 6, s_3_21, -1, 2, 0}, 189 | { 4, s_3_22, -1, 2, 0}, 190 | { 8, s_3_23, 22, 2, 0}, 191 | { 4, s_3_24, -1, 2, 0}, 192 | { 6, s_3_25, -1, 1, 0}, 193 | { 8, s_3_26, -1, 2, 0}, 194 | { 10, s_3_27, -1, 2, 0}, 195 | { 8, s_3_28, -1, 2, 0}, 196 | { 6, s_3_29, -1, 1, 0}, 197 | { 6, s_3_30, -1, 2, 0}, 198 | { 10, s_3_31, 30, 2, 0}, 199 | { 6, s_3_32, -1, 2, 0}, 200 | { 10, s_3_33, -1, 2, 0}, 201 | { 4, s_3_34, -1, 2, 0}, 202 | { 6, s_3_35, -1, 1, 0}, 203 | { 6, s_3_36, -1, 2, 0}, 204 | { 6, s_3_37, -1, 2, 0}, 205 | { 10, s_3_38, 37, 2, 0}, 206 | { 6, s_3_39, -1, 2, 0}, 207 | { 6, s_3_40, -1, 2, 0} 208 | }; 209 | 210 | static const symbol s_4_0[4] = { 0xD1, 0x8F, 0xD1, 0x82 }; 211 | static const symbol s_4_1[2] = { 0xD1, 0x83 }; 212 | static const symbol s_4_2[4] = { 0xD1, 0x8F, 0xD1, 0x85 }; 213 | static const symbol s_4_3[6] = { 0xD1, 0x96, 0xD1, 0x8F, 0xD1, 0x85 }; 214 | static const symbol s_4_4[6] = { 0xD0, 0xBE, 0xD1, 0x8F, 0xD1, 0x85 }; 215 | static const symbol s_4_5[4] = { 0xD0, 0xB0, 0xD1, 0x85 }; 216 
| static const symbol s_4_6[2] = { 0xD1, 0x8C }; 217 | static const symbol s_4_7[2] = { 0xD1, 0x8E }; 218 | static const symbol s_4_8[4] = { 0xD1, 0x83, 0xD1, 0x8E }; 219 | static const symbol s_4_9[6] = { 0xD1, 0x96, 0xD1, 0x94, 0xD1, 0x8E }; 220 | static const symbol s_4_10[4] = { 0xD1, 0x96, 0xD1, 0x8E }; 221 | static const symbol s_4_11[2] = { 0xD1, 0x8F }; 222 | static const symbol s_4_12[4] = { 0xD1, 0x96, 0xD1, 0x8F }; 223 | static const symbol s_4_13[4] = { 0xD0, 0xBE, 0xD1, 0x8F }; 224 | static const symbol s_4_14[2] = { 0xD1, 0x96 }; 225 | static const symbol s_4_15[6] = { 0xD0, 0xBE, 0xD0, 0xB2, 0xD1, 0x96 }; 226 | static const symbol s_4_16[4] = { 0xD1, 0x96, 0xD1, 0x97 }; 227 | static const symbol s_4_17[4] = { 0xD0, 0xB5, 0xD1, 0x97 }; 228 | static const symbol s_4_18[2] = { 0xD0, 0xB0 }; 229 | static const symbol s_4_19[6] = { 0xD1, 0x8F, 0xD1, 0x82, 0xD0, 0xB0 }; 230 | static const symbol s_4_20[4] = { 0xD1, 0x96, 0xD0, 0xB2 }; 231 | static const symbol s_4_21[4] = { 0xD1, 0x97, 0xD0, 0xB2 }; 232 | static const symbol s_4_22[6] = { 0xD0, 0xBE, 0xD1, 0x97, 0xD0, 0xB2 }; 233 | static const symbol s_4_23[4] = { 0xD0, 0xBE, 0xD0, 0xB2 }; 234 | static const symbol s_4_24[2] = { 0xD0, 0xB5 }; 235 | static const symbol s_4_25[2] = { 0xD0, 0xB8 }; 236 | static const symbol s_4_26[6] = { 0xD1, 0x8F, 0xD0, 0xBC, 0xD0, 0xB8 }; 237 | static const symbol s_4_27[8] = { 0xD1, 0x96, 0xD1, 0x8F, 0xD0, 0xBC, 0xD0, 0xB8 }; 238 | static const symbol s_4_28[6] = { 0xD0, 0xB0, 0xD0, 0xBC, 0xD0, 0xB8 }; 239 | static const symbol s_4_29[10] = { 0xD1, 0x8F, 0xD1, 0x82, 0xD0, 0xB0, 0xD0, 0xBC, 0xD0, 0xB8 }; 240 | static const symbol s_4_30[2] = { 0xD0, 0xB9 }; 241 | static const symbol s_4_31[4] = { 0xD1, 0x96, 0xD0, 0xB9 }; 242 | static const symbol s_4_32[4] = { 0xD0, 0xB5, 0xD0, 0xB9 }; 243 | static const symbol s_4_33[4] = { 0xD0, 0xBE, 0xD0, 0xB9 }; 244 | static const symbol s_4_34[4] = { 0xD0, 0xBE, 0xD0, 0xBA }; 245 | static const symbol s_4_35[4] = { 0xD1, 0x96, 
0xD0, 0xBB }; 246 | static const symbol s_4_36[4] = { 0xD0, 0xB8, 0xD0, 0xBB }; 247 | static const symbol s_4_37[4] = { 0xD1, 0x8F, 0xD0, 0xBC }; 248 | static const symbol s_4_38[6] = { 0xD1, 0x96, 0xD1, 0x8F, 0xD0, 0xBC }; 249 | static const symbol s_4_39[6] = { 0xD0, 0xBE, 0xD1, 0x94, 0xD0, 0xBC }; 250 | static const symbol s_4_40[4] = { 0xD0, 0xB0, 0xD0, 0xBC }; 251 | static const symbol s_4_41[8] = { 0xD1, 0x8F, 0xD1, 0x82, 0xD0, 0xB0, 0xD0, 0xBC }; 252 | static const symbol s_4_42[4] = { 0xD0, 0xB5, 0xD0, 0xBC }; 253 | static const symbol s_4_43[4] = { 0xD0, 0xBE, 0xD0, 0xBC }; 254 | static const symbol s_4_44[4] = { 0xD0, 0xB5, 0xD0, 0xBD }; 255 | static const symbol s_4_45[2] = { 0xD0, 0xBE }; 256 | 257 | static const struct among a_4[46] = 258 | { 259 | { 4, s_4_0, -1, 3, 0}, 260 | { 2, s_4_1, -1, 3, 0}, 261 | { 4, s_4_2, -1, 3, 0}, 262 | { 6, s_4_3, 2, 3, 0}, 263 | { 6, s_4_4, 2, 3, 0}, 264 | { 4, s_4_5, -1, 3, 0}, 265 | { 2, s_4_6, -1, 3, 0}, 266 | { 2, s_4_7, -1, 3, 0}, 267 | { 4, s_4_8, 7, 3, 0}, 268 | { 6, s_4_9, 7, 3, 0}, 269 | { 4, s_4_10, 7, 3, 0}, 270 | { 2, s_4_11, -1, 3, 0}, 271 | { 4, s_4_12, 11, 3, 0}, 272 | { 4, s_4_13, 11, 3, 0}, 273 | { 2, s_4_14, -1, 3, 0}, 274 | { 6, s_4_15, 14, 3, 0}, 275 | { 4, s_4_16, -1, 3, 0}, 276 | { 4, s_4_17, -1, 3, 0}, 277 | { 2, s_4_18, -1, 3, 0}, 278 | { 6, s_4_19, 18, 1, 0}, 279 | { 4, s_4_20, -1, 3, 0}, 280 | { 4, s_4_21, -1, 3, 0}, 281 | { 6, s_4_22, 21, 3, 0}, 282 | { 4, s_4_23, -1, 3, 0}, 283 | { 2, s_4_24, -1, 3, 0}, 284 | { 2, s_4_25, -1, 3, 0}, 285 | { 6, s_4_26, 25, 3, 0}, 286 | { 8, s_4_27, 26, 3, 0}, 287 | { 6, s_4_28, 25, 3, 0}, 288 | { 10, s_4_29, 28, 1, 0}, 289 | { 2, s_4_30, -1, 3, 0}, 290 | { 4, s_4_31, 30, 3, 0}, 291 | { 4, s_4_32, 30, 3, 0}, 292 | { 4, s_4_33, 30, 3, 0}, 293 | { 4, s_4_34, -1, 2, 0}, 294 | { 4, s_4_35, -1, 3, 0}, 295 | { 4, s_4_36, -1, 3, 0}, 296 | { 4, s_4_37, -1, 3, 0}, 297 | { 6, s_4_38, 37, 3, 0}, 298 | { 6, s_4_39, -1, 3, 0}, 299 | { 4, s_4_40, -1, 3, 0}, 300 | { 8, 
s_4_41, 40, 1, 0}, 301 | { 4, s_4_42, -1, 3, 0}, 302 | { 4, s_4_43, -1, 3, 0}, 303 | { 4, s_4_44, -1, 3, 0}, 304 | { 2, s_4_45, -1, 3, 0} 305 | }; 306 | 307 | static const symbol s_5_0[1] = { '\'' }; 308 | static const symbol s_5_1[2] = { 0xD1, 0x82 }; 309 | static const symbol s_5_2[6] = { 0xD0, 0xBE, 0xD1, 0x81, 0xD1, 0x82 }; 310 | static const symbol s_5_3[6] = { 0xD1, 0x8E, 0xD1, 0x8E, 0xD1, 0x82 }; 311 | static const symbol s_5_4[6] = { 0xD1, 0x83, 0xD1, 0x94, 0xD1, 0x82 }; 312 | static const symbol s_5_5[6] = { 0xD1, 0x8E, 0xD1, 0x94, 0xD1, 0x82 }; 313 | static const symbol s_5_6[6] = { 0xD1, 0x8F, 0xD1, 0x94, 0xD1, 0x82 }; 314 | static const symbol s_5_7[6] = { 0xD0, 0xB0, 0xD1, 0x94, 0xD1, 0x82 }; 315 | static const symbol s_5_8[4] = { 0xD0, 0xB8, 0xD1, 0x82 }; 316 | static const symbol s_5_9[2] = { 0xD1, 0x8C }; 317 | static const symbol s_5_10[6] = { 0xD1, 0x81, 0xD1, 0x8C, 0xD0, 0xBA }; 318 | static const symbol s_5_11[10] = { 0xD1, 0x96, 0xD0, 0xB9, 0xD1, 0x81, 0xD1, 0x8C, 0xD0, 0xBA }; 319 | static const symbol s_5_12[2] = { 0xD0, 0xBD }; 320 | static const symbol s_5_13[6] = { 0xD1, 0x96, 0xD1, 0x87, 0xD0, 0xBD }; 321 | static const symbol s_5_14[4] = { 0xD1, 0x8C, 0xD0, 0xBD }; 322 | static const symbol s_5_15[4] = { 0xD0, 0xB0, 0xD0, 0xBD }; 323 | static const symbol s_5_16[4] = { 0xD0, 0xB5, 0xD0, 0xBD }; 324 | 325 | static const struct among a_5[17] = 326 | { 327 | { 1, s_5_0, -1, 3, 0}, 328 | { 2, s_5_1, -1, 2, 0}, 329 | { 6, s_5_2, 1, 4, 0}, 330 | { 6, s_5_3, 1, 3, 0}, 331 | { 6, s_5_4, 1, 3, 0}, 332 | { 6, s_5_5, 1, 3, 0}, 333 | { 6, s_5_6, 1, 3, 0}, 334 | { 6, s_5_7, 1, 3, 0}, 335 | { 4, s_5_8, 1, 3, 0}, 336 | { 2, s_5_9, -1, 3, 0}, 337 | { 6, s_5_10, -1, 3, 0}, 338 | { 10, s_5_11, 10, 3, 0}, 339 | { 2, s_5_12, -1, 1, 0}, 340 | { 6, s_5_13, 12, 3, 0}, 341 | { 4, s_5_14, 12, 3, 0}, 342 | { 4, s_5_15, 12, 3, 0}, 343 | { 4, s_5_16, 12, 3, 0} 344 | }; 345 | 346 | static const symbol s_0[] = { 0xD0, 0xB7, 0xD0, 0xB4, 0xD0, 0xBE, 0xD1, 0x80 }; 347 | 
static const symbol s_1[] = { 0xD1, 0x82 }; /* UTF-8 for U+0442; replacement used by r_verb */ 348 | static const symbol s_2[] = { 0xD0, 0xBD }; /* UTF-8 for U+043D; context required by r_noun case 1 */ 349 | static const symbol s_3[] = { 0xD0, 0xBA }; /* UTF-8 for U+043A; replacement used by r_noun case 2 */ 350 | static const symbol s_4[] = { 0xD0, 0xBD }; /* UTF-8 for U+043D; context required by r_tidy_up case 1 */ 351 | static const symbol s_5[] = { 0xD1, 0x82 }; /* UTF-8 for U+0442; context required by r_tidy_up case 2 */ 352 | static const symbol s_6[] = { 0xD1, 0x96, 0xD1, 0x81, 0xD1, 0x82 }; /* UTF-8 for U+0456 U+0441 U+0442; replacement used by r_tidy_up case 4 */ 353 | 354 | /* r_exception1: forward match of the table a_0 at the cursor. Returns 0 when no entry matches or when the match stops short of z->l. Otherwise returns 1; only a match whose result is 1 changes the text, rewriting the matched slice to the 8 bytes of s_0. A negative value from slice_from_s is passed straight back. */ static int r_exception1(struct SN_env * z) { 355 | int among_var; 356 | z->bra = z->c; 357 | among_var = find_among(z, a_0, 7); 358 | if (!(among_var)) return 0; 359 | z->ket = z->c; 360 | if (z->c < z->l) return 0; /* the match must extend to the limit */ 361 | switch (among_var) { 362 | case 1: 363 | { int ret = slice_from_s(z, 8, s_0); 364 | if (ret < 0) return ret; 365 | } 366 | break; 367 | } 368 | return 1; 369 | } 370 | 371 | /* r_adjective: backward match of the table a_1 ending at the cursor; a matched ending is deleted. Returns 0 when nothing matches, 1 after a deletion, or the negative value reported by slice_del. */ static int r_adjective(struct SN_env * z) { 372 | z->ket = z->c; 373 | if (!(find_among_b(z, a_1, 30))) return 0; 374 | z->bra = z->c; 375 | { int ret = slice_del(z); 376 | if (ret < 0) return ret; 377 | } 378 | return 1; 379 | } 380 | 381 | /* r_postfix: backward match of the two-entry table a_2; a matched ending is deleted. Returns 0 when nothing matches, 1 after a deletion, or the negative value reported by slice_del. */ static int r_postfix(struct SN_env * z) { 382 | z->ket = z->c; 383 | /* quick reject before the table search: at least 4 symbols must lie between z->lb and the cursor, and the last one must be 140 (0x8C) or 143 (0x8F), the final bytes of the two a_2 keys */ if (z->c - 3 <= z->lb || (z->p[z->c - 1] != 140 && z->p[z->c - 1] != 143)) return 0; 384 | if (!(find_among_b(z, a_2, 2))) return 0; 385 | z->bra = z->c; 386 | { int ret = slice_del(z); 387 | if (ret < 0) return ret; 388 | } 389 | return 1; 390 | } 391 | 392 | /* r_verb: backward match of the table a_3. Result 1 replaces the matched ending with the 2 bytes of s_1; result 2 deletes it. Returns 0 when nothing matches, 1 after an edit, or the negative value reported by the slice routine. */ static int r_verb(struct SN_env * z) { 393 | int among_var; 394 | z->ket = z->c; 395 | among_var = find_among_b(z, a_3, 41); 396 | if (!(among_var)) return 0; 397 | z->bra = z->c; 398 | switch (among_var) { 399 | case 1: 400 | { int ret = slice_from_s(z, 2, s_1); 401 | if (ret < 0) return ret; 402 | } 403 | break; 404 | case 2: 405 | { int ret = slice_del(z); 406 | if (ret < 0) return ret; 407 | } 408 | break; 409 | } 410 | return 1; 411 | } 412 | 413 | /* r_noun: backward match of the table a_4. Result 1 deletes the matched ending only if the 2 bytes of s_2 come immediately before it (otherwise the routine returns 0); result 2 replaces the ending with the 2 bytes of s_3; result 3 deletes it. z->bra is fixed before the context test, so that test never widens the slice being edited. Returns 1 after an edit or the negative value reported by the slice routine. */ static int r_noun(struct SN_env * z) { 414 | int among_var; 415 | z->ket = z->c; 416 | among_var = find_among_b(z, a_4, 46); 417 | if (!(among_var)) return 0; 418 | z->bra = z->c; 419 | switch (among_var) { 420 | case 1: 421 | if (!(eq_s_b(z, 
2, s_2))) return 0; 422 | { int ret = slice_del(z); 423 | if (ret < 0) return ret; 424 | } 425 | break; 426 | case 2: 427 | { int ret = slice_from_s(z, 2, s_3); 428 | if (ret < 0) return ret; 429 | } 430 | break; 431 | case 3: 432 | { int ret = slice_del(z); 433 | if (ret < 0) return ret; 434 | } 435 | break; 436 | } 437 | return 1; 438 | } 439 | 440 | /* r_tidy_up: backward match of the table a_5. Results 1 and 2 delete the matched ending only if it is immediately preceded by the 2 bytes of s_4 or s_5 respectively (otherwise the routine returns 0); result 3 deletes it unconditionally; result 4 replaces it with the 6 bytes of s_6. Returns 0 when nothing matches, 1 after an edit, or the negative value reported by the slice routine. */ static int r_tidy_up(struct SN_env * z) { 441 | int among_var; 442 | z->ket = z->c; 443 | among_var = find_among_b(z, a_5, 17); 444 | if (!(among_var)) return 0; 445 | z->bra = z->c; 446 | switch (among_var) { 447 | case 1: 448 | if (!(eq_s_b(z, 2, s_4))) return 0; 449 | { int ret = slice_del(z); 450 | if (ret < 0) return ret; 451 | } 452 | break; 453 | case 2: 454 | if (!(eq_s_b(z, 2, s_5))) return 0; 455 | { int ret = slice_del(z); 456 | if (ret < 0) return ret; 457 | } 458 | break; 459 | case 3: 460 | { int ret = slice_del(z); 461 | if (ret < 0) return ret; 462 | } 463 | break; 464 | case 4: 465 | { int ret = slice_from_s(z, 6, s_6); 466 | if (ret < 0) return ret; 467 | } 468 | break; 469 | } 470 | return 1; 471 | } 472 | 473 | /* ukr_stem: entry point of the stemmer. It first calls skip_utf8 with a count of +4 from the cursor (presumably a check that at least four characters are present - skip_utf8 is defined outside this excerpt); if that fails the text is left as it is. Otherwise r_exception1 is tried, and when it matches nothing else runs. When it does not match, the routine switches to backward processing: r_postfix is applied if it matches, then the first of r_adjective, r_verb and r_noun that succeeds, then r_tidy_up. Returns 1 in all of these cases; a negative value from any routine is returned immediately as an error. */ extern int ukr_stem(struct SN_env * z) { 474 | { int c1 = z->c; 475 | { int c2 = z->c; 476 | { int ret = skip_utf8(z->p, z->c, 0, z->l, + 4); 477 | if (ret < 0) goto lab2; 478 | z->c = ret; 479 | } 480 | goto lab1; 481 | lab2: 482 | z->c = c2; /* skip_utf8 failed: restore the cursor */ 483 | } 484 | goto lab0; /* and finish without stemming */ 485 | lab1: 486 | z->c = c1; /* the skip was only a test: rewind */ 487 | { int c3 = z->c; 488 | { int ret = r_exception1(z); 489 | if (ret == 0) goto lab4; 490 | if (ret < 0) return ret; 491 | } 492 | goto lab3; /* an exception entry matched: done */ 493 | lab4: 494 | z->c = c3; 495 | z->lb = z->c; z->c = z->l; /* backward mode: the limit is the original cursor and the cursor starts at z->l */ 496 | 497 | { int m4 = z->l - z->c; (void)m4; 498 | { int m5 = z->l - z->c; (void)m5; /* cursor positions are saved as distances from z->l, which changes when the text is edited */ 499 | { int ret = r_postfix(z); 500 | if (ret == 0) { z->c = z->l - m5; goto lab6; } /* optional step: a miss only restores the cursor */ 501 | if (ret < 0) return ret; 502 | } 503 | lab6: 504 | ; 505 | } 506 | { int m6 = z->l - z->c; (void)m6; 507 | { int ret = r_adjective(z); 508 | if (ret == 0) goto lab8; 509 | if (ret < 0) return ret; 510 | } 511 | goto lab7; 
512 | lab8: 513 | z->c = z->l - m6; /* r_adjective missed: restore the cursor and try r_verb */ 514 | { int ret = r_verb(z); 515 | if (ret == 0) goto lab9; 516 | if (ret < 0) return ret; 517 | } 518 | goto lab7; 519 | lab9: 520 | z->c = z->l - m6; /* r_verb missed: restore the cursor and try r_noun */ 521 | { int ret = r_noun(z); 522 | if (ret == 0) goto lab5; 523 | if (ret < 0) return ret; 524 | } 525 | } 526 | lab7: 527 | lab5: 528 | z->c = z->l - m4; /* whichever alternative ran, return to the position saved in m4 */ 529 | } 530 | { int m7 = z->l - z->c; (void)m7; 531 | { int ret = r_tidy_up(z); 532 | if (ret < 0) return ret; /* a result of 0 (no match) is not treated as failure */ 533 | } 534 | z->c = z->l - m7; 535 | } 536 | z->c = z->lb; /* leave the cursor at the backward limit */ 537 | } 538 | lab3: 539 | ; 540 | } 541 | lab0: 542 | return 1; 543 | } 544 | 545 | /* ukr_create_env: returns the environment produced by SN_create_env(0, 0); the meaning of the two zero arguments is defined by SN_create_env, outside this excerpt. */ extern struct SN_env * ukr_create_env(void) { return SN_create_env(0, 0); } 546 | 547 | /* ukr_close_env: hands z to SN_close_env with a second argument of 0. */ extern void ukr_close_env(struct SN_env * z) { SN_close_env(z, 0); } 548 | 549 | --------------------------------------------------------------------------------