├── LICENSE ├── Readme.md ├── analyze_corpus_tagset_date.py ├── clean.sh ├── create_dict.py ├── create_simple_lemmatization.py ├── export_data.py ├── graph_log.py ├── join_data.py ├── join_dicts.py ├── krnnt ├── __init__.py ├── additional_format.py ├── aglt.py ├── aligner.py ├── analyzers.py ├── blanks.py ├── features.py ├── keras_models.py ├── new.py ├── pipeline.py ├── readers.py ├── serial_pickle.py ├── structure.py ├── tagger_exps.py ├── utils.py └── writers.py ├── krnnt_run.py ├── krnnt_serve.py ├── krnnt_train.py ├── merge_analyzed_gold.py ├── preprocess_data.py ├── process_xces.py ├── reanalyze.py ├── requirements.txt ├── run_test.sh ├── setup.py ├── shuffle.py ├── split_data.py ├── start_flask_server.sh ├── start_gunicorn_server.sh ├── tagger-eval.py ├── tests ├── benchmark │ ├── test_maca.py │ ├── test_maca_analyze.py │ ├── test_shape.py │ └── test_tags.py ├── conftest.py ├── data │ ├── full │ │ ├── gold-task-c.xml │ │ ├── test-raw.txt │ │ └── train-raw.txt │ ├── reference │ │ ├── gold-task-c_evaluation.txt │ │ ├── in_raw.txt │ │ ├── lemmatisation_test.pkl │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle.part1 │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle.part2 │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2 │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData │ │ ├── nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues │ │ ├── nkjp1m-1.2-reanalyzed.spickle │ │ ├── nkjp1m-1.2.spickle │ │ ├── out.conll │ │ ├── out.conllu │ │ ├── out.jsonl │ │ ├── out.plain │ │ ├── out.xces │ │ ├── weight_test.hdf5 │ │ ├── weight_test.hdf5.final │ │ └── weight_test.hdf5.new │ ├── server │ │ ├── in_raw.txt │ │ ├── in_tokenized.json │ │ ├── in_tokenized_compact.json │ │ ├── out_raw.conll │ │ ├── out_raw.conllu │ │ ├── out_raw.jsonl │ │ ├── out_raw.plain │ │ └── out_raw.xces │ └── small │ │ ├── 00130846.ann.xml │ │ ├── 00132482.ann.xml │ │ ├── gold-task-c.txt │ │ ├── gold-task-c.xml │ │ ├── nkjp1m-1.2-xces.xml │ │ └── train-gold.xml ├── download_model.sh ├── test.sh ├── test_aglt.py ├── test_analyzers.py ├── test_blank.py ├── test_features.py ├── test_morfeusz.py ├── test_parallel_api_speed.py ├── test_process_xces.py ├── test_speed.sh ├── test_structure.py ├── test_system.py ├── test_system_server.py ├── test_tagset.py └── test_writers.py ├── train.py ├── train_lemmatization.py └── voting.py /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 
21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 
93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. 
If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /analyze_corpus_tagset_date.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import collections 4 | import glob 5 | import sys 6 | 7 | from argparse import ArgumentParser 8 | 9 | from krnnt.readers import read_xces 10 | 11 | usage = """%prog CORPUS 12 | 13 | Analyze corpus for changes in dictionary. 14 | """ 15 | 16 | if __name__ == '__main__': 17 | parser = ArgumentParser(usage=usage) 18 | parser.add_argument('corpus_path', type=str, help='path to XCES corpus (or path with wildcard)') 19 | args = parser.parse_args() 20 | 21 | # read corpus 22 | stats_forms = collections.defaultdict(int) 23 | stats_tags = collections.defaultdict(int) 24 | 25 | count_sentences=0 26 | count_igns=0 27 | count_blanks=0 28 | count_wo_disamb=0 29 | count_problems=0 30 | for path in glob.iglob(args.corpus_path): 31 | print(path, file=sys.stderr) 32 | for paragraph in read_xces(path): 33 | 34 | for sentence in paragraph: 35 | count_sentences += 1 36 | ign = False 37 | blank = False 38 | wo_disamb = False 39 | for token in sentence: 40 | form = token.form 41 | try: 42 | tag = token.gold_form.tags 43 | stats_forms[(form, tag)] += 1 44 | stats_tags[tag] += 1 45 | if tag=='ign': 46 | ign=True 47 | elif tag=='blank': 48 | blank=True 49 | except: # no disamb 50 | print("Missing disamb", path, form, file=sys.stderr) 51 | wo_disamb=True 52 | pass 53 | if ign: count_igns+=1 54 | if blank: count_blanks+=1 55 | if wo_disamb: count_wo_disamb+=1 56 | if ign or blank or wo_disamb: count_problems+=1 57 | 58 | # stats 59 | print('Sentences: %s' % count_sentences) 60 | print('Sentences wo disamb: %s' % count_wo_disamb) 61 | print('Sentences with ign: %s' % count_igns) 62 | print('Sentences with blank: %s' % count_blanks) 63 | print('Sentences with problems: %s' % count_problems) 64 | print('Tokens: %s' % sum(stats_forms.values())) 65 | print('Unique tokens: %s' % len(set([x[0] for x in stats_forms]))) 66 | print('Unique token+tag: %s' % len(stats_forms)) 67 | print('Unique tags: %s' % len(stats_tags)) 68 | print('Tokens with tag ign: %s' % stats_tags['ign']) 69 | print('Tokens with tag blank: %s' % stats_tags['blank']) 70 | print() 71 | 72 | # analyse 73 | TAGS = 'tags' 74 | FORMS = 'forms' 75 | POSITIVE = '+' 76 | NEGATIVE = '-' 77 | checks = {} 78 | 79 | checks['20141013 brev -> brev:n?pun'] = { 80 | TAGS: { 81 | POSITIVE: [lambda tag: tag in ('brev:pun', 'brev:npun')], 82 | NEGATIVE: [lambda tag: tag == 'brev'] 83 | }, 84 | FORMS: { 85 | POSITIVE: [], 86 | NEGATIVE: [] 87 | } 88 | } 89 | checks['20150127 siebie, ale w NKJP jest'] = { 90 | TAGS: { 91 | POSITIVE: [], 92 | NEGATIVE: [lambda tag: tag.startswith('siebie:')] 93 | }, 94 | FORMS: { 95 | POSITIVE: [], 96 | NEGATIVE: [] 97 | } 98 | } 99 | 100 | checks['20150617 _'] = { 101 | TAGS: { 102 | POSITIVE: [lambda tag: '_' not in tag], 103 | NEGATIVE: 
[lambda tag: '_' in tag] 104 | }, 105 | FORMS: { 106 | POSITIVE: [], 107 | NEGATIVE: [] 108 | } 109 | } 110 | 111 | checks['20160126 pacta'] = { 112 | TAGS: { 113 | POSITIVE: [lambda tag: tag == 'pacta'], 114 | NEGATIVE: [] 115 | }, 116 | FORMS: { 117 | POSITIVE: [], 118 | NEGATIVE: [] 119 | } 120 | } 121 | 122 | checks['20170301 bardzo:adv:pos, bardziej:adv:com'] = { 123 | TAGS: { 124 | POSITIVE: [], 125 | NEGATIVE: [] 126 | }, 127 | FORMS: { 128 | POSITIVE: [lambda form, tag: form == 'bardziej' and tag == 'adv:com', 129 | lambda form, tag: form == 'bardzo' and tag == 'adv:pos'], 130 | NEGATIVE: [lambda form, tag: form == 'bardziej' and tag == 'adv', 131 | lambda form, tag: form == 'bardzo' and tag == 'adv'] 132 | } 133 | } 134 | checks['20170409 n1,n2,p1,p2,p3 -> n'] = { 135 | TAGS: { 136 | POSITIVE: [lambda tag: {'n'} & set(tag.split(':'))], 137 | NEGATIVE: [lambda tag: {'n1', 'n2', 'n3', 'p1', 'p2', 'p3'} & set(tag.split(':'))] 138 | }, 139 | FORMS: { 140 | POSITIVE: [], 141 | NEGATIVE: [] 142 | } 143 | } 144 | checks['no col,ncol,pt'] = { 145 | TAGS: { 146 | POSITIVE: [lambda tag: not {'col', 'ncol', 'pt'} & set(tag.split(':'))], 147 | NEGATIVE: [lambda tag: {'col', 'ncol', 'pt'} & set(tag.split(':'))] 148 | }, 149 | FORMS: { 150 | POSITIVE: [], 151 | NEGATIVE: [] 152 | } 153 | } 154 | checks['20170430 num:comp -> numcomp'] = { 155 | TAGS: { 156 | POSITIVE: [lambda tag: tag == 'numcomp'], 157 | NEGATIVE: [lambda tag: tag == 'num:comp'] 158 | }, 159 | FORMS: { 160 | POSITIVE: [], 161 | NEGATIVE: [] 162 | } 163 | } 164 | 165 | checks['20170625 jak nie adv'] = { 166 | TAGS: { 167 | POSITIVE: [], 168 | NEGATIVE: [] 169 | }, 170 | FORMS: { 171 | POSITIVE: [], 172 | NEGATIVE: [lambda form, tag: form == 'jak' and tag == 'adv'] 173 | } 174 | } 175 | 176 | checks['20170702 jak:comp'] = { 177 | TAGS: { 178 | POSITIVE: [], 179 | NEGATIVE: [] 180 | }, 181 | FORMS: { 182 | POSITIVE: [lambda form, tag: form == 'jak' and tag == 'comp'], 183 | NEGATIVE: [] 184 | } 185 | } 186 | 187 | checks['20170914 adv na qub'] = { 188 | TAGS: { 189 | POSITIVE: [], 190 | NEGATIVE: [] 191 | }, 192 | FORMS: { 193 | POSITIVE: [lambda form, tag: form == 'niedaleko' and tag == 'prep:gen', 194 | lambda form, tag: form == 'doprawdy' and tag == 'qub'], 195 | NEGATIVE: [lambda form, tag: form == 'doprawdy' and tag == 'adv'] 196 | } 197 | } 198 | 199 | conj_to_comp = ['czym', 'ergo', 'jakokolwiek', 'jakoż', 'przeto', 'tedy', 'to', 'toteż', 'więc', 'zatem'] 200 | checks['20170917 conj na comp'] = { 201 | TAGS: { 202 | POSITIVE: [], 203 | NEGATIVE: [] 204 | }, 205 | FORMS: { 206 | POSITIVE: [], 207 | NEGATIVE: [] 208 | } 209 | } 210 | for token in conj_to_comp: 211 | checks['20170917 conj na comp'][FORMS][POSITIVE].append(lambda form, tag: form == token and tag == 'comp') 212 | checks['20170917 conj na comp'][FORMS][NEGATIVE].append(lambda form, tag: form == token and tag == 'conj') 213 | 214 | checks['20171224 num:..:congr'] = { 215 | TAGS: { 216 | POSITIVE: [lambda tag: tag.startswith('num:') and tag.endswith(':congr')], 217 | NEGATIVE: [] 218 | }, 219 | FORMS: { 220 | POSITIVE: [], 221 | NEGATIVE: [] 222 | } 223 | } 224 | 225 | checks['20180722 adjp -> adjp:dat, adjp:gen; burk -> frag; qub -> part'] = { 226 | TAGS: { 227 | POSITIVE: [lambda tag: tag == 'adjp:dat', 228 | lambda tag: tag == 'adjp:gen', 229 | lambda tag: tag == 'frag', 230 | lambda tag: tag == 'part'], 231 | NEGATIVE: [lambda tag: tag == 'adjp', 232 | lambda tag: tag == 'burk', 233 | lambda tag: tag == 'qub'] 234 | }, 235 | FORMS: { 236 | POSITIVE: [], 237 | 
NEGATIVE: [] 238 | } 239 | } 240 | 241 | checks['NKJP tagset'] = { 242 | TAGS: { 243 | POSITIVE: [lambda tag: tag == 'interj', 244 | lambda tag: tag == 'adjc', 245 | lambda tag: tag == 'burk', 246 | lambda tag: tag == 'numcol'], 247 | NEGATIVE: [] 248 | }, 249 | FORMS: { 250 | POSITIVE: [], 251 | NEGATIVE: [] 252 | } 253 | } 254 | 255 | checks['dig'] = { 256 | TAGS: { 257 | POSITIVE: [], 258 | NEGATIVE: [lambda tag: tag == 'dig'] 259 | }, 260 | FORMS: { 261 | POSITIVE: [], 262 | NEGATIVE: [] 263 | } 264 | } 265 | 266 | checks['romandig'] = { 267 | TAGS: { 268 | POSITIVE: [], 269 | NEGATIVE: [lambda tag: tag == 'romandig'] 270 | }, 271 | FORMS: { 272 | POSITIVE: [], 273 | NEGATIVE: [] 274 | } 275 | } 276 | 277 | checks['blank'] = { 278 | TAGS: { 279 | POSITIVE: [], 280 | NEGATIVE: [lambda tag: tag == 'blank'] 281 | }, 282 | FORMS: { 283 | POSITIVE: [], 284 | NEGATIVE: [] 285 | } 286 | } 287 | 288 | checks['emoticon'] = { 289 | TAGS: { 290 | POSITIVE: [], 291 | NEGATIVE: [lambda tag: tag == 'emoticon'] 292 | }, 293 | FORMS: { 294 | POSITIVE: [], 295 | NEGATIVE: [] 296 | } 297 | } 298 | 299 | checks['emo'] = { 300 | TAGS: { 301 | POSITIVE: [], 302 | NEGATIVE: [lambda tag: tag == 'emo'] 303 | }, 304 | FORMS: { 305 | POSITIVE: [], 306 | NEGATIVE: [] 307 | } 308 | } 309 | 310 | checks['ign'] = { 311 | TAGS: { 312 | POSITIVE: [], 313 | NEGATIVE: [lambda tag: tag == 'ign'] 314 | }, 315 | FORMS: { 316 | POSITIVE: [], 317 | NEGATIVE: [] 318 | } 319 | } 320 | 321 | checks['morfeusz2 tags not in NKJP'] = { 322 | TAGS: { 323 | POSITIVE: [], 324 | NEGATIVE: [lambda tag: tag == 'prefa', 325 | lambda tag: tag == 'prefppas', 326 | lambda tag: tag == 'prefs', 327 | lambda tag: tag == 'prefv', 328 | lambda tag: tag == 'nie', 329 | lambda tag: tag == 'naj', 330 | lambda tag: tag == 'cond', 331 | lambda tag: tag == 'substa'] 332 | }, 333 | FORMS: { 334 | POSITIVE: [], 335 | NEGATIVE: [] 336 | } 337 | } 338 | 339 | 340 | test_data = [ 341 | ('IV', '', 'num:::'), 342 | ('IV', '', 'romandig'), 343 | ('1', '', 'dig'), 344 | ('prostu', 'adjp', 'adjp:gen'), 345 | (':)', '', 'emo'), 346 | ('godzien', 'adjc', ''), 347 | ('oślep', 'burk', 'frag'), 348 | ('obojga', 'numcol:pl:gen:m1:rec', ''), 349 | ('dwoje', 'numcol:pl:acc:m1:rec', ''), 350 | ('czworo', 'numcol:pl:nom:m1:rec', ''), 351 | ('hej', 'interj', ''), 352 | ('jeszcze', 'qub', 'part'), 353 | ('czterem', 'num:pl:dat:m1:congr', ''), 354 | ('czym', 'conj', 'comp'), 355 | ('niedaleko', 'prep:gen', ''), 356 | ('doprawdy', 'qub', 'adv'), 357 | ('jak', 'qub', 'adv'), 358 | ('pół', '', 'numcomp'), 359 | ('pół', '', 'num:comp'), 360 | ('pół', 'num:pl:acc:n:rec', ''), 361 | ('słowa', 'subst:pl:acc:n', 'subst:sg:gen:n:ncol'), 362 | ('rozklepywało', '', 'praet:sg:n1:ter:imperf'), 363 | ('bardzo', 'adv:pos', 'adv'), 364 | ('bardziej', 'adv:com', ''), 365 | ('znacząco', 'adv:pos', 'pacta'), 366 | ('my', '', 'ppron12:pl:nom:_:pri'), 367 | ('sobie', 'siebie:dat', ''), 368 | ('zł', 'brev:npun', 'brev'), 369 | ] 370 | 371 | for formX, exist, not_exist in test_data: 372 | ch={ 373 | TAGS: { 374 | POSITIVE: [], 375 | NEGATIVE: [] 376 | }, 377 | FORMS: { 378 | POSITIVE: [], 379 | NEGATIVE: [] 380 | } 381 | } 382 | if exist: 383 | ch[FORMS][POSITIVE]=[lambda form, tag: form == formX and tag == exist] 384 | if not_exist: 385 | ch[FORMS][NEGATIVE]=[lambda form, tag: form == formX and tag == not_exist] 386 | 387 | checks[f"{formX}, {exist}, {not_exist}"]=ch 388 | 389 | for date, functions in checks.items(): 390 | print('Checking: %s' % date) 391 | for i, function in 
enumerate(functions[TAGS][POSITIVE]): 392 | if any([function(tag) for tag in stats_tags]): 393 | print('%s. +' % (i,)) 394 | else: 395 | print('%s. ?' % (i,)) 396 | for i, function in enumerate(functions[TAGS][NEGATIVE]): 397 | if any([function(tag) for tag in stats_tags]): 398 | print('%s. -' % (i,)) 399 | else: 400 | print('%s. ?' % (i,)) 401 | for i, function in enumerate(functions[FORMS][POSITIVE]): 402 | if any([function(form, tag) for form, tag in stats_forms]): 403 | print('%s. +' % (i,)) 404 | else: 405 | print('%s. ?' % (i,)) 406 | 407 | for i, function in enumerate(functions[FORMS][NEGATIVE]): 408 | if any([function(form, tag) for form, tag in stats_forms]): 409 | print('%s. -' % (i,)) 410 | else: 411 | print('%s. ?' % (i,)) 412 | print() 413 | -------------------------------------------------------------------------------- /clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm devlog_*.log 4 | rm log_*.log 5 | rm lemmatisation_*.pkl 6 | rm weight_*.hdf5* -------------------------------------------------------------------------------- /create_dict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | from argparse import ArgumentParser 4 | 5 | from tqdm import tqdm 6 | import jsonlines 7 | 8 | from krnnt.new import preprocess_paragraph_preanalyzed, \ 9 | preprocess_paragraph_reanalyzed, serialize_sample_paragraph, create_dict 10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler 11 | from krnnt.structure import Paragraph 12 | 13 | if __name__ == '__main__': 14 | parser = ArgumentParser(description='Create dictionary of features') 15 | parser.add_argument('input_path', type=str, help='path to preprocessed data') 16 | parser.add_argument('output_path', type=str, help='save path') 17 | args = parser.parse_args() 18 | 19 | file = open(args.input_path, 'rb') 20 | su = SerialUnpickler(file) 21 | 22 | unique_dict = create_dict(su) 23 | 24 | with open(args.output_path, 'wb') as file: 25 | pickle.dump(unique_dict, file) 26 | 27 | with open(args.output_path+'.json','w') as file: 28 | json.dump(unique_dict, file, ensure_ascii=False) -------------------------------------------------------------------------------- /create_simple_lemmatization.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sgjp=sys.argv[1] 4 | 5 | print(sgjp) 6 | 7 | def base_tag(tag): 8 | transformations = { 9 | 'ger': [(['pl'],'sg'), 10 | (['gen','dat','acc','inst','loc','voc'],'nom')], 11 | 'pact': [(['pl'],'sg'), 12 | (['gen','dat','acc','inst','loc','voc'],'nom'), 13 | (['m2','m3','f','n'], 'm1')], 14 | 'ppas': [(['pl'],'sg'), 15 | (['gen','dat','acc','inst','loc','voc'],'nom'), 16 | (['m2','m3','f','n'], 'm1')], 17 | } 18 | 19 | tag=list(tag) 20 | 21 | if tag[0] not in transformations: return None 22 | 23 | transforms = transformations[tag[0]] 24 | for sources, target in transforms: 25 | for source in sources: 26 | try: 27 | index = tag.index(source) 28 | tag[index]=target 29 | break 30 | except ValueError: 31 | pass 32 | return tag 33 | 34 | 35 | 36 | 37 | import itertools 38 | import tqdm 39 | 40 | count=0 41 | 42 | 43 | lt={} 44 | for line in tqdm.tqdm(open(sgjp), total=7221123): 45 | row = line.split('\t')[:-1] 46 | # print(row) 47 | try: 48 | form, lemma, tag, other = row 49 | except ValueError: 50 | continue 51 | 52 | tags=[t.split('.') for t in tag.split(':')] 53 | for tag in itertools.product(*tags): 54 | if 
tag[0] in ['ger','ppas','pact']: 55 | btag=tuple(base_tag(tag)) 56 | # print(tag, btag, form, lemma) 57 | if btag == tag: 58 | count+=1 59 | lemma=lemma.rsplit(':')[0] 60 | lt[(lemma,tag)]=form 61 | 62 | print(count) 63 | 64 | import pickle 65 | pickle.dump(lt, open('data/ger_ppas_pact.pickle','wb')) -------------------------------------------------------------------------------- /export_data.py: -------------------------------------------------------------------------------- 1 | from krnnt.serial_pickle import SerialUnpickler 2 | from tqdm import tqdm 3 | 4 | from krnnt.structure import Paragraph 5 | 6 | 7 | #!/usr/bin/env python 8 | # -*- coding: utf-8 -*- 9 | from argparse import ArgumentParser 10 | 11 | from krnnt.serial_pickle import SerialUnpickler 12 | from krnnt.writers import get_output_converter 13 | 14 | 15 | def paragraph_to_result(paragraph: Paragraph): 16 | 17 | paragraph2=[] 18 | for sentence in paragraph: 19 | try: 20 | sentence2=[] 21 | paragraph2.append(sentence2) 22 | for token in sentence: 23 | sentence2.append({ 24 | 'token':token.form, 25 | 'sep':token.space_before, 26 | 'tag': token.gold_form.tags, 27 | 'lemmas': [token.gold_form.lemma], 28 | }) 29 | except AttributeError: #omit sentence if some token does no have gold tag 30 | continue 31 | return paragraph2 32 | 33 | if __name__ == '__main__': 34 | parser = ArgumentParser(description='Export data (before preprocessing) to format') 35 | parser.add_argument('input_path', help='input path to data') 36 | parser.add_argument('output_path', help='output path to data') 37 | parser.add_argument('-f','--format', default='txt', help='output format') 38 | 39 | args = parser.parse_args() 40 | 41 | with open(args.input_path, 'rb') as file: 42 | su = SerialUnpickler(file) 43 | 44 | converter=get_output_converter(args.format) 45 | 46 | string=converter((paragraph_to_result(paragraph_gold) for paragraph_gold in su)) 47 | 48 | with open(args.output_path, 'w') as output_file: 49 | output_file.write(string) 50 | 51 | 52 | -------------------------------------------------------------------------------- /graph_log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from argparse import ArgumentParser 4 | import matplotlib.pyplot as plt 5 | import re 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser(description='Plots data for graph') 9 | parser.add_argument('output_path', help='output path to features dict') 10 | parser.add_argument('input_path', help='input path to log data') 11 | 12 | 13 | args = parser.parse_args() 14 | 15 | 16 | test_scores=[] 17 | dev_scores=[] 18 | with open(args.input_path) as file: 19 | for line in file: 20 | m = re.search(r'\'val_score\', (.*?)\)', line) 21 | if m is None: 22 | continue 23 | test_score=float(m.group(1)) 24 | 25 | m = re.search(r'\'dev_val_score\', (.*?)\)', line) 26 | if m is None: 27 | continue 28 | dev_scores += (float(m.group(1)),) 29 | test_scores+=(test_score, ) 30 | 31 | t=range(len(test_scores)) 32 | plt.plot(test_scores) 33 | 34 | if any([score!=0.0 for score in dev_scores]): 35 | plt.plot(dev_scores) 36 | plt.ylabel('some numbers') 37 | plt.show() 38 | 39 | print('Test scores:') 40 | for score in test_scores: 41 | print(score) 42 | 43 | print('Dev scores:') 44 | for score in dev_scores: 45 | print(score) -------------------------------------------------------------------------------- /join_data.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from argparse import ArgumentParser 4 | 5 | from tqdm import tqdm 6 | 7 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser(description='Join data') 11 | parser.add_argument('output_path', help='output path to data') 12 | parser.add_argument('input_paths', nargs='+', help='input paths to data') 13 | 14 | args = parser.parse_args() 15 | 16 | sp = SerialPickler(open(args.output_path, 'wb')) 17 | for input_path in args.input_paths: 18 | su = SerialUnpickler(open(input_path, 'rb')) 19 | for paragraph in tqdm(su): 20 | sp.add(paragraph) 21 | sp.close() 22 | -------------------------------------------------------------------------------- /join_dicts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import pickle 4 | from argparse import ArgumentParser 5 | 6 | if __name__ == '__main__': 7 | parser = ArgumentParser(description='Join features dicts') 8 | parser.add_argument('output_path', help='output path to features dict') 9 | parser.add_argument('input_paths', nargs='+', help='input paths to features dicts') 10 | 11 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds') 12 | 13 | args = parser.parse_args() 14 | 15 | if args.reproducible: 16 | from numpy.random import seed 17 | 18 | seed(1337) 19 | import random as rn 20 | 21 | rn.seed(1337) 22 | 23 | print(args.input_paths) 24 | joined_unique_features_dict = None 25 | for input_path in args.input_paths: 26 | unique_features_dict = pickle.load(open(input_path, 'rb')) 27 | 28 | if joined_unique_features_dict is None: 29 | joined_unique_features_dict = unique_features_dict 30 | else: 31 | for feature_name, dict2 in unique_features_dict.items(): 32 | 33 | if feature_name not in joined_unique_features_dict: 34 | joined_index = 0 35 | else: 36 | joined_index = max(joined_unique_features_dict[feature_name].values()) + 1 37 | assert joined_index == len(joined_unique_features_dict[feature_name]) 38 | 39 | for value, index in sorted(dict2.items(), key=lambda x: x[1]): 40 | if value not in joined_unique_features_dict[feature_name]: 41 | joined_unique_features_dict[feature_name][value] = joined_index 42 | joined_index += 1 43 | 44 | with open(args.output_path, 'wb') as file: 45 | pickle.dump(joined_unique_features_dict, file) 46 | -------------------------------------------------------------------------------- /krnnt/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.getLogger(__name__).addHandler(logging.NullHandler()) -------------------------------------------------------------------------------- /krnnt/additional_format.py: -------------------------------------------------------------------------------- 1 | def additional_format(data, krnntx, morf): 2 | raise NotImplementedError() 3 | -------------------------------------------------------------------------------- /krnnt/aglt.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def startswith(token, prefixes): 5 | for prefix in prefixes: 6 | if token.lower().startswith(prefix.lower()): 7 | return True 8 | return False 9 | 10 | 11 | def praet_or_winien(tag): 12 | return startswith(tag, ['praet', 'winien']) 13 | 14 | 15 | def rule1(sentence): 16 | """ 17 | Find immediate aglt after praet. 
18 | """ 19 | result = [] 20 | 21 | for i, token in enumerate(sentence): 22 | 23 | separator = token['sep'] 24 | tag = token['tag'] 25 | 26 | if tag.startswith('aglt') and separator == 'none': 27 | if praet_or_winien(sentence[i - 1]['tag']): 28 | result.append((i, i - 1, None)) 29 | elif praet_or_winien(sentence[i - 2]['tag']) and sentence[i - 1]['token'] == 'by': 30 | if sentence[i - 1]['sep'] == 'none': 31 | result.append((i, i - 2, i - 1)) 32 | else: 33 | print('błąd?') 34 | return result 35 | 36 | def rule1b(sentence): 37 | """ 38 | Find immediate aglt after praet. 39 | """ 40 | result = [] 41 | 42 | for i, token in enumerate(sentence): 43 | 44 | 45 | tag = token['tag'] 46 | 47 | if praet_or_winien(tag): 48 | try: 49 | next_token=sentence[i+1] 50 | if next_token['tag'].startswith('aglt') and next_token['sep'] == 'none': 51 | result.append((i+1, i, None)) 52 | elif next_token['tag']=='qub' and next_token['token'] == 'by' and next_token['sep'] == 'none': 53 | try: 54 | next_next_token=sentence[i+2] 55 | if next_next_token['tag'].startswith('aglt') and next_next_token['sep'] == 'none': 56 | result.append((i+2, i , i + 1)) 57 | else: 58 | result.append((None, i, i + 1)) 59 | except IndexError: 60 | result.append((None, i, i + 1)) 61 | except IndexError: 62 | pass 63 | return result 64 | 65 | def rule3(sentence): 66 | """ 67 | Find aglt and then praet as successor. 68 | """ 69 | result = [] 70 | 71 | for i, token in enumerate(sentence): 72 | tag = token['tag'] 73 | if tag.startswith('aglt'): 74 | for j in range(i + 1, len(sentence)): 75 | token2 = sentence[j] 76 | if praet_or_winien(token2['tag']): 77 | by_index=None 78 | try: 79 | by_token = sentence[i-1] 80 | if by_token['tag']=='qub' and by_token['token']=='by': 81 | by_index=i-1 82 | except IndexError: 83 | pass 84 | result.append((i, j, by_index)) 85 | # print(sentence[i - 2:j + 2]) 86 | break 87 | elif tag == 'qub' and token['token']=='by': 88 | try: 89 | if not sentence[i+1]['tag'].startswith('aglt'): 90 | for j in range(i + 1, len(sentence)): 91 | token2 = sentence[j] 92 | if praet_or_winien(token2['tag']): 93 | result.append((None, j, i)) 94 | break 95 | except IndexError: 96 | pass 97 | 98 | return result 99 | 100 | 101 | def rewrite_praet(aglt_token, praet_token, by_token=None): 102 | """ 103 | Copy person from aglt to praet and change praet to cond. 
104 | """ 105 | praet_tags = list(praet_token['tag'].split(':')) 106 | 107 | # praet i aglt mają tę samą liczbę 108 | if aglt_token is not None: 109 | aglt_person = aglt_token['tag'].split(':')[2] 110 | if aglt_token['tag'].split(':')[1] != praet_tags[1]: 111 | logging.warning( 112 | 'DIFFERENT NUMBER: %s %s %s %s' % (aglt_token['tag'].split(':')[1], praet_tags[1], aglt_token, praet_token)) 113 | return 114 | praet_tags.insert(3, aglt_person) 115 | 116 | if by_token: 117 | praet_tags[0] = 'cond' 118 | if aglt_token is None: 119 | praet_tags.insert(3, 'ter') 120 | 121 | praet_token['tag'] = ':'.join(praet_tags) 122 | 123 | 124 | def remove_tokens(sentence, aglt_indexes): 125 | for i in sorted(aglt_indexes, reverse=True): 126 | token = sentence[i] 127 | 128 | 129 | #dołącz do formy poprzedzającego tokenu i popraw offsety 130 | if token['sep']=='none': 131 | previous_token = sentence[i-1] 132 | previous_token['end']=token['end'] 133 | previous_token['token'] += token['token'] 134 | sentence.pop(i) 135 | 136 | def remove_aglt(sentence, rules): 137 | for rule_index, rule in enumerate(rules): 138 | pairs = rule(sentence) 139 | 140 | for aglt_index, praet_index, by_index in pairs: 141 | if by_index is None: 142 | by_token = None 143 | else: 144 | by_token = sentence[by_index] 145 | 146 | if aglt_index is None: 147 | aglt_token = None 148 | else: 149 | aglt_token = sentence[aglt_index] 150 | rewrite_praet(aglt_token, sentence[praet_index], by_token) 151 | 152 | aglt_indexes = [aglt_index for aglt_index, praet_index, by_index in pairs] + [by_index for 153 | aglt_index, praet_index, by_index 154 | in pairs] 155 | aglt_indexes = [x for x in aglt_indexes if x is not None] 156 | remove_tokens(sentence, aglt_indexes) 157 | 158 | 159 | def remove_aglt_from_results(results, rules): 160 | for paragraph in results: 161 | for sentence in paragraph: 162 | remove_aglt(sentence, rules) 163 | return results 164 | 165 | 166 | def remove_aglt_from_results_rule1_3(results): 167 | return remove_aglt_from_results(results, [rule1b, rule3]) 168 | -------------------------------------------------------------------------------- /krnnt/aligner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | 4 | from krnnt.structure import Token, Sentence, Paragraph 5 | 6 | 7 | def text(buffer): 8 | return ''.join([' '+token.form if (token.space_before is True or (token.space_before is not False and token.space_before!='none')) else token.form for token in buffer]) 9 | 10 | def align(pred, ref, ref_text_old=''): 11 | pred_buffer = [pred.pop(0)] 12 | ref_buffer = [ref.pop(0)] 13 | if ref_text_old: 14 | t = Token() 15 | t.form = ref_text_old 16 | t.space_before=False 17 | ref_buffer.insert(0, t) 18 | 19 | while pred_buffer or ref_buffer: 20 | pred_text = text(pred_buffer) 21 | ref_text = text(ref_buffer) 22 | # print("BUFFERS: [%s] [%s]" % ([str(x) for x in pred_buffer], [str(x) for x in ref_buffer])) 23 | # print("BUFFERS: [%s] [%s]" % (pred_text, ref_text)) 24 | if len(pred_text) == len(ref_text): # aligned 25 | if pred_text != ref_text: 26 | print('alignment ERROR', pred_text, ref_text, ref, pred, file=sys.stderr) 27 | logging.error("alignment ERROR") 28 | yield (pred_buffer, ref_buffer, ref_text[len(pred_text):]) 29 | 30 | pred_buffer=[] 31 | ref_buffer = [] 32 | 33 | #print(pred) 34 | if not pred or not ref: 35 | #print('break', pred) 36 | break 37 | 38 | pred_buffer = [pred.pop(0)] 39 | ref_buffer = [ref.pop(0)] 40 | elif len(pred_text) < len(ref_text): 41 | if 
pred: 42 | pred_buffer.append(pred.pop(0)) 43 | else: 44 | 45 | print('break2', pred_text, ref_text, ref_text[len(pred_text):], file=sys.stderr) 46 | 47 | # print([x.form for x in pred_buffer]) 48 | # print([x.space_before for x in pred_buffer]) 49 | # print([x.form for x in ref_buffer]) 50 | # print([x.space_before for x in ref_buffer]) 51 | 52 | #skroc ref_buffer 53 | asd=[] 54 | for x in ref_buffer: 55 | asd.append(x) 56 | if len(pred_text) >= len(text(asd)): 57 | 58 | break 59 | ref_buffer=asd 60 | if len(pred_text) < len(text(asd)): 61 | print('RRRR', asd[-1].form, file=sys.stderr) 62 | asd[-1].form = asd[-1].form[:len(pred_text)-1] 63 | print('RRRR', asd[-1].form, file=sys.stderr) 64 | print(text(ref_buffer), 'XXX', text(ref), file=sys.stderr) 65 | 66 | 67 | break 68 | else: 69 | if ref: 70 | ref_buffer.append(ref.pop(0)) 71 | else: 72 | print('break3', file=sys.stderr) 73 | break 74 | 75 | rest = ref_buffer # + ref 76 | if rest: 77 | yield (pred_buffer+pred, rest, ref_text[len(pred_text):]) 78 | # print('rest', pred, ref) 79 | 80 | def align_paragraphs(paragraph_reanalyzed: Paragraph, paragraph_gold: Paragraph) -> Paragraph: 81 | tokens_gold = [] 82 | for sentence_gold in paragraph_gold: 83 | for token_gold in sentence_gold: 84 | tokens_gold.append(token_gold) 85 | token_gold.form = token_gold.form.replace('\xa0', ' ') # "a j e n t a" 86 | 87 | 88 | ref_text_old = '' 89 | paragraph_reanalyzed.concraft = [] 90 | for sentence_reanalyzed in paragraph_reanalyzed: 91 | # print('XXXXXXXXXXXXXXXXXXXXXXXXXXXNEW') 92 | sentence_reanalyzed_gold = Sentence() 93 | paragraph_reanalyzed.concraft.append(sentence_reanalyzed_gold) 94 | for p, r, ref_text_old in align([token for token in sentence_reanalyzed.tokens], tokens_gold, ref_text_old): 95 | 96 | if p: 97 | for r1 in r: 98 | sentence_reanalyzed_gold.add_token(r1) 99 | if text(p) != text(r): 100 | print('ERR', [t.form for t in p], [t.form for t in r], file=sys.stderr) 101 | # if len(p)!=len(r): 102 | # print(text(p),'_____', text(r)) 103 | # print(len(tokens_gold)) 104 | if len(p) == len(r): 105 | for p1, r1 in zip(p, r): 106 | p1.gold_form = r1.gold_form 107 | return paragraph_reanalyzed 108 | 109 | def align_paragraphs2(paragraph_reanalyzed: Paragraph, paragraph_gold: Paragraph) -> Paragraph: 110 | tokens_gold = [] 111 | for sentence_gold in paragraph_gold: 112 | for token_gold in sentence_gold: 113 | tokens_gold.append(token_gold) 114 | token_gold.form = token_gold.form.replace('\xa0', ' ') # "a j e n t a" 115 | 116 | 117 | ref_text_old = '' 118 | paragraph_reanalyzed.concraft = [] 119 | for sentence_reanalyzed in paragraph_reanalyzed: 120 | # print('XXXXXXXXXXXXXXXXXXXXXXXXXXXNEW') 121 | sentence_reanalyzed_gold = Sentence() 122 | paragraph_reanalyzed.concraft.append(sentence_reanalyzed_gold) 123 | for p, r, ref_text_old in align([token for token in sentence_reanalyzed.tokens], tokens_gold, ref_text_old): 124 | 125 | if p: 126 | for r1 in r: 127 | sentence_reanalyzed_gold.add_token(r1) 128 | if text(p) != text(r): 129 | print('ERR', [t.form for t in p], [t.form for t in r], file=sys.stderr) 130 | # if len(p)!=len(r): 131 | # print(text(p),'_____', text(r)) 132 | # print(len(tokens_gold)) 133 | if len(p) == len(r): 134 | for p1, r1 in zip(p, r): 135 | p1.interpretations = r1.interpretations 136 | return paragraph_reanalyzed -------------------------------------------------------------------------------- /krnnt/analyzers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 
import sys 4 | from subprocess import PIPE, Popen 5 | from typing import Iterable, Generator, List, Tuple 6 | 7 | from krnnt.structure import Form, Token, Sentence, Paragraph 8 | from krnnt.utils import uniq 9 | 10 | try: 11 | from maca_analyse import maca_analyse 12 | except ImportError: 13 | pass 14 | 15 | 16 | # TODO morfeusz analyzer for pretokenized? 17 | 18 | class MacaAnalyzer: 19 | def __init__(self, maca_config: str, toki_config_path: str = ''): 20 | self.maca_config = maca_config 21 | self.toki_config_path = toki_config_path 22 | self.configure() 23 | 24 | def _maca(self, text: str) -> Generator[str, None, None]: 25 | """ 26 | Yields output of Maca by sentences, 27 | """ 28 | raise NotImplementedError() 29 | 30 | def configure(self): 31 | if 'maca_analyse' in sys.modules: 32 | self._maca = self._maca_wrapper 33 | else: 34 | self._maca = self._maca_process 35 | 36 | def analyze(self, text: str) -> Paragraph: 37 | results = self._maca(text) 38 | 39 | paragraph_reanalyzed = Paragraph() 40 | for i, res in enumerate(results): 41 | result = self._parse(res) 42 | sentence_reanalyzed = Sentence() 43 | paragraph_reanalyzed.add_sentence(sentence_reanalyzed) 44 | for form, space_before, interpretations, start, end in result: 45 | token_reanalyzed = Token() 46 | sentence_reanalyzed.add_token(token_reanalyzed) 47 | token_reanalyzed.form = form 48 | token_reanalyzed.space_before = space_before # != 'none' 49 | interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t) for l, t in 50 | interpretations] # remove senses 51 | token_reanalyzed.interpretations = [Form(l.replace('_', ' '), t) for l, t in uniq(interpretations)] 52 | token_reanalyzed.start = start 53 | token_reanalyzed.end = end 54 | return paragraph_reanalyzed 55 | 56 | def _maca_process(self, text: str) -> Generator[str, None, None]: 57 | cmd = ['maca-analyse', '-c', self.maca_config, '-l'] #TODO: -l ? 58 | if self.toki_config_path: 59 | cmd.extend(['--toki-config-path', self.toki_config_path]) 60 | p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE) 61 | 62 | self.text = text 63 | # self.text = '\n'.join(batch) 64 | self.last_offset = 0 65 | 66 | stdout = p.communicate(input=self.text.encode('utf-8'))[0] 67 | try: 68 | p.stdin.close() 69 | except BrokenPipeError: 70 | pass 71 | p.wait() 72 | if p.returncode != 0: 73 | raise Exception('Maca is not working properly') 74 | for i in stdout.decode('utf-8').split('\n\n'): 75 | if len(i) > 0: 76 | yield i 77 | 78 | def _maca_wrapper(self, text: str) -> Generator[str, None, None]: 79 | # self.text = '\n'.join(batch) 80 | self.text = text 81 | self.last_offset = 0 82 | 83 | output_text = maca_analyse(self.maca_config, self.toki_config_path, self.text, False, False) 84 | 85 | for i in output_text.split('\n\n'): 86 | if len(i) > 0: 87 | yield i 88 | 89 | def _parse(self, output: str) -> List[Tuple[str, str, List[Tuple[str, str]], int, int]]: 90 | """ 91 | Parses one sentence output of Maca. 
92 | """ 93 | data = [] 94 | lemma_lines = [] 95 | token_line = None 96 | for line in output.split("\n"): 97 | if line.startswith("\t"): 98 | lemma_lines.append(line) 99 | else: 100 | if token_line is not None: 101 | data.append((token_line, lemma_lines)) 102 | lemma_lines = [] 103 | token_line = line 104 | data.append((token_line, lemma_lines)) 105 | 106 | tokens = [] 107 | 108 | for index, (token_line, lemma_lines) in enumerate(data): 109 | token = self._construct(token_line, lemma_lines) # 80% 110 | if token is None: continue 111 | form, space_before, interpretations = token 112 | start = self.text.index(form, self.last_offset) 113 | end = start + len(form) 114 | self.last_offset = end 115 | tokens.append((form, space_before, interpretations, start, end)) 116 | 117 | return tokens 118 | 119 | def _construct(self, token_line: str, lemma_lines: Iterable[str]) -> Tuple[str, str, List[Tuple[str, str]]]: 120 | try: 121 | if token_line == '': return None 122 | form, space_before = token_line.split("\t") 123 | except ValueError: 124 | logging.exception("Probably Maca is not working.") 125 | raise Exception('Probably Maca is not working.') 126 | 127 | interpretations = [] 128 | 129 | for lemma_line in lemma_lines: 130 | row = lemma_line.strip().split("\t") 131 | try: 132 | lemma, tags, _ = row # 30% 133 | # disamb = True 134 | except ValueError: 135 | lemma, tags = row # 16% 136 | # disamb = False 137 | interpretation = (lemma, tags) 138 | # lemma.disamb=disamb 139 | interpretations.append(interpretation) 140 | 141 | return form, space_before, interpretations 142 | -------------------------------------------------------------------------------- /krnnt/blanks.py: -------------------------------------------------------------------------------- 1 | def remove_blanks_from_results(results): 2 | for paragraph in results: 3 | for sentence in paragraph: 4 | remove_blanks(sentence) 5 | return results 6 | 7 | def remove_blanks(sentence): 8 | """ 9 | 10 | """ 11 | result = [] 12 | 13 | i=1 14 | while i List[str]: 29 | return ['NIC'] 30 | 31 | @staticmethod 32 | def interps(form, features) -> List[str]: 33 | if 'interp' in features['tags'] and len(form) == 1: 34 | return [form] 35 | else: 36 | return [] 37 | 38 | @staticmethod 39 | def qubliki(form, features=None) -> List[str]: 40 | if form.lower() in FeaturePreprocessor.qubs: 41 | return [form.lower()] #TODO: form.lower() 42 | else: 43 | return [] 44 | 45 | @staticmethod 46 | def shape(form, features=None) -> List[str]: 47 | # print(form, shape(form)) 48 | return [shape(form)] 49 | 50 | @staticmethod 51 | def prefix(n, form, features=None) -> List[str]: 52 | try: 53 | char = form[n].lower() 54 | if char not in FeaturePreprocessor.safe_chars: 55 | char = '??' 56 | except IndexError: 57 | char = 'xx' 58 | 59 | return ['P' + str(n) + char] 60 | 61 | @staticmethod 62 | def prefix1(form, features=None) -> List[str]: 63 | return FeaturePreprocessor.prefix(0, form, features) 64 | 65 | @staticmethod 66 | def prefix2(form, features=None) -> List[str]: 67 | return FeaturePreprocessor.prefix(1, form, features) 68 | 69 | @staticmethod 70 | def prefix3(form, features=None) -> List[str]: 71 | return FeaturePreprocessor.prefix(2, form, features) 72 | 73 | @staticmethod 74 | def suffix(n, form, features=None) -> List[str]: 75 | try: 76 | char = form[-n].lower() 77 | if char not in FeaturePreprocessor.safe_chars: 78 | char = '??' 
79 | except IndexError: 80 | char = 'xx' 81 | 82 | return ['S' + str(n) + char] 83 | 84 | @staticmethod 85 | def suffix1(form, features=None) -> List[str]: 86 | return FeaturePreprocessor.suffix(1, form, features) 87 | 88 | @staticmethod 89 | def suffix2(form, features=None) -> List[str]: 90 | return FeaturePreprocessor.suffix(2, form, features) 91 | 92 | @staticmethod 93 | def suffix3(form, features=None) -> List[str]: 94 | return FeaturePreprocessor.suffix(3, form, features) 95 | 96 | 97 | class TagsPreprocessorCython: 98 | @staticmethod 99 | def create_tags4_without_guesser(tags, features=None) -> List[str]: 100 | return krnnt_utils.create_tags4_without_guesser(tags) 101 | 102 | @staticmethod 103 | def create_tags5_without_guesser(tags, features=None) -> List[str]: 104 | return krnnt_utils.create_tags5_without_guesser(tags) 105 | 106 | 107 | class TagsPreprocessor: 108 | cas = ['nom', 'gen', 'dat', 'acc', 'inst', 'loc', 'voc'] 109 | per = ['pri', 'sec', 'ter'] 110 | nmb = ['sg', 'pl'] 111 | gnd = ['m1', 'm2', 'm3', 'f', 'n'] 112 | 113 | @staticmethod 114 | def create_tags4(tags, features=None, keep_guesser=True) -> List[str]: # concraft 115 | if not keep_guesser and 'ign' in tags: 116 | return ['ign'] 117 | # return ['1ign','2ign','1subst:nom','2subst:sg:f','1adj:nom','1subst:gen','2subst:sg:n','2subst:sg:m1','2adj:sg:m3:pos','2subst:sg:m3','1num:acc','2num:pl:m3:rec','1brev','2adj:sg:n:pos','2num:pl:m3:congr','1num:nom','1adj:gen','1adj:loc'] 118 | return uniq(flatten(map(lambda tag: TagsPreprocessor.create_tag4(tag), tags))) 119 | 120 | @staticmethod 121 | def create_tags4_without_guesser(tags, features=None) -> List[str]: 122 | return TagsPreprocessor.create_tags4(tags, features=features, keep_guesser=False) 123 | 124 | @staticmethod 125 | def create_tag4(otag, features=None) -> List[str]: 126 | tags = flatten(map(lambda x: x.split('.'), otag.split(':'))) 127 | pos = tags[0] 128 | tags = tags[1:] 129 | tags2 = [] 130 | 131 | first = None 132 | for tag in tags: 133 | if tag in TagsPreprocessor.cas or tag in TagsPreprocessor.per: 134 | first = tag 135 | break 136 | 137 | if first: 138 | tags.remove(first) 139 | tags2.append('1' + pos + ':' + first) 140 | else: 141 | tags2.append('1' + pos) # TODO sprawdzic 142 | 143 | tags2.append('2' + (':'.join([pos] + tags))) 144 | 145 | # print otag, tags2 146 | return uniq(tags2) 147 | 148 | @staticmethod 149 | def create_tags5(tags, features=None, keep_guesser=True) -> List[str]: # concraft 150 | if not keep_guesser and 'ign' in tags: 151 | return ['ign'] 152 | # return ['ign','sg:loc:m3','sg:nom:n','pl:nom:m3','pl:acc:m3','loc','sg:gen:m3','pl:gen:m3','sg:nom:m1','sg:nom:m3','gen','nom','acc','sg:nom:f'] 153 | 154 | return uniq(flatten(map(lambda tag: TagsPreprocessor.create_tag5(tag), tags))) 155 | 156 | @staticmethod 157 | def create_tags5_without_guesser(tags, features=None) -> List[str]: 158 | return TagsPreprocessor.create_tags5(tags, features=features, keep_guesser=False) 159 | 160 | @staticmethod 161 | def create_tag5(otag, features=None) -> List[str]: 162 | 163 | tags = flatten(map(lambda x: x.split('.'), otag.split(':'))) 164 | 165 | tags_out = [] 166 | tags2 = [] 167 | tags3 = [] 168 | for tag in tags: 169 | if tag in TagsPreprocessor.nmb: 170 | tags2.append(tag) 171 | elif tag in TagsPreprocessor.cas: 172 | tags2.append(tag) 173 | tags3.append(tag) 174 | elif tag in TagsPreprocessor.gnd: 175 | tags2.append(tag) 176 | 177 | for tagsX in [tags2, tags3]: 178 | if tagsX: 179 | tags_out.append(':'.join(tagsX)) 180 | 181 | return 
uniq(tags_out) 182 | 183 | def create_token_features(token, tags, space_before) -> List[str]: #TODO 184 | f = [] 185 | f+=FeaturePreprocessor.interps(token, {'tags':tags}) 186 | f+=FeaturePreprocessor.qubliki(token) 187 | f+=FeaturePreprocessor.shape(token) # 90% 188 | f+=FeaturePreprocessor.prefix1(token) 189 | f+=FeaturePreprocessor.prefix2(token) 190 | f+=FeaturePreprocessor.prefix3(token) 191 | f+=FeaturePreprocessor.suffix1(token) 192 | f+=FeaturePreprocessor.suffix2(token) 193 | f+=FeaturePreprocessor.suffix3(token) 194 | f+=TagsPreprocessorCython.create_tags4_without_guesser( 195 | tags) # 3% moze cache dla wszystkich tagów 196 | f+=TagsPreprocessorCython.create_tags5_without_guesser(tags) # 3% 197 | f+=space_before 198 | 199 | return f -------------------------------------------------------------------------------- /krnnt/keras_models.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import uuid 3 | from typing import Dict 4 | 5 | import keras 6 | from keras.layers import Dense, Dropout, Input, GRU, TimeDistributed, \ 7 | Masking 8 | from keras.layers.wrappers import Bidirectional 9 | from keras.models import Model 10 | 11 | 12 | class ExperimentParameters: 13 | def __init__(self, pref: Dict, testing=False): 14 | self.pref = pref.copy() 15 | if testing: 16 | pass # TODO self.h 17 | else: 18 | if 'h' not in self.pref: 19 | self.pref['h'] = str(uuid.uuid1()) 20 | self.h = self.pref['h'] 21 | self.pref['weight_path'] = 'weight_' + self.h + '.hdf5' 22 | self.pref['lemmatisation_path'] = 'lemmatisation_' + self.h + '.pkl' 23 | 24 | def save_prefs(self): 25 | # TODO 26 | print(self.pref) 27 | 28 | 29 | class KerasModel: 30 | model: Model 31 | 32 | def __init__(self, parameters: ExperimentParameters): 33 | self.parameters = parameters 34 | 35 | def compile(self): 36 | logging.info('Model compiling') 37 | self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy']) 38 | logging.info('Model compiled') 39 | 40 | def make_predict_func(self): 41 | self.model._make_predict_function() 42 | 43 | def load_weights(self, path): 44 | self.model.load_weights(path) 45 | logging.info('Weights loaded') 46 | 47 | def load_model(self, path): 48 | self.model = keras.models.load_model(path) 49 | 50 | def yaml_model(self): 51 | model_yaml = self.model.to_yaml() 52 | # TODO 53 | return model_yaml 54 | 55 | def create_model(self): 56 | raise NotImplementedError 57 | 58 | 59 | class BEST(KerasModel): 60 | def __init__(self, parameters): 61 | super().__init__(parameters) 62 | 63 | def create_model(self): 64 | features_length = self.parameters.pref['features_length'] 65 | 66 | inputs = Input(shape=(None, features_length)) 67 | x = inputs 68 | x = Masking(mask_value=0., input_shape=(None, features_length))(x) 69 | x = Bidirectional( 70 | GRU(self.parameters.pref['internal_neurons'], return_sequences=True, dropout=0.0, recurrent_dropout=0.5, 71 | implementation=1), input_shape=(None, features_length))(x) 72 | x = Bidirectional( 73 | GRU(self.parameters.pref['internal_neurons'], return_sequences=True, dropout=0.0, recurrent_dropout=0.5, 74 | implementation=1), input_shape=(None, features_length))(x) 75 | x = Dropout(0.5)(x) 76 | x = TimeDistributed(Dense(self.parameters.pref['output_length'], activation='softmax'))(x) 77 | 78 | self.model = Model(inputs=inputs, outputs=x) 79 | 80 | self.loss = 'categorical_crossentropy' 81 | self.optimizer = keras.optimizers.Nadam() 82 | 
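The following is an illustrative usage sketch, not part of keras_models.py: it only shows how the ExperimentParameters and BEST classes above fit together. The internal_neurons, features_length and output_length values are placeholders; in the actual pipeline these sizes come from the training preferences and the UniqueFeaturesValues feature dictionary.

# Hedged sketch with assumed placeholder sizes (not taken from the repository).
pref = {
    'internal_neurons': 256,   # assumed GRU width
    'features_length': 100,    # length of the k-hot input feature vector (placeholder)
    'output_length': 50,       # number of output tag classes (placeholder)
}
params = ExperimentParameters(pref)  # also generates the weight_*.hdf5 / lemmatisation_*.pkl paths
net = BEST(params)
net.create_model()                   # builds the two-layer bidirectional GRU tagger defined above
net.compile()                        # categorical cross-entropy + Nadam, as set in create_model()
net.model.summary()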
-------------------------------------------------------------------------------- /krnnt/pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import math 3 | import pickle 4 | import re 5 | import sys 6 | from typing import List, Iterable, Generator, Union 7 | 8 | from krnnt.analyzers import MacaAnalyzer 9 | from krnnt.structure import Paragraph 10 | 11 | from .keras_models import ExperimentParameters, KerasModel 12 | from krnnt.utils import uniq 13 | from .new import k_hot, UniqueFeaturesValues, Lemmatisation, Lemmatisation2 14 | from krnnt.features import create_token_features 15 | 16 | sys.setrecursionlimit(10000) 17 | 18 | from keras.preprocessing import sequence 19 | import numpy as np 20 | import krnnt_utils 21 | 22 | 23 | class KRNNTSingle: 24 | def __init__(self, pref): 25 | self.pref = pref 26 | self.unique_features_dict = pickle.load(open(pref['UniqueFeaturesValues'], 'rb')) 27 | self.km = KerasThread.create_model(pref, testing=True) 28 | self.lemmatisation = pref['lemmatisation_class']() 29 | self.lemmatisation.load(pref['lemmatisation_path']) 30 | 31 | self.configure() 32 | 33 | def tag_sentence(self, sentence: str, preana=False): 34 | return self.__tag([sentence], preana) 35 | 36 | def tag_sentences(self, sentences: List[str], preana=False): 37 | return self.__tag(sentences, preana) 38 | 39 | def tag_sentences_preana(self, sentences: List[Paragraph]): 40 | return self.__tag(sentences, preana=True) 41 | 42 | def tag_paragraphs(self, paragraphs: Iterable[str], preana=False): 43 | return self.__tag_paragraphs(paragraphs, preana) 44 | 45 | def __tag_paragraphs(self, paragraphs: Iterable[str], preana): 46 | 47 | 48 | if preana: 49 | sequences = Preprocess.process_batch_preana(enumerate(paragraphs)) 50 | else: 51 | sequences = Preprocess.process_batch(paragraphs, self.pref['maca_config'], self.pref['toki_config_path']) 52 | 53 | # batch_size=math.ceil(len_sequences/max(math.floor(len_sequences/self.pref['keras_batch_size']), 1)) # dynamic batch 54 | 55 | result = [] 56 | for batch in chunk(sequences, self.pref['keras_batch_size']): 57 | pad_batch = self.pad(batch, self.unique_features_dict, 'tags4e3') 58 | preds = self.km.model.predict_on_batch(pad_batch) 59 | for plain in KerasThread.return_results(batch, preds, self.km.classes, self.lemmatisation): 60 | result.append(plain) 61 | 62 | 63 | # podziel na paragrafy 64 | result2=[] 65 | result_paragraph=[] 66 | for sentence in result: 67 | if not result_paragraph or sentence[0]['document_id']==result_paragraph[-1][0]['document_id']: 68 | result_paragraph+= (sentence, ) 69 | else: 70 | result2+=(result_paragraph,) 71 | result_paragraph=[sentence] 72 | 73 | if result_paragraph: 74 | result2 += (result_paragraph,) 75 | 76 | return result2 77 | 78 | def configure(self): 79 | if 'krnnt_utils' in sys.modules: 80 | self.pad = krnnt_utils.pad 81 | else: 82 | self.pad = Preprocess.pad 83 | 84 | def __tag(self, sentences: List[str], preana: bool): 85 | if preana: 86 | sequences = Preprocess.process_batch_preana(enumerate(sentences)) 87 | else: 88 | sequences = Preprocess.process_batch(sentences, self.pref['maca_config'], self.pref['toki_config_path']) 89 | 90 | # batch_size=math.ceil(len_sequences/max(math.floor(len_sequences/self.pref['keras_batch_size']), 1)) # dynamic batch 91 | 92 | result = [] 93 | for batch in chunk(sequences, self.pref['keras_batch_size']): 94 | pad_batch = self.pad(batch, self.unique_features_dict, 'tags4e3') 95 | preds = 
self.km.model.predict_on_batch(pad_batch) 96 | for plain in KerasThread.return_results(batch, preds, self.km.classes, self.lemmatisation): 97 | result.append(plain) 98 | 99 | return result 100 | 101 | 102 | class Sample: 103 | def __init__(self): 104 | self.features = {} 105 | 106 | 107 | class Preprocess: 108 | @staticmethod 109 | def create_features(sequence: List[Sample]): 110 | for sample in sequence: 111 | sample.features['tags4e3'] = create_token_features(sample.features['token'], sample.features['tags'], 112 | sample.features['space_before']) 113 | 114 | @staticmethod 115 | def process_batch(documents: Iterable[str], maca_config: str, toki_config_path: str) -> Generator[ 116 | List[Sample], None, None]: 117 | maca_analyzer = MacaAnalyzer(maca_config, toki_config_path) 118 | 119 | for document_id, document in enumerate(documents): 120 | results = maca_analyzer._maca(document) 121 | 122 | for res in results: 123 | result = maca_analyzer._parse(res) 124 | 125 | sequence = [] 126 | for form, space_before, interpretations, start, end in result: 127 | sample = Sample() 128 | sequence.append(sample) 129 | sample.features['token'] = form 130 | sample.features['tags'] = uniq([t for l, t in interpretations]) 131 | interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t) for l, t in 132 | interpretations] 133 | sample.features['maca_lemmas'] = [(l.replace('_', ' '), t) for l, t in uniq(interpretations)] 134 | 135 | # TODO: cleanup space before 136 | sample.features['space_before'] = ['space_before'] if space_before !='none' else [ 137 | 'no_space_before'] 138 | sample.features['space_before'].append(space_before) 139 | sample.features['start'] = start 140 | sample.features['end'] = end 141 | sample.features['document_id'] = document_id 142 | Preprocess.create_features(sequence) 143 | 144 | if sequence: 145 | yield sequence 146 | 147 | @staticmethod 148 | def process_batch_preana(batch: Iterable[Paragraph]) -> Generator[List[Sample], None, None]: 149 | for document_id, paragraph in batch: 150 | for sentence in paragraph: 151 | sequence = [] 152 | for token in sentence: 153 | sample = Sample() 154 | sequence.append(sample) 155 | sample.features['token'] = token.form 156 | sample.features['tags'] = uniq([form.tags for form in token.interpretations]) 157 | sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations]) 158 | sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before'] 159 | sample.features['space_before'].append(token.space_before) 160 | sample.features['document_id'] = document_id 161 | Preprocess.create_features(sequence) 162 | 163 | if sequence: 164 | yield sequence 165 | 166 | @staticmethod 167 | def pad(batch: List[List[Sample]], unique_features_dict, feature_name: str): 168 | if not batch: 169 | return [] 170 | 171 | result_batchX = [] 172 | for sentence in batch: 173 | X_sentence = [] 174 | for sample in sentence: 175 | X_sentence.append(np.array(k_hot(sample.features[feature_name], unique_features_dict[feature_name]))) 176 | 177 | result_batchX.append(X_sentence) 178 | 179 | return sequence.pad_sequences(result_batchX) 180 | 181 | 182 | def chunk(l: Iterable, batch_size: int) -> List: 183 | batch = [] 184 | for element in l: 185 | batch.append(element) 186 | if len(batch) == batch_size: 187 | yield batch 188 | batch = [] 189 | if batch: 190 | yield batch 191 | 192 | 193 | class KerasThread(): 194 | 195 | @staticmethod 196 | def create_model(pref, testing=False) -> KerasModel: 197 | 
keras_model_class = pref['keras_model_class'] 198 | 199 | parameters = ExperimentParameters(pref, testing) 200 | 201 | km = keras_model_class(parameters) 202 | 203 | if 'UniqueFeaturesValues' in pref: 204 | km.unique_features_dict = pickle.load(open(pref['UniqueFeaturesValues'], 'rb')) 205 | else: 206 | # data_path = 'nkjp_paragraphs_shuffled_concraft.spickle_FormatData_PreprocessData' 207 | data_path = pref['data_path'] 208 | km.unique_features_dict = UniqueFeaturesValues(data_path).get() 209 | 210 | unique_tags_dict = km.unique_features_dict[pref['label_name']] 211 | km.classes = list(map(lambda k: k[0], sorted(unique_tags_dict.items(), key=lambda k: k[1]))) 212 | pref = km.parameters.pref 213 | pref['features_length'] = len(km.unique_features_dict[pref['feature_name']]) 214 | pref['output_length'] = len(km.unique_features_dict[pref['label_name']]) 215 | 216 | km.create_model() 217 | # self.km.load_weights('weight_7471898792961270266.hdf5') 218 | # km.load_weights('weight_7471898792961270266.hdf5') 219 | # km.load_weights('../artykul/compare/train_on_all.weights') 220 | km.load_weights(pref['weight_path']) 221 | km.compile() 222 | 223 | return km 224 | 225 | @staticmethod 226 | def return_results(sentences: List[List[Sample]], preds, classes: List[str], 227 | lemmatisation: Union[Lemmatisation, Lemmatisation2]): 228 | for sentence, preds2 in zip(sentences, preds): # TODO sentences 229 | # print(preds2.shape) 230 | # print(preds2) 231 | 232 | response = [] 233 | 234 | preds3 = preds2.argmax(axis=-1) 235 | preds3max = preds2.max(axis=-1) 236 | # print(len(sentence), len(preds3)) 237 | first = True 238 | for sample, max_index, prob in zip(sentence, list(preds3)[-len(sentence):], 239 | list(preds3max)[-len(sentence):]): 240 | # print(sample.features, max_index) 241 | # max_index, max_value = max(enumerate(d), key=lambda x: x[1]) 242 | 243 | token_response = {} 244 | response.append(token_response) 245 | predicted_tag = classes[max_index] 246 | 247 | # TODO 248 | if sample.features['space_before'] == ['space_before']: 249 | sep = 'space' 250 | else: 251 | sep = 'none' 252 | 253 | if 'newline' in sample.features['space_before'] or 'newlines' in sample.features['space_before']: 254 | sep = 'newline' 255 | elif 'space' in sample.features['space_before'] or 'spaces' in sample.features['space_before']: 256 | sep = 'space' 257 | elif 'none' in sample.features['space_before']: 258 | sep = 'none' 259 | 260 | # print(sample.features['token']+'\t'+sep) 261 | # response.append(sample.features['token']+'\t'+sep) 262 | token_response['token'] = sample.features['token'] 263 | token_response['sep'] = sep 264 | token_response['prob'] = float(prob) 265 | token_response['document_id'] = sample.features['document_id'] 266 | 267 | lemmas = [x for x in sample.features['maca_lemmas']] 268 | token_response['tag'] = predicted_tag 269 | token_response['lemmas'] = [] 270 | try: 271 | token_response['start'] = sample.features['start'] 272 | token_response['end'] = sample.features['end'] 273 | except KeyError: 274 | token_response['start'] = None 275 | token_response['end'] = None 276 | 277 | # if not lemmas: 278 | # lemmas.append((sample.features['token'], predicted_tag)) 279 | lemma = lemmatisation.disambiguate(token_response['token'], lemmas, predicted_tag) 280 | 281 | token_response['lemmas'].append(lemma) 282 | 283 | # if lemmas: 284 | # for l, t in lemmas: 285 | # #print('\t'+l+'\t'+t+'\tdisamb') 286 | # #response.append('\t'+l+'\t'+t+'\tdisamb') 287 | # token_response['lemmas'].append(l) 288 | # else: 289 | # 
#print('\t'+sample.features['token']+'\t'+predicted_tag+'\tdisamb') 290 | # #response.append('\t'+sample.features['token']+'\t'+predicted_tag+'\tdisamb') 291 | # token_response['lemmas'].append(sample.features['token']) 292 | 293 | first = False 294 | # print() 295 | # response.append('') 296 | 297 | yield response 298 | -------------------------------------------------------------------------------- /krnnt/readers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from typing import Generator 4 | from xml.etree import ElementTree as ET 5 | 6 | import jsonlines 7 | 8 | from krnnt.structure import Paragraph, Sentence, Token, Form 9 | 10 | 11 | def read_xces(file_path: str) -> Paragraph: 12 | paragraphs_defined = True 13 | ns=False #no separator 14 | first_chunk=True 15 | 16 | for event, elem in ET.iterparse(file_path, events=("start","end",)): 17 | if first_chunk and event=="start" and elem.tag in ('chunk','sentence'): 18 | if elem.get('type') == 's' or elem.tag =='sentence': 19 | paragraphs_defined = False 20 | first_chunk=False 21 | elif event=="end" and elem.tag in ('chunk','sentence'): 22 | xml_sentences=[] 23 | paragraph=Paragraph() 24 | if paragraphs_defined and elem.tag == 'chunk' and elem.get('type')!='s': 25 | xml_sentences = elem.getchildren() 26 | elif (not paragraphs_defined) and ((elem.tag == 'chunk' and elem.get('type')=='s') or elem.tag == 'sentence'): 27 | xml_sentences = [elem] 28 | else: 29 | continue 30 | 31 | for sentence_index, xml_sentence in enumerate(xml_sentences): 32 | sentence=Sentence() 33 | paragraph.add_sentence(sentence) 34 | for token_index, xml_token in enumerate(xml_sentence.getchildren()): 35 | if xml_token.tag=='ns': 36 | if token_index>0 or sentence_index>0: #omit first ns in paragraph 37 | ns=True 38 | elif xml_token.tag=='tok': 39 | token=Token() 40 | token.space_before=not ns 41 | 42 | for xml_node in xml_token.getchildren(): 43 | if xml_node.tag=='orth': 44 | orth=xml_node.text 45 | token.form=orth 46 | elif xml_node.tag=='lex': 47 | if xml_node.get('disamb')=='1': 48 | disamb=True 49 | else: 50 | disamb=False 51 | 52 | base=xml_node.find('base').text 53 | ctag=xml_node.find('ctag').text 54 | 55 | form = Form(base, ctag) 56 | if disamb: 57 | if token.gold_form is not None: 58 | logging.warning(f'More than 1 disamb {file_path} {orth}') 59 | token.gold_form=form 60 | else: 61 | token.interpretations.append(form) 62 | elif xml_node.tag=='ann': 63 | continue 64 | else: 65 | logging.error('Error 1 {xml_token}') 66 | if token.form: 67 | sentence.add_token(token) 68 | ns=False 69 | else: 70 | logging.error(f'Error 2 {xml_token}') 71 | yield paragraph 72 | elem.clear() 73 | 74 | 75 | def read_jsonl(file_path: str) -> Generator[Paragraph,None,None]: 76 | with jsonlines.Reader(file_path) as reader: 77 | for obj in reader: 78 | a = _list_to_paragraph(obj) 79 | yield a 80 | 81 | 82 | def _list_to_paragraph(l) -> Paragraph: 83 | paragraph = Paragraph() 84 | for s in l: 85 | sentence = Sentence() 86 | paragraph.add_sentence(sentence) 87 | for t in s: 88 | token = Token() 89 | form=t[0] 90 | token.form = form 91 | 92 | # print(t) 93 | try: 94 | space=t[1] 95 | token.space_before = (space == 1) 96 | except IndexError: 97 | token.space_before = True # ? 
98 | 99 | interpretations = t[2:] 100 | token.interpretations.extend([Form(base, ctag) for (base, ctag) in interpretations]) 101 | 102 | sentence.add_token(token) 103 | return paragraph 104 | 105 | 106 | def json_to_objects(data): 107 | paragraphs = [] 108 | for input_paragraph in data['documents']: 109 | paragraph = Paragraph() 110 | paragraphs.append(paragraph) 111 | for input_sentence in input_paragraph['sentences']: 112 | sentence = Sentence() 113 | paragraph.add_sentence(sentence) 114 | for input_token in input_sentence['tokens']: 115 | token = Token() 116 | token.form = input_token['form'] 117 | if len(input_token)>=2: 118 | separator=input_token['separator'] 119 | if separator is not None: 120 | token.space_before=separator 121 | elif len(input_token)>=4: 122 | token.start=input_token['start'] 123 | token.end = input_token['end'] 124 | #infer separator before from positions 125 | if len(sentence.tokens)==0: 126 | token.space_before='space' 127 | else: 128 | if sentence.tokens[-1].end==token.start: 129 | token.space_before = 'none' 130 | else: 131 | token.space_before = 'space' 132 | else: 133 | token.space_before = 'space' # TODO ? 134 | sentence.add_token(token) 135 | return paragraphs 136 | 137 | 138 | def json_compact_to_objects(data): 139 | paragraphs = [] 140 | for input_paragraph in data: 141 | paragraph = Paragraph() 142 | paragraphs.append(paragraph) 143 | for input_sentence in input_paragraph: 144 | sentence = Sentence() 145 | paragraph.add_sentence(sentence) 146 | for input_token in input_sentence: 147 | token = Token() 148 | token.form = input_token[0] 149 | if len(input_token) >= 2: 150 | separator = input_token[1] 151 | if separator is not None: 152 | token.space_before = separator 153 | elif len(input_token) >= 4: 154 | token.start = input_token[2] 155 | token.end = input_token[3] 156 | # infer separator before from positions 157 | if len(sentence.tokens) == 0: 158 | token.space_before = 'space' 159 | else: 160 | if sentence.tokens[-1].end == token.start: 161 | token.space_before = 'none' 162 | else: 163 | token.space_before = 'space' 164 | else: 165 | token.space_before = 'space' # TODO ? 
166 | sentence.add_token(token) 167 | return paragraphs 168 | 169 | -------------------------------------------------------------------------------- /krnnt/serial_pickle.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from typing import BinaryIO, Iterable 3 | 4 | 5 | class SerialPickler: 6 | def __init__(self, file: BinaryIO, mode=3): # don't work with protocol 4 7 | self.file = file 8 | self.p = pickle.Pickler(file, mode) 9 | 10 | def add(self, obj): 11 | self.p.dump(obj) 12 | self.p.memo.clear() 13 | 14 | def extend(self, objs: Iterable): 15 | for obj in objs: 16 | self.p.dump(obj) 17 | self.p.memo.clear() 18 | 19 | def close(self): 20 | self.file.close() 21 | 22 | 23 | class SerialUnpickler: 24 | def __init__(self, file: BinaryIO, stop: int=-1, start: int =0, ids: Iterable = None): 25 | """ 26 | 27 | :param file: 28 | :param start: unpickle objects starting from index start 29 | :param stop: unpickle objects ending with index stop 30 | :param ids: unpickle objects with indexes in ids 31 | """ 32 | if ids is None: 33 | ids = [] 34 | self.file = file 35 | self.p = pickle.Unpickler(file) 36 | self.c = 0 37 | self.stop = stop 38 | self.start = start 39 | self.ids = set(ids) 40 | 41 | def __iter__(self): 42 | if self.ids: 43 | return self.__iter2() 44 | else: 45 | return self.__iter1() 46 | 47 | def __iter1(self): 48 | while True: 49 | try: 50 | if self.c == self.stop: 51 | break 52 | self.c += 1 53 | x = self.p.load() 54 | if self.c - 1 < self.start: 55 | continue 56 | 57 | # print self.c 58 | yield x 59 | except EOFError: 60 | break 61 | 62 | def __iter2(self): 63 | while True: 64 | try: 65 | x = self.p.load() 66 | if self.c in self.ids: 67 | yield x 68 | self.c += 1 69 | except EOFError: 70 | break 71 | 72 | 73 | def count_samples(path: str) -> int: 74 | """ 75 | Return number of items in serial pickle file. 
76 | """ 77 | with open(path, 'rb') as file: 78 | su = SerialUnpickler(file) 79 | 80 | count = 0 81 | for paragraph in su: 82 | count += 1 83 | 84 | return count -------------------------------------------------------------------------------- /krnnt/structure.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import List 5 | 6 | 7 | class Paragraph: 8 | sentences: List['Sentences'] 9 | 10 | __slots__ = ['sentences', 'concraft'] 11 | 12 | def __init__(self): 13 | self.sentences = [] 14 | 15 | def add_sentence(self, sentence: 'Sentence'): 16 | self.sentences.append(sentence) 17 | 18 | def __iter__(self): 19 | return self.sentences.__iter__() 20 | 21 | def text(self) -> str: 22 | raw = ''.join([sentence.text() for sentence in self.sentences]) 23 | try: 24 | if self.sentences[0].tokens[0].space_before: 25 | return raw[1:] 26 | else: 27 | return raw 28 | except: 29 | return raw 30 | 31 | def __str__(self): 32 | return 'Paragraph([%s])' % ','.join([str(x) for x in self.sentences]) 33 | 34 | 35 | class Sentence: 36 | tokens: List['Token'] 37 | 38 | __slots__ = ['tokens'] 39 | 40 | def __init__(self): 41 | self.tokens = [] 42 | 43 | def add_token(self, token: 'Token'): 44 | self.tokens.append(token) 45 | 46 | def text(self) -> str: 47 | return ''.join(map(lambda token: ' ' + token.form if token.space_before else token.form, self.tokens)) 48 | 49 | def __iter__(self): 50 | return self.tokens.__iter__() 51 | 52 | def __str__(self): 53 | return 'Sentence([%s])' % ','.join([str(x) for x in self.tokens]) 54 | 55 | class Token: 56 | form: str 57 | interpretations: List['Form'] 58 | gold_form: 'Form' 59 | 60 | __slots__ = ['form', 'space_before', 'interpretations', 'gold_form', 'start', 'end'] 61 | 62 | def __init__(self): 63 | self.form = None 64 | self.space_before = None 65 | self.interpretations = [] 66 | self.gold_form = None 67 | 68 | def add_interpretation(self, interpretation: 'Form'): 69 | self.interpretations.append(interpretation) 70 | 71 | def __str__(self): 72 | return 'Token(%s, %s, %s, %s)' % (self.form, ','.join([str(x) for x in self.interpretations]), self.space_before, str(self.gold_form)) 73 | 74 | 75 | class Form: 76 | def __init__(self, lemma: str, tags: str): 77 | self.lemma = lemma 78 | self.tags = tags 79 | 80 | def __str__(self): 81 | return 'Form(%s, %s)' % (self.lemma, self.tags) 82 | 83 | def __eq__(self, y): 84 | return self.lemma == y.lemma and self.tags == y.tags 85 | 86 | def __hash__(self): 87 | return hash((self.lemma, self.tags)) 88 | -------------------------------------------------------------------------------- /krnnt/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, List 2 | 3 | import regex 4 | 5 | 6 | def unix_uniq(l: str) -> str: 7 | packed = [] 8 | 9 | for el in l: 10 | if not packed or packed[-1] != el: 11 | packed.append(el) 12 | return ''.join(packed) 13 | 14 | 15 | def uniq(seq: Iterable) -> List: 16 | seen = set() 17 | return [x for x in seq if not (x in seen or seen.add(x))] 18 | 19 | 20 | def flatten(l: Iterable) -> List: 21 | return [item for sublist in l for item in sublist] 22 | 23 | 24 | def shape(word: str) -> str: # TODO zredukowac czas 25 | word = regex.sub(r'(?V1)\p{Lowercase}', 'l', word, flags=regex.U) # 80% 26 | word = regex.sub(r'(?V1)\p{Uppercase}', 'u', word, flags=regex.U) 27 | word = regex.sub(r'\p{gc=Decimal_Number}', 'd', word, flags=regex.U) 28 | word = 
regex.sub(r'[^A-Za-z0-9]', 'x', word, flags=regex.LOCALE) 29 | return unix_uniq(word) -------------------------------------------------------------------------------- /krnnt/writers.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | import logging 4 | import sys 5 | from typing import Callable 6 | 7 | import jsonlines 8 | 9 | 10 | def results_to_txt_str(result_paragraphs): 11 | result_str = [] 12 | for paragraph in result_paragraphs: 13 | for sentence in paragraph: 14 | for i, token in enumerate(sentence): 15 | # print(token['sep']) 16 | if i > 0 and token['sep'] != 'none': 17 | result_str += (' ',) 18 | result_str += (token['token'],) 19 | result_str += ("\n",) 20 | result_str += ("\n",) 21 | return ''.join(result_str) 22 | 23 | 24 | def results_to_conll_str(result_paragraphs): 25 | result_str = [] 26 | for paragraph in result_paragraphs: 27 | for sentence in paragraph: 28 | for token in sentence: 29 | try: 30 | start = token['start'] 31 | except KeyError: 32 | start = '' 33 | 34 | try: 35 | end = token['end'] 36 | except KeyError: 37 | end = '' 38 | 39 | result_str += ('%s\t%s\t%s\t%s\t%s\t%s' % ( 40 | token['token'], token['lemmas'][0], 0 if token['sep'] == 'none' else 1, token['tag'], start, end),) 41 | result_str += ("",) 42 | result_str += ("",) 43 | return '\n'.join(result_str) 44 | 45 | 46 | def results_to_jsonl_str(result_paragraphs): 47 | fp = io.StringIO() 48 | with jsonlines.Writer(fp) as writer: 49 | for paragraph in result_paragraphs: 50 | output_paragraph=[] 51 | for sentence in paragraph: 52 | ss = [(token['token'], token['lemmas'][0], token['tag']) for token in sentence] 53 | output_paragraph+=(ss,) 54 | writer.write(output_paragraph) 55 | return fp.getvalue() 56 | 57 | def results_to_json_str(result_paragraphs): 58 | return json.dumps(result_paragraphs) 59 | 60 | 61 | def results_to_conllu_str(result_paragraphs): 62 | result_str = [] 63 | for paragraph in result_paragraphs: 64 | for sentence in paragraph: 65 | for i, token in enumerate(sentence): 66 | result_str += ('%s\t%s\t%s\t_\t%s\t_\t_\t_\t_\t_' % ( 67 | i + 1, token['token'], token['lemmas'][0], token['tag']),) 68 | result_str += ("",) 69 | result_str += ("",) 70 | return '\n'.join(result_str) 71 | 72 | 73 | def results_to_plain_str(result_paragraphs): 74 | result_str = [] 75 | for paragraph in result_paragraphs: 76 | for sentence in paragraph: 77 | for token in sentence: 78 | result_str += ('%s\t%s' % (token['token'], token['sep']),) 79 | for lemma in token['lemmas']: 80 | result_str += ('\t%s\t%s\tdisamb' % (lemma, token['tag']),) 81 | result_str += ("",) 82 | result_str += ("",) 83 | return '\n'.join(result_str) 84 | 85 | 86 | def results_to_xces_str(result_paragraphs): 87 | result_str = [] 88 | result_str += ('', 89 | '', 90 | '', 91 | '') 92 | for paragraph in result_paragraphs: 93 | result_str += (' ', ) 94 | for sentence in paragraph: 95 | result_str += (' ',) 96 | for token in sentence: 97 | if token['sep'] == 'none': 98 | result_str += (' ',) 99 | result_str += (' ',) 100 | result_str += (' %s' % escape_xml(token['token']),) 101 | for lemma in token['lemmas']: 102 | result_str += (' %s%s' % (escape_xml(lemma), 103 | token['tag']),) 104 | result_str += (' ',) 105 | result_str += (' ',) 106 | result_str += (' ',) 107 | 108 | result_str += ('', 109 | '') 110 | return '\n'.join(result_str) 111 | 112 | 113 | def escape_xml(s): 114 | return s.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace('\'', 115 | ''') 116 | 
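# Sketch of the structure that every results_to_*_str converter above consumes:
# the paragraphs -> sentences -> token-dict nesting returned by
# KRNNTSingle.tag_paragraphs. The tokens, tags and lemmas below are illustrative only.
#
#   result_paragraphs = [[[
#       {'token': 'Ala', 'sep': 'newline', 'tag': 'subst:sg:nom:f',
#        'lemmas': ['Ala'], 'prob': 0.99, 'start': 0, 'end': 3},
#       {'token': 'ma', 'sep': 'space', 'tag': 'fin:sg:ter:imperf',
#        'lemmas': ['mieć'], 'prob': 0.99, 'start': 4, 'end': 6},
#   ]]]
#
#   print(results_to_plain_str(result_paragraphs))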
117 | 118 | def get_output_converter(output_format: str) -> Callable: 119 | output_format=output_format.lower() 120 | if output_format == 'xces': 121 | conversion = results_to_xces_str 122 | elif output_format == 'plain': 123 | conversion = results_to_plain_str 124 | elif output_format in ('conll','tsv'): 125 | conversion = results_to_conll_str 126 | elif output_format == 'conllu': 127 | conversion = results_to_conllu_str 128 | elif output_format == 'jsonl': 129 | conversion = results_to_jsonl_str 130 | elif output_format == 'json': 131 | conversion = results_to_json_str 132 | elif output_format in ('txt','text'): 133 | conversion = results_to_txt_str 134 | else: 135 | logging.error('Wrong output format.') 136 | sys.exit(1) 137 | 138 | return conversion -------------------------------------------------------------------------------- /krnnt_run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import sys 5 | 6 | from argparse import ArgumentParser 7 | 8 | from krnnt.aglt import remove_aglt_from_results_rule1_3 9 | from krnnt.blanks import remove_blanks_from_results 10 | from krnnt.keras_models import BEST 11 | from krnnt.new import Lemmatisation, Lemmatisation2, get_morfeusz, analyze_tokenized 12 | from krnnt.pipeline import KRNNTSingle, chunk 13 | from krnnt.readers import read_xces, read_jsonl 14 | from krnnt.writers import results_to_jsonl_str, results_to_conll_str, results_to_conllu_str, \ 15 | results_to_xces_str, results_to_plain_str 16 | 17 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO) 18 | 19 | 20 | if __name__ == '__main__': 21 | parser = ArgumentParser(description='Run tagger') 22 | parser.add_argument('weight_path', help='path to weights, lemmatisation data and dictionary') 23 | parser.add_argument('lemmatisation_data', help='path to lemmatisation data') 24 | parser.add_argument('dictionary', help='path to dictionary') 25 | parser.add_argument('-p', '--preanalyzed', action='store_false', 26 | default=True, dest='reanalyzed', 27 | help='training data have not been reanalyzed') 28 | parser.add_argument('-i', '--input-format', default='xces', dest='input_format', 29 | help='input format of preanalyzed data: xces, jsonl') 30 | parser.add_argument('-o', '--output-format', 31 | default='xces', dest='output_format', 32 | help='output format: xces, plain, conll, conllu, jsonl') 33 | parser.add_argument('--maca_config', 34 | default='morfeusz2-nkjp', 35 | help='Maca config') 36 | parser.add_argument('--toki_config_path', 37 | default='', 38 | help='Toki config path (directory)') 39 | parser.add_argument('--lemmatisation', 40 | default='sgjp', 41 | help='lemmatization mode (sgjp, simple)') 42 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO 43 | parser.add_argument('--tokenized', action='store_true', 44 | help='input data are tokenized, but not analyzed') 45 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds') 46 | parser.add_argument('--chunk_size', 47 | default=100000, type=int, 48 | help='chunk size') 49 | parser.add_argument('--remove_aglt', action='store_true') 50 | parser.add_argument('--dont_remove_blank', action='store_false') 51 | args = parser.parse_args() 52 | 53 | if args.reproducible: 54 | from numpy.random import seed 55 | seed(1337) 56 | import random as rn 57 | rn.seed(1337) 58 | import tensorflow as tf 59 | session_conf = 
tf.ConfigProto(intra_op_parallelism_threads=1, 60 | inter_op_parallelism_threads=1) 61 | from keras import backend as K 62 | tf.set_random_seed(1337) 63 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) 64 | K.set_session(sess) 65 | 66 | pref = {'keras_batch_size': 32, 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label', 67 | 'keras_model_class': BEST, 'maca_config':args.maca_config, 'toki_config_path':args.toki_config_path} 68 | 69 | if args.lemmatisation== 'simple': 70 | pref['lemmatisation_class'] = Lemmatisation2 71 | else: 72 | pref['lemmatisation_class'] = Lemmatisation 73 | 74 | pref['reanalyze'] = args.reanalyzed 75 | # pref['input_format'] = options.input_format 76 | pref['output_format'] = args.output_format 77 | 78 | pref['weight_path'] = args.weight_path 79 | pref['lemmatisation_path'] = args.lemmatisation_data 80 | pref['UniqueFeaturesValues'] = args.dictionary 81 | 82 | krnnt = KRNNTSingle(pref) 83 | #time python3 -m cProfile -o gpu_run_train2.profil krnnt_run.py ../krnnt/data/weights.hdf5 ../krnnt/data/lemmatisation.pkl ../krnnt/data/dictionary.pkl -o xces > /tmp/out.xces < ../krnnt-refactor/tests/data/full/train-raw.txt 84 | 85 | if args.tokenized: 86 | if args.input_format == 'jsonl': 87 | corpus = read_jsonl(sys.stdin) 88 | else: 89 | print('Wrong input format.') 90 | sys.exit(1) 91 | 92 | morf=get_morfeusz() 93 | corpus = analyze_tokenized(morf, corpus) 94 | results = krnnt.tag_paragraphs(corpus, preana=True) 95 | elif args.reanalyzed: 96 | data=sys.stdin.read().split('\n\n') 97 | results=[] 98 | for batch in chunk(data, args.chunk_size): 99 | results += krnnt.tag_paragraphs(batch) # ['Ala ma kota.', 'Ale nie ma psa.'] 100 | #TODO: print here 101 | else: 102 | #f = io.StringIO(sys.stdin.read()) 103 | if args.input_format== 'xces': 104 | corpus = read_xces(sys.stdin) 105 | elif args.input_format== 'jsonl': 106 | corpus = read_jsonl(sys.stdin) 107 | else: 108 | print('Wrong input format.') 109 | sys.exit(1) 110 | 111 | results = krnnt.tag_paragraphs(corpus, preana=True) 112 | 113 | # print(results) 114 | 115 | if args.output_format == 'xces': 116 | conversion = results_to_xces_str 117 | elif args.output_format == 'plain': 118 | conversion = results_to_plain_str 119 | elif args.output_format == 'conll': 120 | conversion = results_to_conll_str 121 | elif args.output_format == 'conllu': 122 | conversion = results_to_conllu_str 123 | elif args.output_format == 'jsonl': 124 | conversion = results_to_jsonl_str 125 | else: 126 | print('Wrong output format.') 127 | sys.exit(1) 128 | 129 | 130 | if args.remove_aglt: 131 | remove_aglt_from_results_rule1_3(results) 132 | 133 | if args.dont_remove_blank: 134 | remove_blanks_from_results(results) 135 | 136 | print(conversion(results), end='') 137 | -------------------------------------------------------------------------------- /krnnt_serve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | from argparse import ArgumentParser 6 | 7 | from flask import Flask 8 | from flask import request 9 | from krnnt.additional_format import additional_format 10 | from krnnt.aglt import remove_aglt_from_results_rule1_3 11 | from krnnt.analyzers import MacaAnalyzer 12 | from krnnt.blanks import remove_blanks_from_results 13 | from krnnt.keras_models import BEST 14 | from krnnt.new import Lemmatisation, Lemmatisation2, get_morfeusz, analyze_tokenized 15 | from krnnt.writers import get_output_converter 
16 | from krnnt.readers import json_to_objects, json_compact_to_objects 17 | from krnnt.pipeline import KRNNTSingle 18 | 19 | app = Flask(__name__) 20 | app.config['JSON_AS_ASCII'] = False 21 | application = app 22 | 23 | global krnntx, conversion, maca_analyzer, morfeusz 24 | 25 | 26 | def render(text='', str_results=''): 27 | return """ 28 | 29 | 30 | 31 | KRNNT 32 | 33 | 34 |

KRNNT: Polish Recurrent Neural Network Tagger
%s
The tagset is described here: http://nkjp.pl/poliqarp/help/ense2.html
Wróbel Krzysztof, KRNNT: Polish Recurrent Neural Network Tagger
Source code: https://github.com/kwrobel-nlp/krnnt
43 | 44 | """ % (text, str_results) 45 | 46 | 47 | @app.route('/', methods=['GET']) 48 | def gui(): 49 | return render() 50 | 51 | 52 | @app.route('/', methods=['POST']) 53 | def tag_raw(): 54 | request.get_data() 55 | 56 | input_format = request.args.get('input_format', default=None, type=str) 57 | output_format = request.args.get('output_format', default='plain', type=str) 58 | remove_aglt = request.args.get('remove_aglt', default='0', type=str) 59 | remove_blank = request.args.get('remove_blank', default='1', type=str) 60 | 61 | conversion2 = get_output_converter(output_format) 62 | 63 | if remove_aglt!='0': 64 | conversionx=conversion2 65 | conversion2=lambda x: conversionx(remove_aglt_from_results_rule1_3(x)) 66 | 67 | if remove_blank!='0': 68 | conversionx2=conversion2 69 | conversion2=lambda x: conversionx2(remove_blanks_from_results(x)) 70 | 71 | if request.is_json: 72 | data = request.get_json() 73 | 74 | if 'docs' in data: 75 | return additional_format(data, krnntx, morfeusz) 76 | else: 77 | if 'documents' in data: 78 | paragraphs = json_to_objects(data) 79 | else: 80 | paragraphs = json_compact_to_objects(data) 81 | 82 | corpus = analyze_tokenized(morfeusz, paragraphs) 83 | results = krnntx.tag_paragraphs(corpus, preana=True) 84 | 85 | return conversion2(results) 86 | elif 'text' in request.form: 87 | text = request.form['text'] 88 | 89 | 90 | 91 | results = krnntx.tag_paragraphs([text]) # ['Ala ma kota.', 'Ale nie ma psa.'] 92 | return render(text, conversion(results)) 93 | else: 94 | text = request.get_data() 95 | 96 | if input_format == 'lines': 97 | data = text.decode('utf-8').split('\n\n') #TODO 98 | else: 99 | data = [text.decode('utf-8')] 100 | 101 | results = krnntx.tag_paragraphs(data) 102 | 103 | return conversion2(results) 104 | 105 | 106 | @app.route('/tag/', methods=['POST']) 107 | def tag(): 108 | text = request.form['text'] 109 | results = krnntx.tag_sentences(text.split('\n\n')) # ['Ala ma kota.', 'Ale nie ma psa.'] 110 | return render(text, conversion(results)) 111 | 112 | @app.route('/maca/', methods=['POST']) 113 | def maca(): 114 | text = request.get_data() 115 | # print(text.decode('utf-8').split('\n\n')) 116 | 117 | results = maca_analyzer._maca(text.decode('utf-8').split('\n\n')) 118 | results = list(results) 119 | return str(results) 120 | 121 | 122 | def main(argv=sys.argv[1:]): 123 | print(argv) 124 | global conversion,krnntx,maca_analyzer, morfeusz 125 | 126 | parser = ArgumentParser(usage='HTTP Tagger server') 127 | parser.add_argument('model_path', help='path to directory woth weights, lemmatisation data and dictionary') 128 | parser.add_argument('-p', '--port', 129 | default=9003, 130 | help='server port (defaults to 9003)') 131 | parser.add_argument('-t', '--host', 132 | default='0.0.0.0', 133 | help='server host (defaults to localhost)') 134 | parser.add_argument('--maca_config', 135 | default='morfeusz-nkjp-official', 136 | help='Maca config') 137 | parser.add_argument('--toki_config_path', 138 | default='', 139 | help='Toki config path (directory)') 140 | parser.add_argument('--lemmatisation', 141 | default='sgjp', 142 | help='lemmatization mode (sgjp, simple)') 143 | parser.add_argument('-o', '--output-format', 144 | default='plain', dest='output_format', 145 | help='output format: xces, plain, conll, conllu, jsonl') 146 | parser.add_argument('-b', '--batch_size', 147 | default=32, type=int, 148 | help='batch size') 149 | parser.add_argument('--remove_aglt', action='store_true') 150 | parser.add_argument('--dont_remove_blank', 
action='store_false') 151 | args = parser.parse_args(argv) 152 | 153 | pref = {'keras_batch_size': args.batch_size, 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label', 154 | 'keras_model_class': BEST, 'maca_config': args.maca_config, 'toki_config_path': args.toki_config_path} 155 | 156 | if args.lemmatisation == 'simple': 157 | pref['lemmatisation_class'] = Lemmatisation2 158 | else: 159 | pref['lemmatisation_class'] = Lemmatisation 160 | 161 | pref['reanalyze'] = True 162 | 163 | pref['weight_path'] = args.model_path + "/weights.hdf5" 164 | pref['lemmatisation_path'] = args.model_path + "/lemmatisation.pkl" 165 | pref['UniqueFeaturesValues'] = args.model_path + "/dictionary.pkl" 166 | 167 | morfeusz = get_morfeusz() 168 | maca_analyzer = MacaAnalyzer(args.maca_config) 169 | krnntx = KRNNTSingle(pref) 170 | 171 | krnntx.tag_sentences(['Ala']) 172 | 173 | conversion= get_output_converter(args.output_format) 174 | 175 | if args.remove_aglt: 176 | conversionx = conversion 177 | conversion=lambda x: conversionx(remove_aglt_from_results_rule1_3(x)) 178 | 179 | if args.dont_remove_blank: 180 | conversionx2 = conversion 181 | conversion=lambda x: conversionx2(remove_blanks_from_results(x)) 182 | 183 | 184 | return app, args.host, args.port 185 | 186 | 187 | 188 | if __name__ == '__main__': 189 | app,host,port = main() 190 | # from werkzeug.middleware.profiler import ProfilerMiddleware 191 | # app.config['PROFILE'] = True 192 | # app = ProfilerMiddleware(app) 193 | # app.wsgi_app = ProfilerMiddleware( 194 | # app.wsgi_app, profile_dir="." 195 | # ) 196 | app.run(host=host, port=port, debug=False) # threaded=False on GPU 197 | 198 | def start(*args, **kwargs): 199 | app, host, port = main(args) 200 | return app 201 | 202 | #gunicorn -b 127.0.0.1:9003 -w 4 -k gevent -t 3600 --threads 4 'krnnt_serve:start("model_data","--maca_config","morfeusz2-nkjp","--toki_config_path","/home/krnnt/")' -------------------------------------------------------------------------------- /krnnt_train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | from krnnt.keras_models import BEST, ExperimentParameters 7 | from krnnt.new import UnalignedSimpleEvaluator 8 | from krnnt.tagger_exps import RunFolds2, KerasData, RunExperiment 9 | 10 | 11 | if __name__ == '__main__': 12 | parser = ArgumentParser() 13 | parser.add_argument('corpus_path', help='path to corpus') 14 | parser.add_argument('-p', '--preanalyzed', action='store_false', 15 | default=True, dest='reanalyzed', 16 | help='training data have not been reanalyzed') 17 | parser.add_argument('-c', '--cv', action='store_true', 18 | default=False, dest='cv', 19 | help='run 10-fold cross-validation') 20 | parser.add_argument('-t', '--train_ratio', 21 | default=1.0, dest='train_ratio', type=float, 22 | help='percentage of data for training') 23 | parser.add_argument('-d', '--dev_ratio', 24 | default=0.0, dest='dev_ratio', type=float, 25 | help='percentage of training data for development') 26 | parser.add_argument('-e', '--epochs', 27 | default=100, dest='epochs', type=int, 28 | help='number of epochs') 29 | parser.add_argument('--patience', 30 | default=10, dest='patience', type=int, 31 | help='patience') 32 | parser.add_argument('--maca_config', 33 | default='morfeusz-nkjp-official', 34 | help='Maca config') 35 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO 36 | 
parser.add_argument('--hash', action='store', default=None, dest='hash') 37 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds') 38 | parser.add_argument('-f', '--fold', default=None, dest='fold') 39 | args = parser.parse_args() 40 | 41 | if args.reproducible: 42 | from numpy.random import seed 43 | seed(1337) 44 | import random as rn 45 | rn.seed(1337) 46 | import tensorflow as tf 47 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, 48 | inter_op_parallelism_threads=1) 49 | from keras import backend as K 50 | tf.set_random_seed(1337) 51 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) 52 | K.set_session(sess) 53 | 54 | pref = {'nb_epoch': 100, 'batch_size': 256, 55 | 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label', 56 | 'evaluator': UnalignedSimpleEvaluator, 'patience': 10, 57 | 'weight_path': 'weights.hdf5', 'samples_per_epoch': 10000, 'keras_model_class': BEST, 58 | 'corpus_path': 'data/train-reanalyzed.spickle', 'reanalyze': True, 'train_data_ratio': 0.9, 59 | 'dev_data_ratio': 0.1} 60 | 61 | pref['reanalyze'] = args.reanalyzed 62 | pref['train_data_ratio'] = float(args.train_ratio) 63 | pref['dev_data_ratio'] = float(args.dev_ratio) 64 | pref['nb_epoch'] = int(args.epochs) 65 | pref['corpus_path'] = args.corpus_path 66 | pref['patience'] = args.patience 67 | pref['maca_config'] = args.maca_config 68 | if args.hash is not None: 69 | pref['h'] = args.hash 70 | if args.fold is not None: 71 | pref['fold'] = int(args.fold) 72 | 73 | keras_model_class = pref['keras_model_class'] 74 | 75 | if args.cv: 76 | rf = RunFolds2(keras_model_class, pref) 77 | rf.run() 78 | else: 79 | parameters = ExperimentParameters(pref) 80 | km = keras_model_class(parameters) 81 | 82 | print('Model will be saved under: %s.final' % parameters.pref['weight_path']) 83 | print('Lemmatisation model will be saved under: %s' % parameters.pref['lemmatisation_path']) 84 | 85 | kd = KerasData(pref['corpus_path'], pref['reanalyze']) 86 | re = RunExperiment(kd, km) 87 | re.run() 88 | 89 | print('Model is saved under: %s' % parameters.pref['weight_path']) 90 | print('Lemmatisation model is saved under: %s' % parameters.pref['lemmatisation_path']) 91 | if pref['reanalyze']: 92 | print('Dictionary is saved under: %s' % parameters.pref[ 93 | 'corpus_path'] + '_FormatData2_PreprocessData_UniqueFeaturesValues') 94 | else: 95 | print('Dictionary is saved under: %s' % parameters.pref[ 96 | 'corpus_path'] + '_FormatDataPreAnalyzed_PreprocessData_UniqueFeaturesValues') 97 | -------------------------------------------------------------------------------- /merge_analyzed_gold.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from argparse import ArgumentParser 4 | 5 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser( 9 | description='Combines analyzed corpus with gold. 
Analyzed corpus must be with gold segmentation.') 10 | parser.add_argument('gold_path', help='') 11 | parser.add_argument('analyzed_path', help='') 12 | parser.add_argument('output_path', help='') 13 | args = parser.parse_args() 14 | 15 | file_path1 = args.gold_path 16 | file_path2 = args.analyzed_path 17 | output_path = args.output_path 18 | 19 | file1 = open(file_path1, 'rb') 20 | su_gold = SerialUnpickler(file1) 21 | 22 | file2 = open(file_path2, 'rb') 23 | su_analyzed = SerialUnpickler(file2) 24 | 25 | file3 = open(output_path, 'wb') 26 | sp = SerialPickler(file3) 27 | 28 | for paragraph_gold in su_gold: 29 | for sentence_gold in paragraph_gold: 30 | paragraph_analyzed = next(su_analyzed.__iter__()) 31 | assert len(paragraph_analyzed.sentences), 1 32 | sentence_analyzed = paragraph_analyzed.sentences[0] 33 | assert len(sentence_analyzed.tokens), len(sentence_gold.tokens) 34 | for token_gold, token_analyzed in zip(sentence_gold, sentence_analyzed): 35 | token_gold.interpretations = token_analyzed.interpretations 36 | sp.add(paragraph_gold) 37 | 38 | file3.close() 39 | -------------------------------------------------------------------------------- /preprocess_data.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from tqdm import tqdm 4 | 5 | from krnnt.new import preprocess_paragraph_preanalyzed, \ 6 | preprocess_paragraph_reanalyzed 7 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler 8 | from krnnt.structure import Paragraph 9 | 10 | if __name__ == '__main__': 11 | parser = ArgumentParser(description='Create features for neural network.') 12 | parser.add_argument('input_path', type=str, help='path to re/preanalyzed data') 13 | parser.add_argument('output_path', type=str, help='save path') 14 | parser.add_argument('-p', '--preanalyzed', action='store_false', 15 | default=True, dest='reanalyzed', 16 | help='training data have not been reanalyzed') 17 | args = parser.parse_args() 18 | 19 | file = open(args.input_path, 'rb') 20 | su = SerialUnpickler(file) 21 | 22 | file2 = open(args.output_path, 'wb') 23 | sp = SerialPickler(file2) 24 | 25 | if args.reanalyzed: 26 | preprocess_method = preprocess_paragraph_reanalyzed 27 | else: 28 | preprocess_method = preprocess_paragraph_preanalyzed 29 | 30 | paragraph: Paragraph 31 | for paragraph in tqdm(su, total=18484): 32 | paragraph_sequence = preprocess_method(paragraph) 33 | 34 | sp.add(paragraph_sequence) 35 | 36 | file.close() 37 | file2.close() 38 | 39 | -------------------------------------------------------------------------------- /process_xces.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import glob 4 | 5 | from krnnt.serial_pickle import SerialPickler 6 | from argparse import ArgumentParser 7 | 8 | from krnnt.readers import read_xces 9 | 10 | usage = """%prog CORPUS SAVE_PATH 11 | 12 | Converts XCES corpus to internal KRNNT representation and saves it to file. 13 | 14 | E.g. 
%prog train-analyzed.xml train-analyzed.spickle 15 | """ 16 | 17 | if __name__ == '__main__': 18 | parser = ArgumentParser(usage="usage") 19 | parser.add_argument('file_path', type=str, help='path to XCES corpus (or path with wildcard)') 20 | parser.add_argument('output_path', type=str, help='save path') 21 | args = parser.parse_args() 22 | 23 | with open(args.output_path, 'wb') as file: 24 | sp = SerialPickler(file) 25 | 26 | for path in glob.iglob(args.file_path): 27 | print(path) 28 | for paragraph in read_xces(path): 29 | sp.add(paragraph) 30 | -------------------------------------------------------------------------------- /reanalyze.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from argparse import ArgumentParser 4 | 5 | from tqdm import tqdm 6 | 7 | from krnnt.aligner import align_paragraphs 8 | from krnnt.analyzers import MacaAnalyzer 9 | from krnnt.structure import Paragraph 10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler 11 | 12 | usage = """prog CORPUS_GOLD CORPUS_SAVE 13 | 14 | Reanalyze corpus with Maca. 15 | 16 | E.g. prog train-gold.spickle train-reanalyzed.spickle 17 | """ 18 | 19 | if __name__ == '__main__': 20 | parser = ArgumentParser(usage=usage) 21 | parser.add_argument('file_path', type=str, help='paths to corpus') 22 | parser.add_argument('output_path', type=str, help='save path') 23 | parser.add_argument('--maca_config', default='morfeusz2-nkjp', help='Maca config') 24 | parser.add_argument('--toki_config_path', default='', help='Toki config path (directory)') 25 | args = parser.parse_args() 26 | 27 | file1 = open(args.file_path, 'rb') 28 | su_gold = SerialUnpickler(file1) 29 | 30 | file2 = open(args.output_path, 'wb') 31 | sp = SerialPickler(file2) 32 | 33 | maca_analyzer = MacaAnalyzer(args.maca_config) 34 | 35 | paragraph_gold: Paragraph 36 | for j, paragraph_gold in tqdm(enumerate(su_gold), total=18484, desc='Morphological analysis'): 37 | paragraph_raw = paragraph_gold.text() 38 | 39 | paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw) 40 | 41 | print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph_gold.sentences)) 42 | 43 | paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph_gold) 44 | 45 | sp.add(paragraph_reanalyzed) 46 | 47 | file2.close() 48 | 49 | # TODO: count mismatched sentences 50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | scikit-learn 3 | flask 4 | tqdm 5 | h5py==2.9.0 6 | Keras==2.2.4 7 | numpy==1.16.4 8 | regex==2019.6.8 9 | requests==2.22.0 10 | jsonlines==1.2.0 11 | tensorflow-gpu==1.12.0 12 | pytest 13 | gunicorn 14 | git+https://github.com/djstrong/pytest-shell.git#egg=pytest-shell 15 | git+https://github.com/djstrong/krnnt_text_utils.git@cython 16 | pytest-benchmark 17 | -------------------------------------------------------------------------------- /run_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #check if server is running 4 | SERVER_STARTED=0 5 | if [ `ps aux | grep krnnt_serve -c` -eq 1 ]; then 6 | echo 'Starting server' 7 | ./start_flask_server.sh > /dev/null 2>&1 & 8 | PID=$! 
9 | echo "PID: $PID" 10 | SERVER_STARTED=1 11 | sleep 5 12 | fi 13 | 14 | 15 | cd tests 16 | python3 -m pytest 17 | 18 | 19 | if [ $SERVER_STARTED -eq 1 ]; then 20 | echo 'Killing server' 21 | pkill -P "$PID" 22 | fi -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='krnnt', 4 | version='1.0.0', 5 | description='Part of speech tagger for Polish', 6 | url='https://github.com/kwrobel-nlp/krnnt', 7 | author='Krzysztof Wróbel', 8 | author_email='Wrobel.Krzysztof@gmail.com', 9 | packages=['krnnt'], 10 | license='LGPL', 11 | python_requires='>=3, <4', 12 | install_requires=[ 13 | 'Cython', 'h5py', 'Keras==2.2.5', 'numpy', 'regex', 'requests', 'jsonlines', 'tqdm', 'flask', 'gunicorn', 14 | 'krnnt_utils @ git+https://github.com/Zhylkaaa/krnnt_text_utils@cython' 15 | ], 16 | extras_require={ 17 | 'train': ['scikit-learn'], 18 | 'pytest': ['pytest', 'pytest-benchmark', 19 | 'pytest-shell @ https://api.github.com/repos/djstrong/pytest-shell/tarball/'], 20 | 'tfcpu': ['tensorflow==1.14.0'], 21 | 'tfgpu': ['tensorflow-gpu==1.12.0'] 22 | }, 23 | zip_safe=False) -------------------------------------------------------------------------------- /shuffle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import random 5 | from argparse import ArgumentParser 6 | 7 | from tqdm import tqdm 8 | 9 | from krnnt.structure import Paragraph 10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler 11 | 12 | usage = """%prog CORPUS SAVE_PATH 13 | 14 | Shuffle training data. 15 | 16 | E.g. %prog train-merged.spickle train-merged.shuf.spickle 17 | """ 18 | 19 | if __name__ == '__main__': 20 | parser = ArgumentParser(usage=usage) 21 | parser.add_argument('file_path', type=str, help='paths to corpus') 22 | parser.add_argument('output_path', type=str, help='save path') 23 | parser.add_argument('--seed', '-s', type=int, default=1337, help='seed') 24 | args = parser.parse_args() 25 | 26 | file_path1 = args.file_path 27 | file_path2 = args.output_path 28 | 29 | file = open(file_path1, 'rb') 30 | su = SerialUnpickler(file) 31 | 32 | paragraphs = [] 33 | paragraph: Paragraph 34 | for paragraph in tqdm(su, desc='Loading', total=18484): 35 | paragraphs.append(paragraph) 36 | file.close() 37 | 38 | random.seed(args.seed) 39 | random.shuffle(paragraphs) 40 | 41 | file2 = open(file_path2, 'wb') 42 | sp = SerialPickler(file2) 43 | 44 | for paragraph in tqdm(paragraphs, desc='Saving'): 45 | sp.add(paragraph) 46 | 47 | file2.close() 48 | -------------------------------------------------------------------------------- /split_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import math 4 | from argparse import ArgumentParser 5 | 6 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler, count_samples 7 | 8 | if __name__ == '__main__': 9 | parser = ArgumentParser(description='Split data') 10 | parser.add_argument('input_path', help='input path to data') 11 | parser.add_argument('output_path1', help='output path to data') 12 | parser.add_argument('output_path2', help='output path to data') 13 | parser.add_argument('ratio', type=float, help='ratio of data to write to the first output') 14 | 15 | args = parser.parse_args() 16 | 17 | num_data = 
count_samples(args.input_path) 18 | first_part = math.ceil(num_data * args.ratio) 19 | 20 | sp1 = SerialPickler(open(args.output_path1, 'wb')) 21 | sp2 = SerialPickler(open(args.output_path2, 'wb')) 22 | 23 | su = SerialUnpickler(open(args.input_path, 'rb')) 24 | for i, paragraph in enumerate(su): 25 | if i < first_part: 26 | sp1.add(paragraph) 27 | else: 28 | sp2.add(paragraph) 29 | sp1.close() 30 | sp2.close() 31 | -------------------------------------------------------------------------------- /start_flask_server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PORT=${PORT:-9003} 4 | 5 | export CUDA_VISIBLE_DEVICES="" 6 | 7 | python3 krnnt_serve.py model_data --maca_config morfeusz2-nkjp -p $PORT -------------------------------------------------------------------------------- /start_gunicorn_server.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | PORT=${PORT:-9003} 4 | WORKERS=${WORKERS:-1} 5 | 6 | echo "Starting server with $WORKERS workers." 7 | 8 | export CUDA_VISIBLE_DEVICES="" 9 | 10 | gunicorn -b 0.0.0.0:$PORT -w $WORKERS -k sync -t 3600 --threads 1 'krnnt_serve:start("model_data","--maca_config","morfeusz2-nkjp")' -------------------------------------------------------------------------------- /tests/benchmark/test_maca.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from krnnt.analyzers import MacaAnalyzer 4 | 5 | paragraph_raw = 'Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.' 
6 | MACA_CONFIG1 = 'morfeusz-nkjp-official' 7 | MACA_CONFIG2 = 'morfeusz2-nkjp' 8 | 9 | 10 | @pytest.fixture 11 | def get_maca_wrapper(): 12 | try: 13 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 14 | list(maca_analyzer._maca_wrapper(paragraph_raw)) 15 | except: 16 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 17 | list(maca_analyzer._maca_wrapper(paragraph_raw)) 18 | 19 | return maca_analyzer 20 | 21 | 22 | @pytest.fixture 23 | def get_maca_process(): 24 | try: 25 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 26 | list(maca_analyzer._maca_process(paragraph_raw)) 27 | except: 28 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 29 | list(maca_analyzer._maca_process(paragraph_raw)) 30 | 31 | return maca_analyzer 32 | 33 | 34 | def analyze_process(maca_analyzer, data): 35 | results = maca_analyzer._maca_process(data) 36 | return list(results) 37 | 38 | 39 | def analyze_wrapper(maca_analyzer, data): 40 | results = maca_analyzer._maca_wrapper(data) 41 | return list(results) 42 | 43 | 44 | @pytest.mark.slow 45 | def test_maca_process_speed(benchmark, get_maca_process): 46 | maca_analyzer = get_maca_process 47 | benchmark(analyze_process, maca_analyzer, paragraph_raw) 48 | 49 | 50 | @pytest.mark.slow 51 | def test_maca_wrapper_speed(benchmark, get_maca_wrapper): 52 | maca_analyzer = get_maca_wrapper 53 | benchmark(analyze_wrapper, maca_analyzer, paragraph_raw) 54 | -------------------------------------------------------------------------------- /tests/benchmark/test_maca_analyze.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from krnnt.analyzers import MacaAnalyzer 4 | 5 | paragraph_raw = 'Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.' 
6 | MACA_CONFIG1 = 'morfeusz-nkjp-official' 7 | MACA_CONFIG2 = 'morfeusz2-nkjp' 8 | 9 | 10 | @pytest.fixture 11 | def get_maca_wrapper(): 12 | try: 13 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 14 | list(maca_analyzer._maca_wrapper(paragraph_raw)) 15 | maca_analyzer._maca = maca_analyzer._maca_wrapper 16 | except: 17 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 18 | list(maca_analyzer._maca_wrapper(paragraph_raw)) 19 | maca_analyzer._maca = maca_analyzer._maca_wrapper 20 | 21 | return maca_analyzer 22 | 23 | 24 | @pytest.fixture 25 | def get_maca_process(): 26 | try: 27 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 28 | list(maca_analyzer._maca_process(paragraph_raw)) 29 | maca_analyzer._maca = maca_analyzer._maca_process 30 | except: 31 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 32 | list(maca_analyzer._maca_process(paragraph_raw)) 33 | maca_analyzer._maca = maca_analyzer._maca_process 34 | 35 | return maca_analyzer 36 | 37 | 38 | def analyze_process(maca_analyzer, data): 39 | results = maca_analyzer.analyze(data) 40 | return list(results) 41 | 42 | 43 | def analyze_wrapper(maca_analyzer, data): 44 | results = maca_analyzer.analyze(data) 45 | return list(results) 46 | 47 | 48 | @pytest.mark.slow 49 | def test_maca_process_speed(benchmark, get_maca_process): 50 | maca_analyzer = get_maca_process 51 | benchmark(analyze_process, maca_analyzer, paragraph_raw) 52 | 53 | 54 | @pytest.mark.slow 55 | def test_maca_wrapper_speed(benchmark, get_maca_wrapper): 56 | maca_analyzer = get_maca_wrapper 57 | benchmark(analyze_wrapper, maca_analyzer, paragraph_raw) 58 | -------------------------------------------------------------------------------- /tests/benchmark/test_shape.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from krnnt.utils import shape 4 | import krnnt_utils 5 | 6 | @pytest.fixture 7 | def word(): 8 | return "ljhbasjk8f5IYTVIGHVaisftityvfiouyfO*86f97f697" 9 | 10 | @pytest.mark.slow 11 | def test_shape_regex(word, benchmark): 12 | benchmark(shape,word) 13 | 14 | @pytest.mark.slow 15 | def test_shape_cython(word, benchmark): 16 | benchmark(krnnt_utils.shape,word) -------------------------------------------------------------------------------- /tests/benchmark/test_tags.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from krnnt.features import TagsPreprocessor, TagsPreprocessorCython 4 | 5 | 6 | @pytest.fixture 7 | def tags(): 8 | return ['fin:sg:ter:imperf', 'subst:sg:nom:f'] 9 | 10 | 11 | @pytest.mark.slow 12 | def test_tags4(tags, benchmark): 13 | benchmark(TagsPreprocessor.create_tags4_without_guesser, tags) 14 | 15 | 16 | @pytest.mark.slow 17 | def test_tags4_cython(tags, benchmark): 18 | benchmark(TagsPreprocessorCython.create_tags4_without_guesser, tags) 19 | 20 | 21 | @pytest.mark.slow 22 | def test_tags5(tags, benchmark): 23 | benchmark(TagsPreprocessor.create_tags5_without_guesser, tags) 24 | 25 | 26 | @pytest.mark.slow 27 | def test_tags5_cython(tags, benchmark): 28 | benchmark(TagsPreprocessorCython.create_tags5_without_guesser, tags) 29 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | 4 | @pytest.fixture 5 | def rootdir(): 6 | return os.path.dirname(os.path.abspath(__file__)) -------------------------------------------------------------------------------- 
/tests/data/reference/gold-task-c_evaluation.txt: -------------------------------------------------------------------------------- 1 | ### FOLD 1: tests/data/small/gold-task-c.xml (tag) v. /tmp/out.xces (ref) 2 | PolEval 2017 competition scores 3 | ------------------------------- 4 | POS accuracy (Subtask A score): 33.1343% 5 | POS accuracy (known words): 33.1343% 6 | POS accuracy (unknown words): 0.0000% 7 | Lemmatization accuracy (Subtask B score): 51.3433% 8 | Lemmatization accuracy (known words): 51.3433% 9 | Lemmatization accuracy (unknown words): 0.0000% 10 | Overall accuracy (Subtask C score): 42.2388% 11 | ---- 12 | REF-toks 335 13 | KN 100.0000% 14 | KN_POS_SC_LOWER 53.1343% 15 | KN_SC_LOWER 33.1343% 16 | KN_SEG_CHANGE 0.8955% 17 | KN_SL_LOWER 51.3433% 18 | KN_WC_LOWER 34.0299% 19 | POS_SC_LOWER 53.1343% 20 | POS_WC_LOWER 53.1343% 21 | SC_LOWER 33.1343% 22 | SEG_CHANGE 0.8955% 23 | SEG_NOCHANGE 99.1045% 24 | SL_CASE_CAT_HEUR 51.3433% 25 | SL_LOWER 51.3433% 26 | SL_NOCASE_CAT_HEUR 54.3284% 27 | SL_NOCASE_LOWER 54.3284% 28 | UNK 0.0000% 29 | UNK_POS_SC_LOWER 0.0000% 30 | UNK_SC_LOWER 0.0000% 31 | UNK_SEG_CHANGE 0.0000% 32 | UNK_SL_LOWER 0.0000% 33 | UNK_WC_LOWER 0.0000% 34 | WC_LOWER 34.0299% 35 | WL_LOWER 51.3433% 36 | WC_UPPER 34.9254% 37 | AVG weak lemma lower bound 51.3433% 38 | AVG KN strong lemma lower bound 51.3433% 39 | AVG UNK strong lemma lower bound 0.0000% 40 | AVG strong lemma lower bound 51.3433% 41 | AVG strong lemma nocase lower bound 54.3284% 42 | AVG strong lemma case concat heur 51.3433% 43 | AVG strong lemma nocase concat heur 54.3284% 44 | AVG weak corr lower bound 34.0299% 45 | AVG weak corr upper bound 34.9254% 46 | AVG UNK weak corr lower bound 0.0000% 47 | AVG UNK weak corr upper bound 0.0000% 48 | AVG KN weak corr lower bound 34.0299% 49 | AVG KN weak corr upper bound 34.9254% 50 | AVG POS strong corr lower bound 53.1343% 51 | AVG percentage UNK 0.0000% 52 | AVG percentage seg change 0.8955% 53 | -------------------------------------------------------------------------------- /tests/data/reference/in_raw.txt: -------------------------------------------------------------------------------- 1 | Lubię placki. Ala ma kota. 2 | 3 | Raz dwa trzy. 
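A consistency note on gold-task-c_evaluation.txt above: for the numbers reported there, the Overall accuracy (Subtask C score) is exactly the arithmetic mean of the POS accuracy (Subtask A) and the lemmatization accuracy (Subtask B), which a short check confirms:

# quick sanity check of the scores in gold-task-c_evaluation.txt
pos_acc, lemma_acc = 33.1343, 51.3433
overall = (pos_acc + lemma_acc) / 2
assert abs(overall - 42.2388) < 1e-6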
-------------------------------------------------------------------------------- /tests/data/reference/lemmatisation_test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/lemmatisation_test.pkl -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2 -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues 
-------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2-reanalyzed.spickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.spickle -------------------------------------------------------------------------------- /tests/data/reference/nkjp1m-1.2.spickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2.spickle -------------------------------------------------------------------------------- /tests/data/reference/out.conll: -------------------------------------------------------------------------------- 1 | Lubię Lubię 1 adj:pl:nom:m1:pos 0 5 2 | placki placka 1 subst:pl:acc:f 6 12 3 | . . 0 interp 12 13 4 | 5 | Ala Ala 1 subst:sg:nom:f 14 17 6 | ma ma 1 subst:sg:nom:f 18 20 7 | kota kota 1 subst:sg:nom:f 21 25 8 | . . 0 interp 25 26 9 | 10 | 11 | Raz Raz 1 subst:sg:nom:f 0 3 12 | dwa dwa 1 adj:pl:acc:f:pos 4 7 13 | trzy trzy 1 subst:pl:acc:f 8 12 14 | . . 0 interp 12 13 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/data/reference/out.conllu: -------------------------------------------------------------------------------- 1 | 1 Lubię Lubię _ adj:pl:nom:m1:pos _ _ _ _ _ 2 | 2 placki placka _ subst:pl:acc:f _ _ _ _ _ 3 | 3 . . _ interp _ _ _ _ _ 4 | 5 | 1 Ala Ala _ subst:sg:nom:f _ _ _ _ _ 6 | 2 ma ma _ subst:sg:nom:f _ _ _ _ _ 7 | 3 kota kota _ subst:sg:nom:f _ _ _ _ _ 8 | 4 . . _ interp _ _ _ _ _ 9 | 10 | 11 | 1 Raz Raz _ subst:sg:nom:f _ _ _ _ _ 12 | 2 dwa dwa _ adj:pl:acc:f:pos _ _ _ _ _ 13 | 3 trzy trzy _ subst:pl:acc:f _ _ _ _ _ 14 | 4 . . _ interp _ _ _ _ _ 15 | 16 | 17 | -------------------------------------------------------------------------------- /tests/data/reference/out.jsonl: -------------------------------------------------------------------------------- 1 | [[["Lubię", "Lubię", "adj:pl:nom:m1:pos"], ["placki", "placka", "subst:pl:acc:f"], [".", ".", "interp"]], [["Ala", "Ala", "subst:sg:nom:f"], ["ma", "ma", "subst:sg:nom:f"], ["kota", "kota", "subst:sg:nom:f"], [".", ".", "interp"]]] 2 | [[["Raz", "Raz", "subst:sg:nom:f"], ["dwa", "dwa", "adj:pl:acc:f:pos"], ["trzy", "trzy", "subst:pl:acc:f"], [".", ".", "interp"]]] 3 | 4 | -------------------------------------------------------------------------------- /tests/data/reference/out.plain: -------------------------------------------------------------------------------- 1 | Lubię newline 2 | Lubię adj:pl:nom:m1:pos disamb 3 | placki space 4 | placka subst:pl:acc:f disamb 5 | . none 6 | . interp disamb 7 | 8 | Ala space 9 | Ala subst:sg:nom:f disamb 10 | ma space 11 | ma subst:sg:nom:f disamb 12 | kota space 13 | kota subst:sg:nom:f disamb 14 | . none 15 | . interp disamb 16 | 17 | 18 | Raz newline 19 | Raz subst:sg:nom:f disamb 20 | dwa space 21 | dwa adj:pl:acc:f:pos disamb 22 | trzy space 23 | trzy subst:pl:acc:f disamb 24 | . none 25 | . interp disamb 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/data/reference/out.xces: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Lubię 9 | Lubięadj:pl:nom:m1:pos 10 | 11 | 12 | placki 13 | plackasubst:pl:acc:f 14 | 15 | 16 | 17 | . 
18 | .interp 19 | 20 | 21 | 22 | 23 | Ala 24 | Alasubst:sg:nom:f 25 | 26 | 27 | ma 28 | masubst:sg:nom:f 29 | 30 | 31 | kota 32 | kotasubst:sg:nom:f 33 | 34 | 35 | 36 | . 37 | .interp 38 | 39 | 40 | 41 | 42 | 43 | 44 | Raz 45 | Razsubst:sg:nom:f 46 | 47 | 48 | dwa 49 | dwaadj:pl:acc:f:pos 50 | 51 | 52 | trzy 53 | trzysubst:pl:acc:f 54 | 55 | 56 | 57 | . 58 | .interp 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /tests/data/reference/weight_test.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5 -------------------------------------------------------------------------------- /tests/data/reference/weight_test.hdf5.final: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5.final -------------------------------------------------------------------------------- /tests/data/reference/weight_test.hdf5.new: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5.new -------------------------------------------------------------------------------- /tests/data/server/in_raw.txt: -------------------------------------------------------------------------------- 1 | Lubię placki. Ala ma kota. 2 | 3 | Raz dwa trzy. -------------------------------------------------------------------------------- /tests/data/server/in_tokenized.json: -------------------------------------------------------------------------------- 1 | { 2 | "documents": [ 3 | { 4 | "text": "Lubię placki. 
Ala ma kota.", 5 | "sentences": [ 6 | { 7 | "tokens": [ 8 | { 9 | "form": "Lubię", 10 | "separator": "newline", 11 | "start": 0, 12 | "end": 0 13 | }, 14 | { 15 | "form": "placki", 16 | "separator": "space", 17 | "start": 0, 18 | "end": 0 19 | }, 20 | { 21 | "form": ".", 22 | "separator": "none", 23 | "start": 0, 24 | "end": 0 25 | } 26 | ] 27 | }, 28 | { 29 | "tokens": [ 30 | { 31 | "form": "Ala", 32 | "separator": "space", 33 | "start": 0, 34 | "end": 0 35 | }, 36 | { 37 | "form": "ma", 38 | "separator": "space", 39 | "start": 0, 40 | "end": 0 41 | }, 42 | { 43 | "form": "kota", 44 | "separator": "space", 45 | "start": 0, 46 | "end": 0 47 | }, 48 | { 49 | "form": ".", 50 | "separator": "none", 51 | "start": 0, 52 | "end": 0 53 | } 54 | ] 55 | } 56 | ] 57 | }, 58 | { 59 | "text": "Raz dwa trzy.", 60 | "sentences": [ 61 | { 62 | "tokens": [ 63 | { 64 | "form": "Raz", 65 | "separator": "newline", 66 | "start": 0, 67 | "end": 0 68 | }, 69 | { 70 | "form": "dwa", 71 | "separator": "space", 72 | "start": 0, 73 | "end": 0 74 | }, 75 | { 76 | "form": "trzy", 77 | "separator": "space", 78 | "start": 0, 79 | "end": 0 80 | }, 81 | { 82 | "form": ".", 83 | "separator": "none", 84 | "start": 0, 85 | "end": 0 86 | } 87 | ] 88 | } 89 | ] 90 | } 91 | ] 92 | } -------------------------------------------------------------------------------- /tests/data/server/in_tokenized_compact.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | [["Lubię","newline"],["placki","space"],[".","none"]], 4 | [["Ala","space"],["ma","space"],["kota","space"],[".","none"]] 5 | ], 6 | [ 7 | [["Raz","newline"],["dwa","space"],["trzy","space"],[".","none"]] 8 | ] 9 | ] -------------------------------------------------------------------------------- /tests/data/server/out_raw.conll: -------------------------------------------------------------------------------- 1 | Lubię lubić 1 fin:sg:pri:imperf 0 5 2 | placki placek 1 subst:pl:acc:m3 6 12 3 | . . 0 interp 12 13 4 | 5 | Ala Ala 1 subst:sg:nom:f 14 17 6 | ma mieć 1 fin:sg:ter:imperf 18 20 7 | kota kot 1 subst:sg:acc:m2 21 25 8 | . . 0 interp 25 26 9 | 10 | 11 | Raz raz 1 subst:sg:nom:m3 0 3 12 | dwa dwa 1 num:pl:nom:m3:congr 4 7 13 | trzy trzy 1 num:pl:nom:m3:congr 8 12 14 | . . 0 interp 12 13 15 | 16 | -------------------------------------------------------------------------------- /tests/data/server/out_raw.conllu: -------------------------------------------------------------------------------- 1 | 1 Lubię lubić _ fin:sg:pri:imperf _ _ _ _ _ 2 | 2 placki placek _ subst:pl:acc:m3 _ _ _ _ _ 3 | 3 . . _ interp _ _ _ _ _ 4 | 5 | 1 Ala Ala _ subst:sg:nom:f _ _ _ _ _ 6 | 2 ma mieć _ fin:sg:ter:imperf _ _ _ _ _ 7 | 3 kota kot _ subst:sg:acc:m2 _ _ _ _ _ 8 | 4 . . _ interp _ _ _ _ _ 9 | 10 | 11 | 1 Raz raz _ subst:sg:nom:m3 _ _ _ _ _ 12 | 2 dwa dwa _ num:pl:nom:m3:congr _ _ _ _ _ 13 | 3 trzy trzy _ num:pl:nom:m3:congr _ _ _ _ _ 14 | 4 . . 
_ interp _ _ _ _ _ 15 | 16 | -------------------------------------------------------------------------------- /tests/data/server/out_raw.jsonl: -------------------------------------------------------------------------------- 1 | [[["Lubię", "lubić", "fin:sg:pri:imperf"], ["placki", "placek", "subst:pl:acc:m3"], [".", ".", "interp"]], [["Ala", "Ala", "subst:sg:nom:f"], ["ma", "mieć", "fin:sg:ter:imperf"], ["kota", "kot", "subst:sg:acc:m2"], [".", ".", "interp"]]] 2 | [[["Raz", "raz", "subst:sg:nom:m3"], ["dwa", "dwa", "num:pl:nom:m3:congr"], ["trzy", "trzy", "num:pl:nom:m3:congr"], [".", ".", "interp"]]] 3 | -------------------------------------------------------------------------------- /tests/data/server/out_raw.plain: -------------------------------------------------------------------------------- 1 | Lubię newline 2 | lubić fin:sg:pri:imperf disamb 3 | placki space 4 | placek subst:pl:acc:m3 disamb 5 | . none 6 | . interp disamb 7 | 8 | Ala space 9 | Ala subst:sg:nom:f disamb 10 | ma space 11 | mieć fin:sg:ter:imperf disamb 12 | kota space 13 | kot subst:sg:acc:m2 disamb 14 | . none 15 | . interp disamb 16 | 17 | Raz newline 18 | raz subst:sg:nom:m3 disamb 19 | dwa space 20 | dwa num:pl:nom:m3:congr disamb 21 | trzy space 22 | trzy num:pl:nom:m3:congr disamb 23 | . none 24 | . interp disamb 25 | 26 | -------------------------------------------------------------------------------- /tests/data/server/out_raw.xces: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Lubię 9 | lubićfin:sg:pri:imperf 10 | 11 | 12 | placki 13 | placeksubst:pl:acc:m3 14 | 15 | 16 | 17 | . 18 | .interp 19 | 20 | 21 | 22 | 23 | Ala 24 | Alasubst:sg:nom:f 25 | 26 | 27 | ma 28 | miećfin:sg:ter:imperf 29 | 30 | 31 | kota 32 | kotsubst:sg:acc:m2 33 | 34 | 35 | 36 | . 37 | .interp 38 | 39 | 40 | 41 | 42 | 43 | 44 | Raz 45 | razsubst:sg:nom:m3 46 | 47 | 48 | dwa 49 | dwanum:pl:nom:m3:congr 50 | 51 | 52 | trzy 53 | trzynum:pl:nom:m3:congr 54 | 55 | 56 | 57 | . 58 | .interp 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /tests/data/small/00132482.ann.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Gdzie 8 | gdzieadv 9 | 10 | 11 | kupiła 12 | kupićpraet:sg:f:perf 13 | 14 | 15 | 16 | ś 17 | byćaglt:sg:sec:imperf:nwok 18 | 19 | 20 | łańcuszek 21 | łańcuszeksubst:sg:acc:m3 22 | 23 | 24 | 25 | ? 26 | ?interp 27 | 28 | 29 | 30 | 31 | : 32 | :)interj 33 | 34 | 35 | 36 | ) 37 | )blank 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/data/small/gold-task-c.txt: -------------------------------------------------------------------------------- 1 | Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. 
- Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik. 2 | Przestrzeń dzielącą je od kolejnego skłonu schodów pokonało, kolebiąc się na boki, rozkołysanym kaczym chodem. Najdziwniejsze jednak było to, co nastąpiło potem. Jego wspinanie się na stopień. Mianowicie najpierw przed nim stanęło, niemal doń przywarło. Samo zresztą było niewiele od niego wyższe. A potem z olbrzymim wysiłkiem zaczęło się nań wspinać, a kiedy betonowa krawędź była już w połowie jego wysokości, ostrożnie się pochylając powoli przeważyło ciężar ciała na poziomą płaszczyznę stopnia. Jakby nie mogło się zginać, jakby kręgosłup miało całkiem zesztywniały. W końcu udało się. Z lekkim stukotem opadło na brzuch. Leżąc tak, wydało z siebie właśnie to jedyne w swoim rodzaju cichutkie jęknięcie. Osiągnąwszy tę fazę wspinaczki, przeszło po chwili do następnego etapu. Ciągle leżąc, zaczęło się czołgać dalej, aż do chwili kiedy środek ciężkości, w ogóle całe ciało, całkowicie znalazło się na stopniu. -------------------------------------------------------------------------------- /tests/download_model.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd .. 3 | mkdir model_data -p 4 | cd model_data 5 | 6 | if [ ! -f "weights.hdf5" ]; then 7 | wget "https://github.com/kwrobel-nlp/krnnt/releases/download/poleval/reanalyze_150epochs_train1.0.zip" 8 | unzip reanalyze_150epochs_train1.0.zip 9 | mv lemmatisation_reana150_1.0.pkl lemmatisation.pkl 10 | mv weights_reana150_1.0.hdf5 weights.hdf5 11 | fi -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #TODO pytest-shell 4 | 5 | #version of morfeusz dictionary may influence results 6 | 7 | MACA_CONFIG=morfeusz2-nkjp 8 | 9 | cd .. 10 | 11 | python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle 12 | echo $? 13 | diff /tmp/nkjp.spickle tests/data/reference/nkjp1m-1.2.spickle 14 | 15 | python3 reanalyze.py --maca_config $MACA_CONFIG /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle 16 | echo $? 17 | diff /tmp/nkjp-reanalyzed.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.spickle 18 | 19 | python3 shuffle.py /tmp/nkjp-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle 20 | echo $? 21 | diff /tmp/nkjp-reanalyzed.shuf.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle 22 | 23 | rm /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues 24 | CUDA_VISIBLE_DEVICES="" PYTHONHASHSEED=0 python3 krnnt_train.py --maca_config $MACA_CONFIG /tmp/nkjp-reanalyzed.shuf.spickle -e 2 --reproducible --hash test 25 | echo $? 
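# Reproducibility notes for the training step above: CUDA_VISIBLE_DEVICES="" hides all
# GPUs so the Keras model trains on the CPU, and PYTHONHASHSEED=0 fixes Python's hash
# randomization; together with --reproducible and --hash test this keeps the produced
# weight_test.hdf5* and lemmatisation_test.pkl files comparable with the reference
# outputs checked below (h5diff for HDF5 weights, diff for the rest).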
26 | h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5 27 | h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final 28 | diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl 29 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2 30 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData 31 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues 32 | 33 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces 34 | echo $? 35 | diff /tmp/out.xces tests/data/reference/out.xces 36 | 37 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o plain > /tmp/out.plain 38 | echo $? 39 | diff /tmp/out.plain tests/data/reference/out.plain 40 | 41 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conll > /tmp/out.conll 42 | echo $? 43 | diff /tmp/out.conll tests/data/reference/out.conll 44 | 45 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conllu > /tmp/out.conllu 46 | echo $? 47 | diff /tmp/out.conllu tests/data/reference/out.conllu 48 | 49 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o jsonl > /tmp/out.jsonl 50 | echo $? 
51 | diff /tmp/out.jsonl tests/data/reference/out.jsonl 52 | -------------------------------------------------------------------------------- /tests/test_aglt.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from krnnt.aglt import rewrite_praet, remove_aglt, rule1, rule3, rule1b 4 | 5 | paragraph = [ 6 | [ 7 | {'token': 'Zrobił', 'sep': 'newline', 'tag': 'praet:sg:m1:perf', 8 | 'lemmas': ['zrobić'], 'start': 0, 'end': 6}, 9 | {'token': 'by', 'sep': 'none', 'tag': 'qub', 'lemmas': ['by'], 10 | 'start': 6, 'end': 8}, 11 | {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok', 12 | 'lemmas': ['być'], 'start': 8, 'end': 9}, 13 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n', 14 | 'lemmas': ['to'], 'start': 10, 'end': 12}, 15 | {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'], 16 | 'start': 12, 'end': 13} 17 | ], 18 | [ 19 | {'token': 'Czy', 'sep': 'space', 'tag': 'qub', 'lemmas': ['czy'], 20 | 'start': 14, 'end': 17}, 21 | {'token': 'by', 'sep': 'space', 'tag': 'qub', 'lemmas': ['by'], 22 | 'start': 18, 'end': 20}, 23 | {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok', 24 | 'lemmas': ['być'], 'start': 20, 'end': 21}, 25 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n', 26 | 'lemmas': ['to'], 'start': 22, 'end': 24}, 27 | {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf', 28 | 'lemmas': ['zrobić'], 'start': 25, 'end': 31}, 29 | {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'], 30 | 'start': 31, 'end': 32} 31 | ], 32 | [ 33 | {'token': 'Zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf', 34 | 'lemmas': ['zrobić'], 'start': 33, 'end': 39}, 35 | {'token': 'em', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:wok', 36 | 'lemmas': ['być'], 'start': 39, 'end': 41}, 37 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n', 38 | 'lemmas': ['to'], 'start': 42, 'end': 44}, 39 | {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'], 40 | 'start': 44, 'end': 45} 41 | ], 42 | [ 43 | {'token': 'Aby', 'sep': 'space', 'tag': 'comp', 'lemmas': ['aby'], 44 | 'start': 46, 'end': 49}, 45 | {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok', 46 | 'lemmas': ['być'], 'start': 49, 'end': 50}, 47 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n', 48 | 'lemmas': ['to'], 'start': 51, 'end': 53}, 49 | {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf', 50 | 'lemmas': ['zrobić'], 'start': 54, 'end': 60}, 51 | {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'], 52 | 'start': 60, 'end': 61} 53 | ], 54 | [ 55 | {'token': 'Zrobił', 'sep': 'newline', 'tag': 'praet:sg:m1:perf', 56 | 'lemmas': ['zrobić'], 'start': 0, 'end': 6}, 57 | {'token': 'by', 'sep': 'none', 'tag': 'qub', 'lemmas': ['by'], 58 | 'start': 6, 'end': 8}, 59 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n', 60 | 'lemmas': ['to'], 'start': 9, 'end': 11}, 61 | {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'], 62 | 'start': 11, 'end': 12} 63 | ], 64 | [ 65 | {'token': 'Czy', 'sep': 'space', 'tag': 'qub', 'lemmas': ['czy'], 66 | 'start': 14, 'end': 17}, 67 | {'token': 'by', 'sep': 'space', 'tag': 'qub', 'lemmas': ['by'], 68 | 'start': 18, 'end': 20}, 69 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n', 70 | 'lemmas': ['to'], 'start': 21, 'end': 23}, 71 | {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf', 72 | 'lemmas': ['zrobić'], 'start': 24, 'end': 30}, 73 | {'token': '?', 'sep': 'none', 'tag': 
'interp', 'lemmas': ['?'], 74 | 'start': 30, 'end': 31} 75 | ] 76 | ] 77 | 78 | 79 | def test_rewrite_praet(): 80 | sentence1 = copy.deepcopy(paragraph[2]) 81 | 82 | rewrite_praet(sentence1[1], sentence1[0]) 83 | assert sentence1[0]['tag'] == 'praet:sg:m1:pri:perf' 84 | 85 | 86 | def test_rewrite_cond(): 87 | sentence1 = copy.deepcopy(paragraph[0]) 88 | rewrite_praet(sentence1[2], sentence1[0], sentence1[1]) 89 | assert sentence1[0]['tag'] == 'cond:sg:m1:pri:perf' 90 | 91 | def test_rewrite_cond2(): 92 | sentence1 = copy.deepcopy(paragraph[4]) 93 | rewrite_praet(None, sentence1[0], sentence1[1]) 94 | assert sentence1[0]['tag'] == 'cond:sg:m1:ter:perf' 95 | 96 | def test_rule1_cond(): 97 | sentence1 = copy.deepcopy(paragraph[0]) 98 | 99 | remove_aglt(sentence1, [rule1]) 100 | print(sentence1) 101 | assert sentence1[0]['tag'] == 'cond:sg:m1:pri:perf' 102 | assert sentence1[1]['token'] != 'by' 103 | assert sentence1[2]['token'] != 'm' 104 | assert sentence1[0]['token'] == 'Zrobiłbym' 105 | assert sentence1[0]['end'] == 9 106 | 107 | 108 | def test_rule1_praet(): 109 | sentence1 = copy.deepcopy(paragraph[2]) 110 | 111 | remove_aglt(sentence1, [rule1]) 112 | print(sentence1) 113 | assert sentence1[0]['tag'] == 'praet:sg:m1:pri:perf' 114 | assert sentence1[1]['token'] != 'm' 115 | assert sentence1[0]['token'] == 'Zrobiłem' 116 | assert sentence1[0]['end'] == 41 117 | 118 | def test_rule3_1(): 119 | sentence1 = copy.deepcopy(paragraph[1]) 120 | 121 | print(sentence1) 122 | remove_aglt(sentence1, [rule1, rule3]) 123 | print(sentence1) 124 | assert sentence1[3]['tag'] == 'cond:sg:m1:pri:perf' 125 | assert sentence1[1]['token'] == 'bym' 126 | assert sentence1[1]['end'] == 21 127 | 128 | def test_rule3_2(): 129 | sentence1 = copy.deepcopy(paragraph[3]) 130 | 131 | remove_aglt(sentence1, [rule1, rule3]) 132 | print(sentence1) 133 | assert sentence1[2]['tag'] == 'praet:sg:m1:pri:perf' 134 | assert sentence1[0]['token'] == 'Abym' 135 | assert sentence1[0]['end'] == 50 136 | 137 | def test_rule3_3(): 138 | sentence1 = copy.deepcopy(paragraph[4]) 139 | 140 | remove_aglt(sentence1, [rule1b, rule3]) 141 | print(sentence1) 142 | assert sentence1[0]['tag'] == 'cond:sg:m1:ter:perf' 143 | assert sentence1[0]['token'] == 'Zrobiłby' 144 | assert sentence1[0]['end'] == 8 145 | assert sentence1[1]['token'] != 'by' 146 | 147 | def test_rule3_4(): 148 | sentence1 = copy.deepcopy(paragraph[5]) 149 | 150 | remove_aglt(sentence1, [rule1b, rule3]) 151 | print(sentence1) 152 | assert sentence1[3]['tag'] == 'cond:sg:m1:ter:perf' 153 | assert sentence1[3]['token'] == 'zrobił' 154 | -------------------------------------------------------------------------------- /tests/test_analyzers.py: -------------------------------------------------------------------------------- 1 | from krnnt.analyzers import MacaAnalyzer 2 | from krnnt.structure import Form 3 | 4 | reference_maca_output = \ 5 | '''Lubię newline 6 | lubić fin:sg:pri:imperf 7 | pociągi space 8 | pociąg subst:pl:nom:m3 9 | pociąg subst:pl:acc:m3 10 | pociąg subst:pl:voc:m3 11 | . none 12 | . interp''' 13 | 14 | paragraph_raw = 'Lubię pociągi.' 
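# reference_maca_output above is Maca's plain output for paragraph_raw: each token
# contributes a line with its orthographic form and the separator that precedes it
# (newline/space/none), followed by one line per morphological interpretation with
# lemma and tag; _parse() in test_parse below turns this into
# (form, separator, interpretations, start, end) tuples.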
15 | 16 | MACA_CONFIG1='morfeusz-nkjp-official' 17 | MACA_CONFIG2='morfeusz2-nkjp' 18 | 19 | def test_maca(): 20 | try: 21 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 22 | results = maca_analyzer._maca(paragraph_raw) 23 | results = list(results) 24 | except: 25 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 26 | results = maca_analyzer._maca(paragraph_raw) 27 | results = list(results) 28 | 29 | assert len(results) == 1 30 | assert results[0] == reference_maca_output 31 | 32 | def test_maca_process(): 33 | try: 34 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 35 | results = maca_analyzer._maca_process(paragraph_raw) 36 | results = list(results) 37 | except: 38 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 39 | results = maca_analyzer._maca_process(paragraph_raw) 40 | results = list(results) 41 | 42 | assert len(results) == 1 43 | assert results[0] == reference_maca_output 44 | 45 | def test_maca_wrapper(): 46 | try: 47 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 48 | results = maca_analyzer._maca_wrapper(paragraph_raw) 49 | results = list(results) 50 | except: 51 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 52 | results = maca_analyzer._maca_wrapper(paragraph_raw) 53 | results = list(results) 54 | 55 | assert len(results) == 1 56 | assert results[0] == reference_maca_output 57 | 58 | def test_parse(): 59 | maca_analyzer = MacaAnalyzer('') 60 | maca_analyzer.text = paragraph_raw 61 | maca_analyzer.last_offset = 0 62 | result = maca_analyzer._parse(reference_maca_output) 63 | 64 | reference = [ 65 | ('Lubię', 'newline', 66 | [('lubić', 'fin:sg:pri:imperf')],0,5), 67 | ('pociągi', 'space', 68 | [('pociąg', 'subst:pl:nom:m3'), 69 | ('pociąg', 'subst:pl:acc:m3'), 70 | ('pociąg', 'subst:pl:voc:m3')],6,13), 71 | ('.', 'none', 72 | [('.', 'interp')], 13,14)] 73 | 74 | assert result == reference 75 | 76 | def test_maca_analyzer(): 77 | try: 78 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 79 | result = maca_analyzer.analyze(paragraph_raw) 80 | except: 81 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 82 | result = maca_analyzer.analyze(paragraph_raw) 83 | 84 | assert len(result.sentences)==1 85 | assert len(result.sentences[0].tokens) == 3 86 | 87 | assert result.sentences[0].tokens[0].form == 'Lubię' 88 | assert result.sentences[0].tokens[0].space_before == 'newline' 89 | assert len(result.sentences[0].tokens[0].interpretations) == 1 90 | 91 | assert result.sentences[0].tokens[1].form == 'pociągi' 92 | assert result.sentences[0].tokens[1].space_before == 'space' 93 | assert len(result.sentences[0].tokens[1].interpretations) == 3 94 | 95 | assert result.sentences[0].tokens[2].form == '.' 96 | assert result.sentences[0].tokens[2].space_before == 'none' 97 | assert len(result.sentences[0].tokens[2].interpretations) == 1 98 | 99 | assert result.sentences[0].tokens[1].interpretations[0] == Form('pociąg', 'subst:pl:nom:m3') 100 | assert result.sentences[0].tokens[1].interpretations[1] == Form('pociąg', 'subst:pl:acc:m3') 101 | assert result.sentences[0].tokens[1].interpretations[2] == Form('pociąg', 'subst:pl:voc:m3') 102 | 103 | 104 | def test_maca_analyzer_lemmas(): 105 | paragraph_raw='Ala ma kota.' 
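    # Morfeusz can return homonym-marked lemma variants (e.g. 'kot:s1', 'kot:s2');
    # the assertions below check that only the bare lemma 'kot' is exposed.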
106 | try: 107 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 108 | result = maca_analyzer.analyze(paragraph_raw) 109 | except: 110 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 111 | result = maca_analyzer.analyze(paragraph_raw) 112 | 113 | lemmas =[form.lemma for form in result.sentences[0].tokens[2].interpretations] 114 | assert 'kot' in lemmas 115 | assert 'kot:s1' not in lemmas 116 | assert 'kot:s2' not in lemmas 117 | 118 | -------------------------------------------------------------------------------- /tests/test_blank.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from krnnt.aglt import rewrite_praet, remove_aglt, rule1, rule3, rule1b 4 | from krnnt.blanks import remove_blanks 5 | 6 | sentence = [ 7 | 8 | {'token': '200', 'sep': 'newline', 'tag': 'num:pl:nom:m2:rec', 9 | 'lemmas': ['200'], 'start': 0, 'end': 3}, 10 | {'token': '.', 'sep': 'none', 'tag': 'blank', 'lemmas': ['.'], 11 | 'start': 3, 'end': 4}, 12 | {'token': '000', 'sep': 'none', 'tag': 'blank', 13 | 'lemmas': ['000'], 'start': 4, 'end': 7}, 14 | {'token': 'zł', 'sep': 'space', 'tag': 'brev:npun', 15 | 'lemmas': ['złoty'], 'start': 8, 'end': 10} 16 | ] 17 | 18 | 19 | def test_remove_blanks(): 20 | sentence1 = copy.deepcopy(sentence) 21 | remove_blanks(sentence1) 22 | print(sentence1) 23 | 24 | assert len(sentence1)==2 25 | 26 | 27 | assert sentence1[0]['tag'] == 'num:pl:nom:m2:rec' 28 | assert sentence1[0]['token'] == '200.000' 29 | assert sentence1[0]['start'] == 0 30 | assert sentence1[0]['end'] == 7 31 | 32 | assert sentence1[1] == sentence[-1] -------------------------------------------------------------------------------- /tests/test_features.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from krnnt.features import FeaturePreprocessor, TagsPreprocessorCython, TagsPreprocessor, create_token_features 4 | 5 | 6 | @pytest.fixture 7 | def token(): 8 | return 'asd' 9 | 10 | 11 | def test_nic(token): 12 | assert ["NIC"] == FeaturePreprocessor.nic(token) 13 | 14 | 15 | def test_interps(): 16 | assert ["."] == FeaturePreprocessor.interps('.', {'tags': ['interp']}) 17 | assert [] == FeaturePreprocessor.interps('.', {'tags': ['subst']}) 18 | assert [] == FeaturePreprocessor.interps(':)', {'tags': ['interp']}) 19 | 20 | 21 | def test_prefix1(): 22 | assert ["P0k"] == FeaturePreprocessor.prefix1('kot') 23 | assert ["P0??"] == FeaturePreprocessor.prefix1('©kot') 24 | assert ["P0k"] == FeaturePreprocessor.prefix1('KOT') 25 | 26 | 27 | def test_prefix2(): 28 | assert ["P1o"] == FeaturePreprocessor.prefix2('kot') 29 | assert ["P1xx"] == FeaturePreprocessor.prefix2('k') 30 | 31 | 32 | def test_prefix3(): 33 | assert ["P2t"] == FeaturePreprocessor.prefix3('kot') 34 | 35 | 36 | def test_suffix1(): 37 | assert ["S1t"] == FeaturePreprocessor.suffix1('kot') 38 | assert ["S1??"] == FeaturePreprocessor.suffix1('kot©') 39 | 40 | 41 | def test_suffix2(): 42 | assert ["S2o"] == FeaturePreprocessor.suffix2('kot') 43 | assert ["S2xx"] == FeaturePreprocessor.suffix2('k') 44 | 45 | 46 | def test_suffix3(): 47 | assert ["S3k"] == FeaturePreprocessor.suffix3('kot') 48 | 49 | 50 | def test_qubliki(): 51 | assert [] == FeaturePreprocessor.qubliki('kot') 52 | assert ['ale'] == FeaturePreprocessor.qubliki('ale') 53 | assert ['ale'] == FeaturePreprocessor.qubliki('Ale') 54 | 55 | 56 | @pytest.mark.parametrize('token, expected', [('wrobel', 'l'), 57 | ('Wrobel', 'ul'), 58 | ('WROBEL', 'u'), 59 | ('2019', 'd'), 60 | ('Wrobel2019', 
'uld'), 61 | ('Wrobel2019:)', 'uldx')]) 62 | def test_shape(token, expected): 63 | features = FeaturePreprocessor.shape(token) 64 | assert features[0] == expected 65 | assert len(features) == 1 66 | 67 | 68 | @pytest.mark.parametrize('tags, expected', [ 69 | (['fin:sg:ter:imperf', 'subst:sg:nom:f'], ['1fin:ter', '2fin:sg:imperf', '1subst:nom', 70 | '2subst:sg:f']), 71 | (['adjp:dat'], ['1adjp:dat', '2adjp']), 72 | (['interp'], ['1interp', '2interp']), 73 | ([''], ['1', '2']), 74 | ([], [])]) 75 | def test_tags4(tags, expected): 76 | assert TagsPreprocessor.create_tags4_without_guesser(tags) == expected 77 | assert TagsPreprocessorCython.create_tags4_without_guesser(tags) == expected 78 | 79 | 80 | @pytest.mark.parametrize('tags, expected', [ 81 | (['fin:sg:ter:imperf', 'subst:sg:nom:f'], ['sg', 'sg:nom:f', 'nom']), 82 | (['adjp:dat'], ['dat']), 83 | (['interp'], []), 84 | ([''], []), 85 | ([], [])]) 86 | def test_tags5(tags, expected): 87 | assert TagsPreprocessor.create_tags5_without_guesser(tags) == expected 88 | assert TagsPreprocessorCython.create_tags5_without_guesser(tags) == expected 89 | 90 | 91 | def test_create_token_features(benchmark): 92 | token = 'obejmie' 93 | tags = ['subst:sg:loc:m3', 'subst:sg:voc:m3', 'subst:sg:dat:f', 'subst:sg:loc:f', 94 | 'fin:sg:ter:perf'] 95 | space_before = ['space_before'] 96 | features=['l', 'P0o', 'P1b', 'P2e', 'S1e', 'S2i', 'S3m', '1subst:loc', '2subst:sg:m3', 97 | '1subst:voc', '1subst:dat', '2subst:sg:f', '1fin:ter', '2fin:sg:perf', 'sg:loc:m3', 'loc', 98 | 'sg:voc:m3', 'voc', 'sg:dat:f', 'dat', 'sg:loc:f', 'sg', 'space_before'] 99 | 100 | result_features = create_token_features(token, tags, space_before) 101 | assert result_features == features 102 | -------------------------------------------------------------------------------- /tests/test_morfeusz.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from krnnt.analyzers import MacaAnalyzer 4 | from krnnt.new import get_morfeusz, analyze_tokenized, analyze_token 5 | from krnnt.structure import Form 6 | 7 | reference_maca_output = \ 8 | '''Lubię newline 9 | lubić fin:sg:pri:imperf 10 | pociągi space 11 | pociąg subst:pl:nom:m3 12 | pociąg subst:pl:acc:m3 13 | pociąg subst:pl:voc:m3 14 | . none 15 | . interp''' 16 | 17 | paragraph_raw = 'Lubię pociągi.' 
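# Note: test_maca_analyzer below is a diagnostic rather than a strict check - it runs
# Maca and Morfeusz over data/full/test-raw.txt and only prints tokens whose analysis
# sets differ between the two, without asserting on the differences.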
18 | 19 | MACA_CONFIG1='morfeusz-nkjp-official' 20 | MACA_CONFIG2='morfeusz2-nkjp' 21 | 22 | def test_maca(): 23 | try: 24 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 25 | results = maca_analyzer._maca(paragraph_raw) 26 | results = list(results) 27 | except: 28 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 29 | results = maca_analyzer._maca(paragraph_raw) 30 | results = list(results) 31 | 32 | assert len(results) == 1 33 | assert results[0] == reference_maca_output 34 | 35 | 36 | def test_maca_analyzer(rootdir): 37 | try: 38 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 39 | result = maca_analyzer.analyze(paragraph_raw) 40 | except: 41 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 42 | result = maca_analyzer.analyze(paragraph_raw) 43 | 44 | lines = [] 45 | for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')): 46 | line = line.strip() 47 | if not line: continue 48 | lines.append(line) 49 | 50 | morfeusz = get_morfeusz() 51 | 52 | 53 | 54 | for line in lines: 55 | paragraph = maca_analyzer.analyze(line) 56 | for sentence in paragraph: 57 | for token in sentence: 58 | 59 | maca_tags=[(form.lemma, form.tags) for form in token.interpretations] 60 | morfeusz_tags=analyze_token(morfeusz, token.form) 61 | maca_tags=set(maca_tags) 62 | morfeusz_tags=set(morfeusz_tags) 63 | if maca_tags!=morfeusz_tags: 64 | print(token) 65 | print(sorted(maca_tags-morfeusz_tags)) 66 | print(sorted(morfeusz_tags-maca_tags)) -------------------------------------------------------------------------------- /tests/test_parallel_api_speed.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import os 3 | 4 | import pytest 5 | import requests 6 | 7 | 8 | def test_api(rootdir): 9 | url = 'http://localhost:9003' 10 | 11 | for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')): 12 | line=line.strip() 13 | if not line: continue 14 | 15 | tag('http://localhost:9003', line) 16 | 17 | def tag(url, data): 18 | payload = data.encode('utf-8') 19 | r = requests.post(url, data=payload) 20 | return r 21 | 22 | def chunk(l, batch_size): 23 | batch = [] 24 | for element in l: 25 | batch.append(element) 26 | if len(batch) == batch_size: 27 | yield batch 28 | batch = [] 29 | if batch: 30 | yield batch 31 | 32 | @pytest.mark.slow 33 | @pytest.mark.parametrize('chunk_size', [100000, 10000, 1000, 100, 10, 4, 2,1]) 34 | def test_parallel_api(rootdir, chunk_size): 35 | print(rootdir, chunk_size) 36 | 37 | lines=[] 38 | for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')): 39 | line = line.strip() 40 | if not line: continue 41 | lines.append(line) 42 | 43 | batches = list(chunk(lines, chunk_size)) 44 | print(len(batches)) 45 | 46 | with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: 47 | future_to_url = {executor.submit(tag, 'http://localhost:9003', "\n\n".join(batch)): "\n\n".join(batch) for batch in batches} 48 | for future in concurrent.futures.as_completed(future_to_url): 49 | r=future.result() 50 | # print(r.text) 51 | 52 | @pytest.mark.slow 53 | @pytest.mark.parametrize('chunk_size', [100000,10,1]) 54 | def test_parallel_api_maca(rootdir, chunk_size): 55 | lines=[] 56 | for line in open(os.path.join(rootdir, 'data/full/train-raw.txt')): 57 | line = line.strip() 58 | if not line: continue 59 | lines.append(line) 60 | 61 | batches = list(chunk(lines, chunk_size)) 62 | 63 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: 64 | future_to_url = {executor.submit(tag, 'http://localhost:9003/maca/', 
"\n\n".join(batch)): "\n\n".join(batch) for batch in batches} 65 | for future in concurrent.futures.as_completed(future_to_url): 66 | r=future.result() 67 | -------------------------------------------------------------------------------- /tests/test_process_xces.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from krnnt.aligner import align_paragraphs 4 | from krnnt.analyzers import MacaAnalyzer 5 | from krnnt.readers import read_xces 6 | 7 | #TODO parametrize? 8 | 9 | 10 | 11 | def test_different_xces_formats(rootdir): 12 | data = { 13 | os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7], 14 | os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6], 15 | os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12], 16 | os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25], 17 | os.path.join(rootdir, 'data/small/00130846.xml'): [25], 18 | os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2], 19 | os.path.join(rootdir, 'data/small/00132482.xml'): [2] 20 | } 21 | 22 | for path, paragraph_lenghts in data.items(): 23 | assert paragraph_lenghts == [len(paragraph.sentences) for paragraph in read_xces(path)] 24 | for paragraph in read_xces(path): 25 | print(paragraph.text()) 26 | 27 | for sentence in paragraph: 28 | for token in sentence: 29 | print(token) 30 | print() 31 | 32 | def test_reanalyze(rootdir): 33 | data = { 34 | os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7], 35 | os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6], 36 | os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12], 37 | os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25], 38 | os.path.join(rootdir, 'data/small/00130846.xml'): [25], 39 | os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2], 40 | os.path.join(rootdir, 'data/small/00132482.xml'): [2] 41 | } 42 | 43 | for path, paragraph_lenghts in data.items(): 44 | # assert paragraph_lenghts == [len(paragraph.sentences) for paragraph in read_xces(path)] 45 | maca_analyzer = MacaAnalyzer('morfeusz2-nkjp') 46 | for paragraph in read_xces(path): 47 | paragraph_raw = paragraph.text() 48 | 49 | paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw) 50 | 51 | print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph.sentences)) 52 | 53 | paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph) 54 | for sentence in paragraph_reanalyzed: 55 | for token in sentence: 56 | print(token) 57 | print() 58 | -------------------------------------------------------------------------------- /tests/test_speed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | MACA_CONFIG=morfeusz2-nkjp 4 | 5 | 6 | time cat tests/data/full/test-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces 7 | #12s 8 | 9 | time cat tests/data/full/train-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces 10 | #7m16s 11 | 12 | #one thread 13 | time cat tests/data/full/test-raw.txt | 
CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces 14 | #22s 15 | 16 | #GPU 1050GTX 17 | #train 18 | #5m12s 19 | 20 | #time maca-analyse -c morfeusz2-nkjp < tests/data/full/train-raw.txt > /dev/null 21 | #35s 22 | 23 | #time maca-analyse -c morfeusz2-nkjp < tests/data/full/test-raw.txt > /dev/null 24 | #0.9s 25 | 26 | #maca per line test-raw.txt 27 | #45s 28 | 29 | #i tak zrównolegla 30 | 31 | # test-raw.txt API 1w1t GPU 44s 32 | # test-raw.txt API 1w2t GPU 44s 33 | # test-raw.txt API 2w1t GPU 44s 34 | 35 | # test-raw.txt API 1w1t CPU 43s 36 | # test-raw.txt API 1w2t CPU 43s 37 | # test-raw.txt API 2w1t CPU 42s 38 | 39 | # pool=2 test-raw.txt API 1w1t CPU 29s 40 | # pool=2 test-raw.txt API 1w2t CPU 28s 41 | # pool=2 test-raw.txt API 2w1t CPU 25s 42 | 43 | # pool=2 test-raw.txt API 1w1t GPU 21s 44 | # pool=2 test-raw.txt API 1w2t GPU 30s 45 | # pool=2 test-raw.txt API 2w1t GPU 23s 46 | 47 | # pool=10 test-raw.txt API 1w1t CPU 20s 48 | # pool=10 test-raw.txt API 1w2t CPU 20s 49 | # pool=10 test-raw.txt API 2w1t CPU 17s 50 | # pool=10 test-raw.txt API 4w1t CPU 15s 51 | # pool=10 test-raw.txt API 4w2t CPU 16s 52 | # pool=10 test-raw.txt API 8w1t CPU 16s 53 | # pool=100 test-raw.txt API 10w1t CPU 14s 54 | # pool=100 test-raw.txt API 20w1t CPU 14s 55 | 56 | # pool=10 test-raw.txt API 1w1t GPU 21s 57 | # pool=10 test-raw.txt API 1w2t GPU 21s 58 | # pool=10 test-raw.txt API 2w1t GPU 14s 59 | # pool=10 test-raw.txt API 4w1t GPU OOM -------------------------------------------------------------------------------- /tests/test_structure.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from krnnt.readers import read_xces 4 | 5 | 6 | def test_paragraph_text(rootdir): 7 | data = { 8 | os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7], 9 | os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6], 10 | os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12], 11 | os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25], 12 | os.path.join(rootdir, 'data/small/00130846.xml'): [25], 13 | os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2], 14 | os.path.join(rootdir, 'data/small/00132482.xml'): [2] 15 | } 16 | 17 | for path, paragraph_lenghts in data.items(): 18 | print(path) 19 | for paragraph in read_xces(path): 20 | paragraph_raw = '' 21 | for sentence_gold in paragraph: 22 | paragraph_raw += sentence_gold.text() 23 | paragraph_raw = paragraph_raw[1:] 24 | assert paragraph_raw == paragraph.text() 25 | -------------------------------------------------------------------------------- /tests/test_system.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | def test_download_model(bash, rootdir): 5 | commands = [ 6 | 'cd %s' % rootdir, 7 | './download_model.sh' 8 | ] 9 | 10 | with bash() as s: 11 | for command in commands: 12 | s.run_script_inline([command]) 13 | 14 | 15 | def test_process_xces(bash, rootdir): 16 | commands = [ 17 | 'cd %s' % rootdir, 18 | 'cd ..', 19 | 'python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle', 20 | 'diff /tmp/nkjp.spickle tests/data/reference/nkjp1m-1.2.spickle'] 21 | 22 | for command in commands: 23 | bash.run_script_inline([command]) 24 | 25 | 26 | 
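# The remaining system tests follow the same pattern: run one of the command-line
# scripts through the bash fixture with a fixed MACA_CONFIG and compare the produced
# files against the references under tests/data/reference (diff, or h5diff for the
# HDF5 weight files).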
@pytest.mark.xfail(reason="version of morfeusz dictionary may influence results") 27 | def test_reanalyze(bash, rootdir): 28 | commands = [ 29 | 'cd %s' % rootdir, 30 | 'cd ..', 31 | 'python3 reanalyze.py --maca_config $MACA_CONFIG /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle', 32 | 'diff /tmp/nkjp-reanalyzed.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.spickle' 33 | ] 34 | 35 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s: 36 | for command in commands: 37 | print(command) 38 | s.run_script_inline([command]) 39 | 40 | 41 | def test_shuffle(bash, rootdir): 42 | commands = [ 43 | 'cd %s' % rootdir, 44 | 'cd ..', 45 | 'python3 shuffle.py tests/data/reference/nkjp1m-1.2-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle', 46 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle' 47 | ] 48 | 49 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s: 50 | for command in commands: 51 | s.run_script_inline([command]) 52 | 53 | 54 | def test_preprocess(bash, rootdir): 55 | commands = [ 56 | 'cd %s' % rootdir, 57 | 'cd ..', 58 | 'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData', 59 | 'python3 preprocess_data.py /tmp/nkjp-reanalyzed.shuf.spickle /tmp/nkjp-reanalyzed.shuf.spickle.preprocess', 60 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle.preprocess tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess', 61 | ] 62 | 63 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s: 64 | for command in commands: 65 | s.run_script_inline([command]) 66 | 67 | 68 | def test_create_dict(bash, rootdir): 69 | commands = [ 70 | 'cd %s' % rootdir, 71 | 'cd ..', 72 | 'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData', 73 | 'python3 create_dict.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict', 74 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict', 75 | ] 76 | 77 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s: 78 | for command in commands: 79 | s.run_script_inline([command]) 80 | 81 | 82 | @pytest.mark.slow 83 | def test_train2(bash, rootdir): 84 | commands = [ 85 | 'cd %s' % rootdir, 86 | 'cd ..', 87 | 'python3 train.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict --maca_config $MACA_CONFIG -e 2 --reproducible --hash test', 88 | 'h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5.new', 89 | 'h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final.new', 90 | ] 91 | 92 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 93 | for command in commands: 94 | s.run_script_inline([command]) 95 | 96 | 97 | @pytest.mark.slow 98 | def test_train_lemmatization(bash, rootdir): 99 | commands = [ 100 | 'cd %s' % rootdir, 101 | 'cd ..', 102 | 'python3 train_lemmatization.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess --reproducible --hash test', 103 | 'diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl', 104 | ] 105 | 106 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 107 | for command in commands: 108 | s.run_script_inline([command]) 109 | 110 | 111 | def 
test_join_dicts(bash, rootdir): 112 | commands = [ 113 | 'cd %s' % rootdir, 114 | 'cd ..', 115 | 'python3 join_dicts.py /tmp/joined_dicts.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict --reproducible', 116 | 'diff /tmp/joined_dicts.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict', 117 | ] 118 | 119 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 120 | for command in commands: 121 | s.run_script_inline([command]) 122 | 123 | 124 | def test_split_data(bash, rootdir): 125 | commands = [ 126 | 'cd %s' % rootdir, 127 | 'cd ..', 128 | 'python3 split_data.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 0.2', 129 | 'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1', 130 | 'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2', 131 | ] 132 | 133 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 134 | for command in commands: 135 | s.run_script_inline([command]) 136 | 137 | 138 | def test_join_data(bash, rootdir): 139 | commands = [ 140 | 'cd %s' % rootdir, 141 | 'cd ..', 142 | 'python3 join_data.py /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.joined tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2', 143 | 'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.joined tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle', 144 | ] 145 | 146 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 147 | for command in commands: 148 | s.run_script_inline([command]) 149 | 150 | 151 | @pytest.mark.slow 152 | def test_train(bash, rootdir): 153 | commands = [ 154 | 'cd %s' % rootdir, 155 | 'cd ..', 156 | 'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues', 157 | 'python3 krnnt_train.py --maca_config $MACA_CONFIG /tmp/nkjp-reanalyzed.shuf.spickle -e 2 --reproducible --hash test', 158 | 159 | 'h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5', 160 | 'h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final', 161 | 'diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl', 162 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2', 163 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData', 164 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues', 165 | ] 166 | 167 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 168 | for command in commands: 169 | s.run_script_inline([command]) 170 | 171 | 172 | def test_run_xces(bash, rootdir): 173 | commands = [ 174 | 'cd %s' % rootdir, 175 | 'cd ..', 176 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final 
tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces < tests/data/reference/in_raw.txt', 177 | 'diff /tmp/out.xces tests/data/reference/out.xces' 178 | ] 179 | 180 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s: 181 | for command in commands: 182 | s.run_script_inline([command]) 183 | 184 | 185 | def test_run_xces_from_training(bash, rootdir): 186 | commands = [ 187 | 'cd %s' % rootdir, 188 | 'cd ..', 189 | 'python3 krnnt_run.py weight_test.hdf5.final lemmatisation_test.pkl /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces < tests/data/reference/in_raw.txt', 190 | 'diff /tmp/out.xces tests/data/reference/out.xces' 191 | ] 192 | 193 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s: 194 | for command in commands: 195 | s.run_script_inline([command]) 196 | 197 | 198 | def test_run_plain(bash, rootdir): 199 | commands = [ 200 | 'cd %s' % rootdir, 201 | 'cd ..', 202 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o plain > /tmp/out.plain < tests/data/reference/in_raw.txt', 203 | 'diff /tmp/out.plain tests/data/reference/out.plain' 204 | ] 205 | 206 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s: 207 | for command in commands: 208 | s.run_script_inline([command]) 209 | 210 | 211 | def test_run_conll(bash, rootdir): 212 | commands = [ 213 | 'cd %s' % rootdir, 214 | 'cd ..', 215 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conll > /tmp/out.conll < tests/data/reference/in_raw.txt', 216 | 'diff /tmp/out.conll tests/data/reference/out.conll' 217 | ] 218 | 219 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s: 220 | for command in commands: 221 | s.run_script_inline([command]) 222 | 223 | 224 | def test_run_conllu(bash, rootdir): 225 | commands = [ 226 | 'cd %s' % rootdir, 227 | 'cd ..', 228 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conllu > /tmp/out.conllu < tests/data/reference/in_raw.txt', 229 | 'diff /tmp/out.conllu tests/data/reference/out.conllu' 230 | ] 231 | 232 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s: 233 | for command in commands: 234 | s.run_script_inline([command]) 235 | 236 | 237 | def test_run_jsonl(bash, rootdir): 238 | commands = [ 239 | 'cd %s' % rootdir, 240 | 'cd ..', 241 | 242 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o jsonl > /tmp/out.jsonl < tests/data/reference/in_raw.txt', 243 | 'diff /tmp/out.jsonl tests/data/reference/out.jsonl' 244 | ] 245 | 246 | with 
bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s: 247 | for command in commands: 248 | s.run_script_inline([command]) 249 | 250 | 251 | @pytest.mark.xfail(reason="non-deterministic lemmatisation?") 252 | def test_run_evaluation(bash, rootdir): 253 | commands = [ 254 | 'cd %s' % rootdir, 255 | 'cd ..', 256 | 'cat tests/data/small/gold-task-c.txt | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces', 257 | 'python2 tagger-eval.py tests/data/small/gold-task-c.xml /tmp/out.xces > /tmp/out_evaluation.txt', 258 | 'diff /tmp/out_evaluation.txt tests/data/reference/gold-task-c_evaluation.txt ' 259 | ] 260 | 261 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 262 | for command in commands: 263 | s.run_script_inline([command]) 264 | 265 | 266 | @pytest.mark.xfail(reason="non-deterministic lemmatisation?") 267 | def test_run_evaluation_from_training(bash, rootdir): 268 | commands = [ 269 | 'cd %s' % rootdir, 270 | 'cd ..', 271 | 'cat tests/data/small/gold-task-c.txt | python3 krnnt_run.py weight_test.hdf5.final lemmatisation_test.pkl /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces', 272 | 'python2 tagger-eval.py tests/data/small/gold-task-c.xml /tmp/out.xces > /tmp/out_evaluation.txt', 273 | 'diff /tmp/out_evaluation.txt tests/data/reference/gold-task-c_evaluation.txt ' 274 | ] 275 | 276 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s: 277 | for command in commands: 278 | s.run_script_inline([command]) 279 | -------------------------------------------------------------------------------- /tests/test_system_server.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def test_download_model(bash, rootdir): 5 | commands = [ 6 | 'cd %s' % rootdir, 7 | './download_model.sh' 8 | ] 9 | 10 | with bash() as s: 11 | for command in commands: 12 | s.run_script_inline([command]) 13 | 14 | #TODO: run server: python3 krnnt_serve.py model_data/ --maca_config morfeusz2-nkjp 15 | 16 | def test_post_raw(bash,rootdir): 17 | commands = [ 18 | 'cd %s' % rootdir, 19 | 'cd ..', 20 | 'curl -X POST "http://localhost:9003" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt', 21 | 'diff /tmp/out.txt tests/data/server/out_raw.plain' 22 | ] 23 | 24 | with bash() as s: 25 | for command in commands: 26 | s.run_script_inline([command]) 27 | 28 | def test_post_raw_jsonl(bash,rootdir): 29 | commands = [ 30 | 'cd %s' % rootdir, 31 | 'cd ..', 32 | 'curl -X POST "http://localhost:9003/?output_format=jsonl&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt', 33 | 'diff /tmp/out.txt tests/data/server/out_raw.jsonl' 34 | ] 35 | 36 | with bash() as s: 37 | for command in commands: 38 | s.run_script_inline([command]) 39 | 40 | def test_post_raw_conll(bash,rootdir): 41 | commands = [ 42 | 'cd %s' % rootdir, 43 | 'cd ..', 44 | 'curl -X POST "http://localhost:9003/?output_format=conll&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt', 45 | 'diff /tmp/out.txt tests/data/server/out_raw.conll' 46 | ] 47 | 48 | with bash() as s: 49 | for command in 
commands: 50 | s.run_script_inline([command]) 51 | 52 | def test_post_raw_conllu(bash,rootdir): 53 | commands = [ 54 | 'cd %s' % rootdir, 55 | 'cd ..', 56 | 'curl -X POST "http://localhost:9003/?output_format=conllu&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt', 57 | 'diff /tmp/out.txt tests/data/server/out_raw.conllu' 58 | ] 59 | 60 | with bash() as s: 61 | for command in commands: 62 | s.run_script_inline([command]) 63 | 64 | def test_post_raw_xces(bash,rootdir): 65 | commands = [ 66 | 'cd %s' % rootdir, 67 | 'cd ..', 68 | 'curl -X POST "http://localhost:9003/?output_format=xces&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt', 69 | 'diff /tmp/out.txt tests/data/server/out_raw.xces' 70 | ] 71 | 72 | with bash() as s: 73 | for command in commands: 74 | s.run_script_inline([command]) 75 | 76 | def test_post_form(bash, rootdir): 77 | commands = [ 78 | 'cd %s' % rootdir, 79 | 'cd ..', 80 | 'curl -X POST "http://localhost:9003" --data-binary "text=Lubię placki. Ala ma kota.\n\nRaz dwa trzy." > /tmp/out.txt' 81 | ] 82 | 83 | with bash() as s: 84 | for command in commands: 85 | s.run_script_inline([command]) 86 | 87 | generated = open('/tmp/out.txt').read() 88 | reference = open(os.path.join(rootdir,'data/server/out_raw.plain')).read() 89 | 90 | assert reference in generated 91 | 92 | def test_post_tokenized_json(bash, rootdir): 93 | commands = [ 94 | 'cd %s' % rootdir, 95 | 'cd ..', 96 | 'curl -X POST -H "Content-Type: application/json" "http://localhost:9003" -d @tests/data/server/in_tokenized.json > /tmp/out.txt', 97 | 'diff -B /tmp/out.txt tests/data/server/out_raw.plain' 98 | ] 99 | 100 | with bash() as s: 101 | for command in commands: 102 | s.run_script_inline([command]) 103 | 104 | def test_post_tokenized_compact_json(bash, rootdir): 105 | commands = [ 106 | 'cd %s' % rootdir, 107 | 'cd ..', 108 | 'curl -X POST -H "Content-Type: application/json" "http://localhost:9003" -d @tests/data/server/in_tokenized_compact.json > /tmp/out.txt', 109 | 'diff -B /tmp/out.txt tests/data/server/out_raw.plain' 110 | ] 111 | 112 | with bash() as s: 113 | for command in commands: 114 | s.run_script_inline([command]) 115 | 116 | def test_post_raw_poleval(bash, rootdir): 117 | commands = [ 118 | 'cd %s' % rootdir, 119 | 'cd ..', 120 | 'curl -X POST "http://localhost:9003" --data-binary @tests/data/full/test-raw.txt > /tmp/out.txt' 121 | ] 122 | 123 | with bash() as s: 124 | for command in commands: 125 | s.run_script_inline([command]) -------------------------------------------------------------------------------- /tests/test_tagset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from krnnt.analyzers import MacaAnalyzer 3 | from krnnt.new import get_morfeusz, analyze_token 4 | 5 | MACA_CONFIG1='morfeusz-nkjp-official' 6 | MACA_CONFIG2='morfeusz2-nkjp' 7 | 8 | @pytest.fixture 9 | def maca(): 10 | try: 11 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1) 12 | list(maca_analyzer._maca("test")) 13 | except: 14 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2) 15 | list(maca_analyzer._maca("test")) 16 | 17 | return maca_analyzer 18 | 19 | test_data = [ 20 | ('IV', '', 'num:::'), 21 | ('IV', '', 'romandig'), 22 | ('1', '', 'dig'), 23 | ('prostu', 'adjp', 'adjp:gen'), 24 | (':)', '', 'emo'), 25 | ('godzien', 'adjc', ''), 26 | ('oślep', 'burk', 'frag'), 27 | ('obojga', 'numcol:pl:gen:m1:rec', ''), 28 | ('dwoje', 'numcol:pl:acc:m1:rec', ''), 29 | ('czworo', 'numcol:pl:nom:m1:rec', ''), 30 | ('hej', 
'interj', ''), 31 | ('jeszcze', 'qub', 'part'), 32 | ('czterem', 'num:pl:dat:m1:congr', ''), 33 | ('czym', 'conj', 'comp'), 34 | ('niedaleko', 'prep:gen', ''), 35 | ('doprawdy', 'qub', 'adv'), 36 | ('jak', 'qub', 'adv'), 37 | ('pół', '', 'numcomp'), 38 | ('pół', '', 'num:comp'), 39 | ('pół', 'num:pl:acc:n:rec', ''), 40 | ('słowa', 'subst:pl:acc:n', 'subst:sg:gen:n:ncol'), 41 | ('rozklepywało', '', 'praet:sg:n1:ter:imperf'), 42 | ('bardzo', 'adv:pos', 'adv'), 43 | ('bardziej', 'adv:com', ''), 44 | ('znacząco', 'adv:pos', 'pacta'), 45 | ('my', '', 'ppron12:pl:nom:_:pri'), 46 | ('sobie', 'siebie:dat', ''), 47 | ('zł', 'brev:npun', 'brev'), 48 | ] 49 | 50 | @pytest.mark.parametrize('form, exist, not_exist', test_data) 51 | @pytest.mark.xfail 52 | def test_maca(maca, form, exist, not_exist): 53 | paragraph=maca.analyze(form) 54 | sentence=paragraph.sentences[0] 55 | token=sentence.tokens[0] 56 | tags = [form.tags for form in token.interpretations] 57 | print(tags) 58 | if exist: 59 | assert exist in tags 60 | if not_exist: 61 | assert not_exist not in tags 62 | 63 | @pytest.mark.parametrize('form, exist, not_exist', test_data) 64 | @pytest.mark.xfail 65 | def test_morfeusz(maca, form, exist, not_exist): 66 | morfeusz = get_morfeusz() 67 | tags=[tag for form, tag in analyze_token(morfeusz, form)] 68 | print(tags) 69 | if exist: 70 | assert exist in tags 71 | if not_exist: 72 | assert not_exist not in tags -------------------------------------------------------------------------------- /tests/test_writers.py: -------------------------------------------------------------------------------- 1 | from krnnt.writers import results_to_conll_str, results_to_conllu_str, results_to_txt_str, results_to_plain_str, \ 2 | results_to_xces_str 3 | 4 | results = [[[{'token': 'Lubię', 'sep': 'newline', 'prob': 0.37375012, 'tag': 'adj:pl:nom:m1:pos', 'lemmas': ['Lubię'], 5 | 'start': 0, 'end': 5}, 6 | {'token': 'placki', 'sep': 'space', 'prob': 0.38550463, 'tag': 'subst:pl:nom:m1', 'lemmas': ['placki'], 7 | 'start': 6, 'end': 12}, 8 | {'token': '.', 'sep': 'none', 'prob': 0.99999726, 'tag': 'interp', 'lemmas': ['.'], 'start': 12, 9 | 'end': 13}], [ 10 | {'token': 'Ala', 'sep': 'space', 'prob': 0.9995969, 'tag': 'subst:sg:nom:f', 'lemmas': ['Ala'], 11 | 'start': 14, 'end': 17}, 12 | {'token': 'ma', 'sep': 'space', 'prob': 0.6605565, 'tag': 'subst:sg:nom:f', 'lemmas': ['ma'], 13 | 'start': 18, 'end': 20}, 14 | {'token': 'kota', 'sep': 'space', 'prob': 0.93132496, 'tag': 'subst:sg:nom:f', 'lemmas': ['kota'], 15 | 'start': 21, 'end': 25}, 16 | {'token': '.', 'sep': 'none', 'prob': 0.9999993, 'tag': 'interp', 'lemmas': ['.'], 'start': 25, 17 | 'end': 26}]], [[ 18 | {'token': 'Raz', 'sep': 'space', 'prob': 0.23650545, 'tag': 'subst:sg:nom:f', 'lemmas': ['Raz'], 19 | 'start': 27, 'end': 30}, 20 | {'token': 'dwa', 'sep': 'space', 'prob': 0.581044, 'tag': 'adj:pl:acc:f:pos', 'lemmas': ['dwa'], 21 | 'start': 31, 'end': 34}, 22 | {'token': 'trzy', 'sep': 'space', 'prob': 0.71970826, 'tag': 'subst:pl:acc:f', 'lemmas': ['trzy'], 23 | 'start': 35, 'end': 39}, 24 | {'token': '.', 'sep': 'none', 'prob': 0.99999905, 'tag': 'interp', 'lemmas': ['.'], 'start': 39, 25 | 'end': 40}]]] 26 | 27 | 28 | def test_conll(): 29 | reference=\ 30 | """Lubię Lubię 1 adj:pl:nom:m1:pos 0 5 31 | placki placki 1 subst:pl:nom:m1 6 12 32 | . . 0 interp 12 13 33 | 34 | Ala Ala 1 subst:sg:nom:f 14 17 35 | ma ma 1 subst:sg:nom:f 18 20 36 | kota kota 1 subst:sg:nom:f 21 25 37 | . . 
0 interp 25 26 38 | 39 | 40 | Raz Raz 1 subst:sg:nom:f 27 30 41 | dwa dwa 1 adj:pl:acc:f:pos 31 34 42 | trzy trzy 1 subst:pl:acc:f 35 39 43 | . . 0 interp 39 40 44 | 45 | """ 46 | output = results_to_conll_str(results) 47 | assert output == reference 48 | 49 | def test_conllu(): 50 | reference=\ 51 | """1 Lubię Lubię _ adj:pl:nom:m1:pos _ _ _ _ _ 52 | 2 placki placki _ subst:pl:nom:m1 _ _ _ _ _ 53 | 3 . . _ interp _ _ _ _ _ 54 | 55 | 1 Ala Ala _ subst:sg:nom:f _ _ _ _ _ 56 | 2 ma ma _ subst:sg:nom:f _ _ _ _ _ 57 | 3 kota kota _ subst:sg:nom:f _ _ _ _ _ 58 | 4 . . _ interp _ _ _ _ _ 59 | 60 | 61 | 1 Raz Raz _ subst:sg:nom:f _ _ _ _ _ 62 | 2 dwa dwa _ adj:pl:acc:f:pos _ _ _ _ _ 63 | 3 trzy trzy _ subst:pl:acc:f _ _ _ _ _ 64 | 4 . . _ interp _ _ _ _ _ 65 | 66 | """ 67 | output = results_to_conllu_str(results) 68 | assert output == reference 69 | 70 | def test_txt(): 71 | reference=\ 72 | """Lubię placki. 73 | Ala ma kota. 74 | 75 | Raz dwa trzy. 76 | 77 | """ 78 | output = results_to_txt_str(results) 79 | 80 | assert output == reference 81 | 82 | def test_plain(): 83 | reference=\ 84 | """Lubię newline 85 | Lubię adj:pl:nom:m1:pos disamb 86 | placki space 87 | placki subst:pl:nom:m1 disamb 88 | . none 89 | . interp disamb 90 | 91 | Ala space 92 | Ala subst:sg:nom:f disamb 93 | ma space 94 | ma subst:sg:nom:f disamb 95 | kota space 96 | kota subst:sg:nom:f disamb 97 | . none 98 | . interp disamb 99 | 100 | 101 | Raz space 102 | Raz subst:sg:nom:f disamb 103 | dwa space 104 | dwa adj:pl:acc:f:pos disamb 105 | trzy space 106 | trzy subst:pl:acc:f disamb 107 | . none 108 | . interp disamb 109 | 110 | """ 111 | output = results_to_plain_str(results) 112 | assert output == reference 113 | 114 | def test_xces(): 115 | reference=\ 116 | """ 117 | 118 | 119 | 120 | 121 | 122 | 123 | Lubię 124 | Lubięadj:pl:nom:m1:pos 125 | 126 | 127 | placki 128 | plackisubst:pl:nom:m1 129 | 130 | 131 | 132 | . 133 | .interp 134 | 135 | 136 | 137 | 138 | Ala 139 | Alasubst:sg:nom:f 140 | 141 | 142 | ma 143 | masubst:sg:nom:f 144 | 145 | 146 | kota 147 | kotasubst:sg:nom:f 148 | 149 | 150 | 151 | . 152 | .interp 153 | 154 | 155 | 156 | 157 | 158 | 159 | Raz 160 | Razsubst:sg:nom:f 161 | 162 | 163 | dwa 164 | dwaadj:pl:acc:f:pos 165 | 166 | 167 | trzy 168 | trzysubst:pl:acc:f 169 | 170 | 171 | 172 | . 
173 | .interp 174 | 175 | 176 | 177 | 178 | """ 179 | 180 | output = results_to_xces_str(results) 181 | assert output == reference -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | from argparse import ArgumentParser 5 | 6 | from keras.models import load_model 7 | 8 | from krnnt.keras_models import BEST, ExperimentParameters 9 | from krnnt.new import UnalignedSimpleEvaluator 10 | from krnnt.tagger_exps import RunFolds2, KerasData, RunExperiment, KerasData2, RunExperiment2 11 | 12 | logging.basicConfig(level=logging.DEBUG) 13 | 14 | if __name__ == '__main__': 15 | parser = ArgumentParser() 16 | parser.add_argument('data_path', help='path to preprocessed data') 17 | parser.add_argument('features_dict', help='path to features dict') 18 | 19 | parser.add_argument('-p', '--preanalyzed', action='store_false', 20 | default=True, dest='reanalyzed', 21 | help='training data have not been reanalyzed') 22 | parser.add_argument('-c', '--cv', action='store_true', 23 | default=False, dest='cv', 24 | help='run 10-fold cross-validation') 25 | parser.add_argument('-t', '--train_ratio', 26 | default=1.0, dest='train_ratio', type=float, 27 | help='percentage of data for training') 28 | parser.add_argument('-d', '--dev_ratio', 29 | default=0.0, dest='dev_ratio', type=float, 30 | help='percentage of training data for development') 31 | parser.add_argument('--dev_data', default='0.0', help='dev data ratio or path to dev data') 32 | parser.add_argument('--test_data', default='0.0', help='test data ratio or path to test data') 33 | parser.add_argument('--load_model', default=None, help='path to pretrained model') 34 | parser.add_argument('-e', '--epochs', 35 | default=100, dest='epochs', type=int, 36 | help='number of epochs') 37 | parser.add_argument('--patience', 38 | default=10, dest='patience', type=int, 39 | help='patience') 40 | parser.add_argument('--maca_config', 41 | default='morfeusz2-nkjp', 42 | help='Maca config') 43 | parser.add_argument('--tensor_board', 44 | action='store_true', 45 | help='save data for TensorBoard') 46 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO 47 | parser.add_argument('--hash', action='store', default=None, dest='hash') 48 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds') 49 | parser.add_argument('-f', '--fold', default=None, dest='fold') 50 | args = parser.parse_args() 51 | 52 | if args.reproducible: 53 | from numpy.random import seed 54 | seed(1337) 55 | import random as rn 56 | rn.seed(1337) 57 | import tensorflow as tf 58 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, 59 | inter_op_parallelism_threads=1) 60 | from keras import backend as K 61 | tf.set_random_seed(1337) 62 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) 63 | K.set_session(sess) 64 | 65 | pref = {'nb_epoch': 100, 'batch_size': 256, 66 | 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label', 67 | 'evaluator': UnalignedSimpleEvaluator, 'patience': 10, 68 | 'weight_path': 'weights.hdf5', 'samples_per_epoch': 10000, 'keras_model_class': BEST, 69 | 'corpus_path': 'data/train-reanalyzed.spickle', 'reanalyze': True, 'train_data_ratio': 0.9, 70 | 'dev_data_ratio': 0.1} 71 | 72 | pref['reanalyze'] = args.reanalyzed 73 | pref['train_data_ratio'] = float(args.train_ratio) 74 | 
pref['dev_data_ratio'] = float(args.dev_ratio) 75 | 76 | pref['tensor_board']= args.tensor_board 77 | pref['nb_epoch'] = args.epochs 78 | 79 | pref['dev_data'] = args.dev_data 80 | if pref['dev_data']=='0.0': 81 | pref['patience'] = pref['nb_epoch'] 82 | pref['test_data'] = args.test_data 83 | pref['load_model'] = args.load_model 84 | 85 | 86 | # pref['corpus_path'] = args.corpus_path 87 | pref['patience'] = args.patience 88 | pref['maca_config'] = args.maca_config 89 | if args.hash is not None: 90 | pref['h'] = args.hash 91 | if args.fold is not None: 92 | pref['fold'] = int(args.fold) 93 | 94 | keras_model_class = pref['keras_model_class'] 95 | 96 | if args.cv: 97 | logging.error('CV is not supported') 98 | # rf = RunFolds2(keras_model_class, pref) 99 | # rf.run() 100 | else: 101 | parameters = ExperimentParameters(pref) 102 | 103 | km = keras_model_class(parameters) 104 | 105 | 106 | 107 | 108 | print('Model will be saved under: %s.final' % parameters.pref['weight_path']) 109 | 110 | kd = KerasData2(args.data_path, args.features_dict, parameters) 111 | re = RunExperiment2(kd, km) 112 | re.run() 113 | 114 | print('Model is saved under: %s' % parameters.pref['weight_path']) 115 | 116 | -------------------------------------------------------------------------------- /train_lemmatization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from argparse import ArgumentParser 5 | 6 | from krnnt.keras_models import ExperimentParameters 7 | from krnnt.tagger_exps import KerasData2, RunLemma 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser(description='Train lemmatization') 11 | parser.add_argument('data_path', help='path to preprocessed data') 12 | 13 | 14 | parser.add_argument('-t', '--train_ratio', 15 | default=1.0, dest='train_ratio', type=float, 16 | help='percentage of data for training') 17 | parser.add_argument('-d', '--dev_ratio', 18 | default=0.0, dest='dev_ratio', type=float, 19 | help='percentage of training data for development') 20 | parser.add_argument('--dev_data', default='0.1', help='dev data ratio or path to dev data') 21 | parser.add_argument('--test_data', default='0.1', help='test data ratio or path to test data') 22 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO 23 | parser.add_argument('--hash', action='store', default=None, dest='hash') 24 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds') 25 | 26 | args = parser.parse_args() 27 | 28 | if args.reproducible: 29 | from numpy.random import seed 30 | seed(1337) 31 | import random as rn 32 | rn.seed(1337) 33 | import tensorflow as tf 34 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, 35 | inter_op_parallelism_threads=1) 36 | from keras import backend as K 37 | tf.set_random_seed(1337) 38 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf) 39 | K.set_session(sess) 40 | 41 | pref = { 42 | 'train_data_ratio': float(args.train_ratio), 43 | 'dev_data_ratio': float(args.dev_ratio), 44 | 'dev_data': args.dev_data, 45 | 'test_data': args.test_data 46 | } 47 | 48 | if args.hash is not None: 49 | pref['h'] = args.hash 50 | 51 | 52 | parameters = ExperimentParameters(pref) 53 | 54 | kd = KerasData2(args.data_path, None, parameters) 55 | re = RunLemma(kd) 56 | re.learn_lemma() 57 | 58 | print('Lemmatisation model is saved under: %s' % parameters.pref['lemmatisation_path']) 59 | 60 | #TODO CV, usunac zaleznosc od 
TF, KerasData2 bez słownika -------------------------------------------------------------------------------- /voting.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import sys 4 | 5 | from krnnt.writers import results_to_xces, results_to_xces_str 6 | from krnnt.readers import read_xces 7 | 8 | # path='/home/djstrong/projects/repos/krnnt/models/voting/' 9 | # path='/home/djstrong/projects/repos/krnnt/' 10 | # files=[path+'text-raw.'+str(i)+'.xml' for i in range(4)] 11 | # files=[path+str(i)+'b.xml' for i in range(10)] 12 | 13 | path=sys.argv[1] 14 | files=[path+str(i)+'.xml' for i in range(10)] 15 | 16 | def checkEqual2(iterator): 17 | return len(set(iterator)) == 1 18 | 19 | xcess = [read_xces(file) for file in files] 20 | 21 | result = [] 22 | 23 | count_all=0 24 | count_mismatch=0 25 | 26 | while True: 27 | try: 28 | paragraphs = [next(xces) for xces in xcess] 29 | 30 | for sentences in zip(*paragraphs): 31 | sentence = [] 32 | result.append(sentence) 33 | for tokens in zip(*sentences): 34 | count_all+=1 35 | # print(tokens) 36 | forms = [token.gold_form for token in tokens] 37 | tags = [form.tags for form in forms] 38 | 39 | token_result = {'sep': 'space' if tokens[0].space_before else 'none','token':tokens[0].form} 40 | sentence.append(token_result) 41 | if not checkEqual2(tags): 42 | # print(tags) 43 | tags_count=collections.defaultdict(list) 44 | for form in forms: 45 | tags_count[form.tags].append(form) 46 | # print(tags_count) 47 | 48 | sorted_forms = sorted(tags_count.items(), key=lambda x: len(x[1]), reverse=True) 49 | # print(tokens[0].form, '\t'*(3-int(len(tokens[0].form)/8)), [(form[0], len(form[1])) for form in sorted_forms]) 50 | winner = sorted_forms[0][1][0] 51 | 52 | 53 | 54 | token_result['tag']=winner.tags 55 | token_result['lemmas']=[winner.lemma] 56 | count_mismatch+=1 57 | else: 58 | # print(tokens[0].form, '\t'*(3-int(len(tokens[0].form)/8)), forms[0].tags) 59 | token_result['tag']=forms[0].tags 60 | token_result['lemmas']=[forms[0].lemma] 61 | 62 | # print() 63 | # print() 64 | 65 | 66 | 67 | except StopIteration: 68 | break 69 | 70 | 71 | print(results_to_xces_str(result)) 72 | 73 | print(count_all, count_mismatch, file=sys.stderr) --------------------------------------------------------------------------------
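A minimal client-side sketch of the HTTP endpoints exercised by the curl commands in tests/test_system_server.py. It assumes a tagger server (python3 krnnt_serve.py model_data/ --maca_config morfeusz2-nkjp, as noted in the TODO comment in that file) is already listening on localhost:9003, that the requests package is available, and that the snippet is run from the repository root so the fixture paths resolve; none of this is part of the test suite itself.

import requests

BASE_URL = 'http://localhost:9003'  # same endpoint the curl-based tests hit

# Raw text in, default plain output out (mirrors test_post_raw).
with open('tests/data/server/in_raw.txt', 'rb') as f:
    plain = requests.post(BASE_URL, data=f.read()).text

# One document per line in, CoNLL out (mirrors test_post_raw_conll).
with open('tests/data/server/in_raw.txt', 'rb') as f:
    conll = requests.post(BASE_URL,
                          params={'output_format': 'conll', 'input_format': 'lines'},
                          data=f.read()).text

# Pre-tokenized JSON input (mirrors test_post_tokenized_json).
with open('tests/data/server/in_tokenized.json', 'rb') as f:
    tokenized = requests.post(BASE_URL,
                              headers={'Content-Type': 'application/json'},
                              data=f.read()).text

print(plain[:200])
print(conll[:200])
print(tokenized[:200])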
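The writer functions imported at the top of tests/test_writers.py can also be driven with a hand-built results structure of the same shape as the one defined in that file (paragraphs, then sentences, then token dicts with token, sep, prob, tag, lemmas, start and end keys). The two-token example below is invented for illustration; the expected column layout is the one shown in the reference strings of test_conll and test_plain.

from krnnt.writers import results_to_conll_str, results_to_plain_str

results = [      # paragraphs -> sentences -> token dicts
    [[
        {'token': 'Ala', 'sep': 'newline', 'prob': 0.99, 'tag': 'subst:sg:nom:f',
         'lemmas': ['Ala'], 'start': 0, 'end': 3},
        {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp',
         'lemmas': ['.'], 'start': 3, 'end': 4},
    ]],
]

# CoNLL: token, lemma, space-before flag, tag, character offsets.
print(results_to_conll_str(results))
# Plain: orth plus separator on one line, then lemma, tag and 'disamb' on the next.
print(results_to_plain_str(results))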
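voting.py combines ten XCES outputs by taking, for every token, the tag proposed by the largest number of taggers and the lemma of the first interpretation carrying that tag. A compact restatement of that per-token vote, operating on plain (tag, lemma) pairs instead of the token objects returned by read_xces, might look as follows; the sample interpretations are invented.

from collections import Counter

def vote(interpretations):
    # interpretations: one (tag, lemma) pair per tagger for a single token
    tag_counts = Counter(tag for tag, _ in interpretations)
    winning_tag, _ = tag_counts.most_common(1)[0]
    # like voting.py, keep the lemma of the first interpretation with the winning tag
    winning_lemma = next(lemma for tag, lemma in interpretations if tag == winning_tag)
    return winning_tag, winning_lemma

print(vote([('subst:sg:nom:f', 'Ala'),
            ('subst:sg:nom:f', 'Ala'),
            ('adj:sg:nom:f:pos', 'Ala')]))
# ('subst:sg:nom:f', 'Ala')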