├── LICENSE
├── Readme.md
├── analyze_corpus_tagset_date.py
├── clean.sh
├── create_dict.py
├── create_simple_lemmatization.py
├── export_data.py
├── graph_log.py
├── join_data.py
├── join_dicts.py
├── krnnt
    ├── __init__.py
    ├── additional_format.py
    ├── aglt.py
    ├── aligner.py
    ├── analyzers.py
    ├── blanks.py
    ├── features.py
    ├── keras_models.py
    ├── new.py
    ├── pipeline.py
    ├── readers.py
    ├── serial_pickle.py
    ├── structure.py
    ├── tagger_exps.py
    ├── utils.py
    └── writers.py
├── krnnt_run.py
├── krnnt_serve.py
├── krnnt_train.py
├── merge_analyzed_gold.py
├── preprocess_data.py
├── process_xces.py
├── reanalyze.py
├── requirements.txt
├── run_test.sh
├── setup.py
├── shuffle.py
├── split_data.py
├── start_flask_server.sh
├── start_gunicorn_server.sh
├── tagger-eval.py
├── tests
    ├── benchmark
    │   ├── test_maca.py
    │   ├── test_maca_analyze.py
    │   ├── test_shape.py
    │   └── test_tags.py
    ├── conftest.py
    ├── data
    │   ├── full
    │   │   ├── gold-task-c.xml
    │   │   ├── test-raw.txt
    │   │   └── train-raw.txt
    │   ├── reference
    │   │   ├── gold-task-c_evaluation.txt
    │   │   ├── in_raw.txt
    │   │   ├── lemmatisation_test.pkl
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle.part1
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle.part2
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData
    │   │   ├── nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues
    │   │   ├── nkjp1m-1.2-reanalyzed.spickle
    │   │   ├── nkjp1m-1.2.spickle
    │   │   ├── out.conll
    │   │   ├── out.conllu
    │   │   ├── out.jsonl
    │   │   ├── out.plain
    │   │   ├── out.xces
    │   │   ├── weight_test.hdf5
    │   │   ├── weight_test.hdf5.final
    │   │   └── weight_test.hdf5.new
    │   ├── server
    │   │   ├── in_raw.txt
    │   │   ├── in_tokenized.json
    │   │   ├── in_tokenized_compact.json
    │   │   ├── out_raw.conll
    │   │   ├── out_raw.conllu
    │   │   ├── out_raw.jsonl
    │   │   ├── out_raw.plain
    │   │   └── out_raw.xces
    │   └── small
    │   │   ├── 00130846.ann.xml
    │   │   ├── 00132482.ann.xml
    │   │   ├── gold-task-c.txt
    │   │   ├── gold-task-c.xml
    │   │   ├── nkjp1m-1.2-xces.xml
    │   │   └── train-gold.xml
    ├── download_model.sh
    ├── test.sh
    ├── test_aglt.py
    ├── test_analyzers.py
    ├── test_blank.py
    ├── test_features.py
    ├── test_morfeusz.py
    ├── test_parallel_api_speed.py
    ├── test_process_xces.py
    ├── test_speed.sh
    ├── test_structure.py
    ├── test_system.py
    ├── test_system_server.py
    ├── test_tagset.py
    └── test_writers.py
├── train.py
├── train_lemmatization.py
└── voting.py


/LICENSE:
--------------------------------------------------------------------------------
  1 |                    GNU LESSER GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 | 
  9 |   This version of the GNU Lesser General Public License incorporates
 10 | the terms and conditions of version 3 of the GNU General Public
 11 | License, supplemented by the additional permissions listed below.
 12 | 
 13 |   0. Additional Definitions.
 14 | 
 15 |   As used herein, "this License" refers to version 3 of the GNU Lesser
 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
 17 | General Public License.
 18 | 
 19 |   "The Library" refers to a covered work governed by this License,
 20 | other than an Application or a Combined Work as defined below.
 21 | 
 22 |   An "Application" is any work that makes use of an interface provided
 23 | by the Library, but which is not otherwise based on the Library.
 24 | Defining a subclass of a class defined by the Library is deemed a mode
 25 | of using an interface provided by the Library.
 26 | 
 27 |   A "Combined Work" is a work produced by combining or linking an
 28 | Application with the Library.  The particular version of the Library
 29 | with which the Combined Work was made is also called the "Linked
 30 | Version".
 31 | 
 32 |   The "Minimal Corresponding Source" for a Combined Work means the
 33 | Corresponding Source for the Combined Work, excluding any source code
 34 | for portions of the Combined Work that, considered in isolation, are
 35 | based on the Application, and not on the Linked Version.
 36 | 
 37 |   The "Corresponding Application Code" for a Combined Work means the
 38 | object code and/or source code for the Application, including any data
 39 | and utility programs needed for reproducing the Combined Work from the
 40 | Application, but excluding the System Libraries of the Combined Work.
 41 | 
 42 |   1. Exception to Section 3 of the GNU GPL.
 43 | 
 44 |   You may convey a covered work under sections 3 and 4 of this License
 45 | without being bound by section 3 of the GNU GPL.
 46 | 
 47 |   2. Conveying Modified Versions.
 48 | 
 49 |   If you modify a copy of the Library, and, in your modifications, a
 50 | facility refers to a function or data to be supplied by an Application
 51 | that uses the facility (other than as an argument passed when the
 52 | facility is invoked), then you may convey a copy of the modified
 53 | version:
 54 | 
 55 |    a) under this License, provided that you make a good faith effort to
 56 |    ensure that, in the event an Application does not supply the
 57 |    function or data, the facility still operates, and performs
 58 |    whatever part of its purpose remains meaningful, or
 59 | 
 60 |    b) under the GNU GPL, with none of the additional permissions of
 61 |    this License applicable to that copy.
 62 | 
 63 |   3. Object Code Incorporating Material from Library Header Files.
 64 | 
 65 |   The object code form of an Application may incorporate material from
 66 | a header file that is part of the Library.  You may convey such object
 67 | code under terms of your choice, provided that, if the incorporated
 68 | material is not limited to numerical parameters, data structure
 69 | layouts and accessors, or small macros, inline functions and templates
 70 | (ten or fewer lines in length), you do both of the following:
 71 | 
 72 |    a) Give prominent notice with each copy of the object code that the
 73 |    Library is used in it and that the Library and its use are
 74 |    covered by this License.
 75 | 
 76 |    b) Accompany the object code with a copy of the GNU GPL and this license
 77 |    document.
 78 | 
 79 |   4. Combined Works.
 80 | 
 81 |   You may convey a Combined Work under terms of your choice that,
 82 | taken together, effectively do not restrict modification of the
 83 | portions of the Library contained in the Combined Work and reverse
 84 | engineering for debugging such modifications, if you also do each of
 85 | the following:
 86 | 
 87 |    a) Give prominent notice with each copy of the Combined Work that
 88 |    the Library is used in it and that the Library and its use are
 89 |    covered by this License.
 90 | 
 91 |    b) Accompany the Combined Work with a copy of the GNU GPL and this license
 92 |    document.
 93 | 
 94 |    c) For a Combined Work that displays copyright notices during
 95 |    execution, include the copyright notice for the Library among
 96 |    these notices, as well as a reference directing the user to the
 97 |    copies of the GNU GPL and this license document.
 98 | 
 99 |    d) Do one of the following:
100 | 
101 |        0) Convey the Minimal Corresponding Source under the terms of this
102 |        License, and the Corresponding Application Code in a form
103 |        suitable for, and under terms that permit, the user to
104 |        recombine or relink the Application with a modified version of
105 |        the Linked Version to produce a modified Combined Work, in the
106 |        manner specified by section 6 of the GNU GPL for conveying
107 |        Corresponding Source.
108 | 
109 |        1) Use a suitable shared library mechanism for linking with the
110 |        Library.  A suitable mechanism is one that (a) uses at run time
111 |        a copy of the Library already present on the user's computer
112 |        system, and (b) will operate properly with a modified version
113 |        of the Library that is interface-compatible with the Linked
114 |        Version.
115 | 
116 |    e) Provide Installation Information, but only if you would otherwise
117 |    be required to provide such information under section 6 of the
118 |    GNU GPL, and only to the extent that such information is
119 |    necessary to install and execute a modified version of the
120 |    Combined Work produced by recombining or relinking the
121 |    Application with a modified version of the Linked Version. (If
122 |    you use option 4d0, the Installation Information must accompany
123 |    the Minimal Corresponding Source and Corresponding Application
124 |    Code. If you use option 4d1, you must provide the Installation
125 |    Information in the manner specified by section 6 of the GNU GPL
126 |    for conveying Corresponding Source.)
127 | 
128 |   5. Combined Libraries.
129 | 
130 |   You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 | 
136 |    a) Accompany the combined library with a copy of the same work based
137 |    on the Library, uncombined with any other library facilities,
138 |    conveyed under the terms of this License.
139 | 
140 |    b) Give prominent notice with the combined library that part of it
141 |    is a work based on the Library, and explaining where to find the
142 |    accompanying uncombined form of the same work.
143 | 
144 |   6. Revised Versions of the GNU Lesser General Public License.
145 | 
146 |   The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 | 
151 |   Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 | 
161 |   If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 


--------------------------------------------------------------------------------
/analyze_corpus_tagset_date.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | import collections
  4 | import glob
  5 | import sys
  6 | 
  7 | from argparse import ArgumentParser
  8 | 
  9 | from krnnt.readers import read_xces
 10 | 
 11 | usage = """%prog CORPUS
 12 | 
 13 | Analyze corpus for changes in dictionary.
 14 | """
 15 | 
 16 | if __name__ == '__main__':
 17 |     parser = ArgumentParser(usage=usage)
 18 |     parser.add_argument('corpus_path', type=str, help='path to XCES corpus (or path with wildcard)')
 19 |     args = parser.parse_args()
 20 | 
 21 |     # read corpus
 22 |     stats_forms = collections.defaultdict(int)
 23 |     stats_tags = collections.defaultdict(int)
 24 | 
 25 |     count_sentences=0
 26 |     count_igns=0
 27 |     count_blanks=0
 28 |     count_wo_disamb=0
 29 |     count_problems=0
 30 |     for path in glob.iglob(args.corpus_path):
 31 |         print(path, file=sys.stderr)
 32 |         for paragraph in read_xces(path):
 33 | 
 34 |             for sentence in paragraph:
 35 |                 count_sentences += 1
 36 |                 ign = False
 37 |                 blank = False
 38 |                 wo_disamb = False
 39 |                 for token in sentence:
 40 |                     form = token.form
 41 |                     try:
 42 |                         tag = token.gold_form.tags
 43 |                         stats_forms[(form, tag)] += 1
 44 |                         stats_tags[tag] += 1
 45 |                         if tag=='ign':
 46 |                             ign=True
 47 |                         elif tag=='blank':
 48 |                             blank=True
 49 |                     except:  # no disamb
 50 |                         print("Missing disamb", path, form, file=sys.stderr)
 51 |                         wo_disamb=True
 52 |                         pass
 53 |                 if ign: count_igns+=1
 54 |                 if blank: count_blanks+=1
 55 |                 if wo_disamb: count_wo_disamb+=1
 56 |                 if ign or blank or wo_disamb: count_problems+=1
 57 | 
 58 |     # stats
 59 |     print('Sentences: %s' % count_sentences)
 60 |     print('Sentences wo disamb: %s' % count_wo_disamb)
 61 |     print('Sentences with ign: %s' % count_igns)
 62 |     print('Sentences with blank: %s' % count_blanks)
 63 |     print('Sentences with problems: %s' % count_problems)
 64 |     print('Tokens: %s' % sum(stats_forms.values()))
 65 |     print('Unique tokens: %s' % len(set([x[0] for x in stats_forms])))
 66 |     print('Unique token+tag: %s' % len(stats_forms))
 67 |     print('Unique tags: %s' % len(stats_tags))
 68 |     print('Tokens with tag ign: %s' % stats_tags['ign'])
 69 |     print('Tokens with tag blank: %s' % stats_tags['blank'])
 70 |     print()
 71 | 
 72 |     # analyse
 73 |     TAGS = 'tags'
 74 |     FORMS = 'forms'
 75 |     POSITIVE = '+'
 76 |     NEGATIVE = '-'
 77 |     checks = {}
 78 | 
 79 |     checks['20141013 brev -> brev:n?pun'] = {
 80 |         TAGS: {
 81 |             POSITIVE: [lambda tag: tag in ('brev:pun', 'brev:npun')],
 82 |             NEGATIVE: [lambda tag: tag == 'brev']
 83 |         },
 84 |         FORMS: {
 85 |             POSITIVE: [],
 86 |             NEGATIVE: []
 87 |         }
 88 |     }
 89 |     checks['20150127 siebie, ale w NKJP jest'] = {
 90 |         TAGS: {
 91 |             POSITIVE: [],
 92 |             NEGATIVE: [lambda tag: tag.startswith('siebie:')]
 93 |         },
 94 |         FORMS: {
 95 |             POSITIVE: [],
 96 |             NEGATIVE: []
 97 |         }
 98 |     }
 99 | 
100 |     checks['20150617 _'] = {
101 |         TAGS: {
102 |             POSITIVE: [lambda tag: '_' not in tag],
103 |             NEGATIVE: [lambda tag: '_' in tag]
104 |         },
105 |         FORMS: {
106 |             POSITIVE: [],
107 |             NEGATIVE: []
108 |         }
109 |     }
110 | 
111 |     checks['20160126 pacta'] = {
112 |         TAGS: {
113 |             POSITIVE: [lambda tag: tag == 'pacta'],
114 |             NEGATIVE: []
115 |         },
116 |         FORMS: {
117 |             POSITIVE: [],
118 |             NEGATIVE: []
119 |         }
120 |     }
121 | 
122 |     checks['20170301 bardzo:adv:pos, bardziej:adv:com'] = {
123 |         TAGS: {
124 |             POSITIVE: [],
125 |             NEGATIVE: []
126 |         },
127 |         FORMS: {
128 |             POSITIVE: [lambda form, tag: form == 'bardziej' and tag == 'adv:com',
129 |                        lambda form, tag: form == 'bardzo' and tag == 'adv:pos'],
130 |             NEGATIVE: [lambda form, tag: form == 'bardziej' and tag == 'adv',
131 |                        lambda form, tag: form == 'bardzo' and tag == 'adv']
132 |         }
133 |     }
134 |     checks['20170409 n1,n2,p1,p2,p3 -> n'] = {
135 |         TAGS: {
136 |             POSITIVE: [lambda tag: {'n'} & set(tag.split(':'))],
137 |             NEGATIVE: [lambda tag: {'n1', 'n2', 'n3', 'p1', 'p2', 'p3'} & set(tag.split(':'))]
138 |         },
139 |         FORMS: {
140 |             POSITIVE: [],
141 |             NEGATIVE: []
142 |         }
143 |     }
144 |     checks['no col,ncol,pt'] = {
145 |         TAGS: {
146 |             POSITIVE: [lambda tag: not {'col', 'ncol', 'pt'} & set(tag.split(':'))],
147 |             NEGATIVE: [lambda tag: {'col', 'ncol', 'pt'} & set(tag.split(':'))]
148 |         },
149 |         FORMS: {
150 |             POSITIVE: [],
151 |             NEGATIVE: []
152 |         }
153 |     }
154 |     checks['20170430 num:comp -> numcomp'] = {
155 |         TAGS: {
156 |             POSITIVE: [lambda tag: tag == 'numcomp'],
157 |             NEGATIVE: [lambda tag: tag == 'num:comp']
158 |         },
159 |         FORMS: {
160 |             POSITIVE: [],
161 |             NEGATIVE: []
162 |         }
163 |     }
164 | 
165 |     checks['20170625 jak nie adv'] = {
166 |         TAGS: {
167 |             POSITIVE: [],
168 |             NEGATIVE: []
169 |         },
170 |         FORMS: {
171 |             POSITIVE: [],
172 |             NEGATIVE: [lambda form, tag: form == 'jak' and tag == 'adv']
173 |         }
174 |     }
175 | 
176 |     checks['20170702 jak:comp'] = {
177 |         TAGS: {
178 |             POSITIVE: [],
179 |             NEGATIVE: []
180 |         },
181 |         FORMS: {
182 |             POSITIVE: [lambda form, tag: form == 'jak' and tag == 'comp'],
183 |             NEGATIVE: []
184 |         }
185 |     }
186 | 
187 |     checks['20170914 adv na qub'] = {
188 |         TAGS: {
189 |             POSITIVE: [],
190 |             NEGATIVE: []
191 |         },
192 |         FORMS: {
193 |             POSITIVE: [lambda form, tag: form == 'niedaleko' and tag == 'prep:gen',
194 |                        lambda form, tag: form == 'doprawdy' and tag == 'qub'],
195 |             NEGATIVE: [lambda form, tag: form == 'doprawdy' and tag == 'adv']
196 |         }
197 |     }
198 | 
199 |     conj_to_comp = ['czym', 'ergo', 'jakokolwiek', 'jakoż', 'przeto', 'tedy', 'to', 'toteż', 'więc', 'zatem']
200 |     checks['20170917 conj na comp'] = {
201 |         TAGS: {
202 |             POSITIVE: [],
203 |             NEGATIVE: []
204 |         },
205 |         FORMS: {
206 |             POSITIVE: [],
207 |             NEGATIVE: []
208 |         }
209 |     }
210 |     for token in conj_to_comp:
211 |         checks['20170917 conj na comp'][FORMS][POSITIVE].append(lambda form, tag: form == token and tag == 'comp')
212 |         checks['20170917 conj na comp'][FORMS][NEGATIVE].append(lambda form, tag: form == token and tag == 'conj')
213 | 
214 |     checks['20171224 num:..:congr'] = {
215 |         TAGS: {
216 |             POSITIVE: [lambda tag: tag.startswith('num:') and tag.endswith(':congr')],
217 |             NEGATIVE: []
218 |         },
219 |         FORMS: {
220 |             POSITIVE: [],
221 |             NEGATIVE: []
222 |         }
223 |     }
224 | 
225 |     checks['20180722 adjp -> adjp:dat, adjp:gen; burk -> frag; qub -> part'] = {
226 |         TAGS: {
227 |             POSITIVE: [lambda tag: tag == 'adjp:dat',
228 |                        lambda tag: tag == 'adjp:gen',
229 |                        lambda tag: tag == 'frag',
230 |                        lambda tag: tag == 'part'],
231 |             NEGATIVE: [lambda tag: tag == 'adjp',
232 |                        lambda tag: tag == 'burk',
233 |                        lambda tag: tag == 'qub']
234 |         },
235 |         FORMS: {
236 |             POSITIVE: [],
237 |             NEGATIVE: []
238 |         }
239 |     }
240 | 
241 |     checks['NKJP tagset'] = {
242 |         TAGS: {
243 |             POSITIVE: [lambda tag: tag == 'interj',
244 |                        lambda tag: tag == 'adjc',
245 |                        lambda tag: tag == 'burk',
246 |                        lambda tag: tag == 'numcol'],
247 |             NEGATIVE: []
248 |         },
249 |         FORMS: {
250 |             POSITIVE: [],
251 |             NEGATIVE: []
252 |         }
253 |     }
254 | 
255 |     checks['dig'] = {
256 |         TAGS: {
257 |             POSITIVE: [],
258 |             NEGATIVE: [lambda tag: tag == 'dig']
259 |         },
260 |         FORMS: {
261 |             POSITIVE: [],
262 |             NEGATIVE: []
263 |         }
264 |     }
265 | 
266 |     checks['romandig'] = {
267 |         TAGS: {
268 |             POSITIVE: [],
269 |             NEGATIVE: [lambda tag: tag == 'romandig']
270 |         },
271 |         FORMS: {
272 |             POSITIVE: [],
273 |             NEGATIVE: []
274 |         }
275 |     }
276 | 
277 |     checks['blank'] = {
278 |         TAGS: {
279 |             POSITIVE: [],
280 |             NEGATIVE: [lambda tag: tag == 'blank']
281 |         },
282 |         FORMS: {
283 |             POSITIVE: [],
284 |             NEGATIVE: []
285 |         }
286 |     }
287 | 
288 |     checks['emoticon'] = {
289 |         TAGS: {
290 |             POSITIVE: [],
291 |             NEGATIVE: [lambda tag: tag == 'emoticon']
292 |         },
293 |         FORMS: {
294 |             POSITIVE: [],
295 |             NEGATIVE: []
296 |         }
297 |     }
298 | 
299 |     checks['emo'] = {
300 |         TAGS: {
301 |             POSITIVE: [],
302 |             NEGATIVE: [lambda tag: tag == 'emo']
303 |         },
304 |         FORMS: {
305 |             POSITIVE: [],
306 |             NEGATIVE: []
307 |         }
308 |     }
309 | 
310 |     checks['ign'] = {
311 |         TAGS: {
312 |             POSITIVE: [],
313 |             NEGATIVE: [lambda tag: tag == 'ign']
314 |         },
315 |         FORMS: {
316 |             POSITIVE: [],
317 |             NEGATIVE: []
318 |         }
319 |     }
320 | 
321 |     checks['morfeusz2 tags not in NKJP'] = {
322 |         TAGS: {
323 |             POSITIVE: [],
324 |             NEGATIVE: [lambda tag: tag == 'prefa',
325 |                        lambda tag: tag == 'prefppas',
326 |                        lambda tag: tag == 'prefs',
327 |                        lambda tag: tag == 'prefv',
328 |                        lambda tag: tag == 'nie',
329 |                        lambda tag: tag == 'naj',
330 |                        lambda tag: tag == 'cond',
331 |                        lambda tag: tag == 'substa']
332 |         },
333 |         FORMS: {
334 |             POSITIVE: [],
335 |             NEGATIVE: []
336 |         }
337 |     }
338 | 
339 | 
340 |     test_data = [
341 |         ('IV', '', 'num:::'),
342 |         ('IV', '', 'romandig'),
343 |         ('1', '', 'dig'),
344 |         ('prostu', 'adjp', 'adjp:gen'),
345 |         (':)', '', 'emo'),
346 |         ('godzien', 'adjc', ''),
347 |         ('oślep', 'burk', 'frag'),
348 |         ('obojga', 'numcol:pl:gen:m1:rec', ''),
349 |         ('dwoje', 'numcol:pl:acc:m1:rec', ''),
350 |         ('czworo', 'numcol:pl:nom:m1:rec', ''),
351 |         ('hej', 'interj', ''),
352 |         ('jeszcze', 'qub', 'part'),
353 |         ('czterem', 'num:pl:dat:m1:congr', ''),
354 |         ('czym', 'conj', 'comp'),
355 |         ('niedaleko', 'prep:gen', ''),
356 |         ('doprawdy', 'qub', 'adv'),
357 |         ('jak', 'qub', 'adv'),
358 |         ('pół', '', 'numcomp'),
359 |         ('pół', '', 'num:comp'),
360 |         ('pół', 'num:pl:acc:n:rec', ''),
361 |         ('słowa', 'subst:pl:acc:n', 'subst:sg:gen:n:ncol'),
362 |         ('rozklepywało', '', 'praet:sg:n1:ter:imperf'),
363 |         ('bardzo', 'adv:pos', 'adv'),
364 |         ('bardziej', 'adv:com', ''),
365 |         ('znacząco', 'adv:pos', 'pacta'),
366 |         ('my', '', 'ppron12:pl:nom:_:pri'),
367 |         ('sobie', 'siebie:dat', ''),
368 |         ('zł', 'brev:npun', 'brev'),
369 |     ]
370 | 
371 |     for formX, exist, not_exist in test_data:
372 |         ch={
373 |             TAGS: {
374 |                 POSITIVE: [],
375 |                 NEGATIVE: []
376 |             },
377 |             FORMS: {
378 |                 POSITIVE: [],
379 |                 NEGATIVE: []
380 |             }
381 |         }
382 |         if exist:
383 |             ch[FORMS][POSITIVE]=[lambda form, tag: form == formX and tag == exist]
384 |         if not_exist:
385 |             ch[FORMS][NEGATIVE]=[lambda form, tag: form == formX and tag == not_exist]
386 | 
387 |         checks[f"{formX}, {exist}, {not_exist}"]=ch
388 | 
389 |     for date, functions in checks.items():
390 |         print('Checking: %s' % date)
391 |         for i, function in enumerate(functions[TAGS][POSITIVE]):
392 |             if any([function(tag) for tag in stats_tags]):
393 |                 print('%s. +' % (i,))
394 |             else:
395 |                 print('%s. ?' % (i,))
396 |         for i, function in enumerate(functions[TAGS][NEGATIVE]):
397 |             if any([function(tag) for tag in stats_tags]):
398 |                 print('%s. -' % (i,))
399 |             else:
400 |                 print('%s. ?' % (i,))
401 |         for i, function in enumerate(functions[FORMS][POSITIVE]):
402 |             if any([function(form, tag) for form, tag in stats_forms]):
403 |                 print('%s. +' % (i,))
404 |             else:
405 |                 print('%s. ?' % (i,))
406 | 
407 |         for i, function in enumerate(functions[FORMS][NEGATIVE]):
408 |             if any([function(form, tag) for form, tag in stats_forms]):
409 |                 print('%s. -' % (i,))
410 |             else:
411 |                 print('%s. ?' % (i,))
412 |         print()
413 | 


--------------------------------------------------------------------------------
/clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | rm devlog_*.log
4 | rm log_*.log
5 | rm lemmatisation_*.pkl
6 | rm weight_*.hdf5*


--------------------------------------------------------------------------------
/create_dict.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pickle
 3 | from argparse import ArgumentParser
 4 | 
 5 | from tqdm import tqdm
 6 | import jsonlines
 7 | 
 8 | from krnnt.new import preprocess_paragraph_preanalyzed, \
 9 |     preprocess_paragraph_reanalyzed, serialize_sample_paragraph, create_dict
10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
11 | from krnnt.structure import Paragraph
12 | 
13 | if __name__ == '__main__':
14 |     parser = ArgumentParser(description='Create dictionary of features')
15 |     parser.add_argument('input_path', type=str, help='path to preprocessed data')
16 |     parser.add_argument('output_path', type=str, help='save path')
17 |     args = parser.parse_args()
18 | 
19 |     file = open(args.input_path, 'rb')
20 |     su = SerialUnpickler(file)
21 | 
22 |     unique_dict = create_dict(su)
23 | 
24 |     with open(args.output_path, 'wb') as file:
25 |         pickle.dump(unique_dict, file)
26 | 
27 |     with open(args.output_path+'.json','w') as file:
28 |         json.dump(unique_dict, file, ensure_ascii=False)


--------------------------------------------------------------------------------
/create_simple_lemmatization.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | sgjp=sys.argv[1]
 4 | 
 5 | print(sgjp)
 6 | 
 7 | def base_tag(tag):
 8 |     transformations = {
 9 |           'ger':  [(['pl'],'sg'), 
10 |                    (['gen','dat','acc','inst','loc','voc'],'nom')], 
11 |           'pact': [(['pl'],'sg'), 
12 |                    (['gen','dat','acc','inst','loc','voc'],'nom'), 
13 |                    (['m2','m3','f','n'], 'm1')],
14 |           'ppas': [(['pl'],'sg'), 
15 |                    (['gen','dat','acc','inst','loc','voc'],'nom'), 
16 |                    (['m2','m3','f','n'], 'm1')],
17 |     }
18 |     
19 |     tag=list(tag)
20 |     
21 |     if tag[0] not in transformations: return None
22 |     
23 |     transforms = transformations[tag[0]]
24 |     for sources, target in transforms:
25 |         for source in sources:
26 |             try:
27 |                 index = tag.index(source)
28 |                 tag[index]=target
29 |                 break
30 |             except ValueError:
31 |                 pass
32 |     return tag
33 |   
34 |   
35 |   
36 | 
37 | import itertools
38 | import tqdm
39 | 
40 | count=0
41 | 
42 | 
43 | lt={}
44 | for line in tqdm.tqdm(open(sgjp), total=7221123):
45 |     row = line.split('\t')[:-1]
46 | #     print(row)
47 |     try:
48 |         form, lemma, tag, other = row
49 |     except ValueError:
50 |         continue
51 |     
52 |     tags=[t.split('.') for t in tag.split(':')]
53 |     for tag in itertools.product(*tags):
54 |         if tag[0] in ['ger','ppas','pact']:
55 |             btag=tuple(base_tag(tag))
56 | #             print(tag, btag, form, lemma)
57 |             if btag == tag:
58 |                 count+=1
59 |                 lemma=lemma.rsplit(':')[0]
60 |                 lt[(lemma,tag)]=form
61 | 
62 | print(count)
63 | 
64 | import pickle
65 | pickle.dump(lt, open('data/ger_ppas_pact.pickle','wb'))


--------------------------------------------------------------------------------
/export_data.py:
--------------------------------------------------------------------------------
 1 | from krnnt.serial_pickle import SerialUnpickler
 2 | from tqdm import tqdm
 3 | 
 4 | from krnnt.structure import Paragraph
 5 | 
 6 | 
 7 | #!/usr/bin/env python
 8 | # -*- coding: utf-8 -*-
 9 | from argparse import ArgumentParser
10 | 
11 | from krnnt.serial_pickle import SerialUnpickler
12 | from krnnt.writers import get_output_converter
13 | 
14 | 
15 | def paragraph_to_result(paragraph: Paragraph):
16 | 
17 |     paragraph2=[]
18 |     for sentence in paragraph:
19 |         try:
20 |             sentence2=[]
21 |             paragraph2.append(sentence2)
22 |             for token in sentence:
23 |                 sentence2.append({
24 |                     'token':token.form,
25 |                     'sep':token.space_before,
26 |                     'tag': token.gold_form.tags,
27 |                     'lemmas': [token.gold_form.lemma],
28 |                 })
29 |         except AttributeError: #omit sentence if some token does no have gold tag
30 |             continue
31 |     return paragraph2
32 | 
33 | if __name__ == '__main__':
34 |     parser = ArgumentParser(description='Export data (before preprocessing) to format')
35 |     parser.add_argument('input_path', help='input path to data')
36 |     parser.add_argument('output_path', help='output path to data')
37 |     parser.add_argument('-f','--format', default='txt', help='output format')
38 | 
39 |     args = parser.parse_args()
40 | 
41 |     with open(args.input_path, 'rb') as file:
42 |         su = SerialUnpickler(file)
43 | 
44 |         converter=get_output_converter(args.format)
45 | 
46 |         string=converter((paragraph_to_result(paragraph_gold) for paragraph_gold in su))
47 | 
48 |         with open(args.output_path, 'w') as output_file:
49 |             output_file.write(string)
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/graph_log.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from argparse import ArgumentParser
 4 | import matplotlib.pyplot as plt
 5 | import re
 6 | 
 7 | if __name__ == '__main__':
 8 |     parser = ArgumentParser(description='Plots data for graph')
 9 |     parser.add_argument('output_path', help='output path to features dict')
10 |     parser.add_argument('input_path', help='input path to log data')
11 | 
12 | 
13 |     args = parser.parse_args()
14 | 
15 | 
16 |     test_scores=[]
17 |     dev_scores=[]
18 |     with open(args.input_path) as file:
19 |         for line in file:
20 |             m = re.search(r'\'val_score\', (.*?)\)', line)
21 |             if m is None:
22 |                 continue
23 |             test_score=float(m.group(1))
24 | 
25 |             m = re.search(r'\'dev_val_score\', (.*?)\)', line)
26 |             if m is None:
27 |                 continue
28 |             dev_scores += (float(m.group(1)),)
29 |             test_scores+=(test_score, )
30 | 
31 |     t=range(len(test_scores))
32 |     plt.plot(test_scores)
33 | 
34 |     if any([score!=0.0 for score in dev_scores]):
35 |         plt.plot(dev_scores)
36 |     plt.ylabel('some numbers')
37 |     plt.show()
38 | 
39 |     print('Test scores:')
40 |     for score in test_scores:
41 |         print(score)
42 | 
43 |     print('Dev scores:')
44 |     for score in dev_scores:
45 |         print(score)


--------------------------------------------------------------------------------
/join_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from argparse import ArgumentParser
 4 | 
 5 | from tqdm import tqdm
 6 | 
 7 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
 8 | 
 9 | if __name__ == '__main__':
10 |     parser = ArgumentParser(description='Join data')
11 |     parser.add_argument('output_path', help='output path to data')
12 |     parser.add_argument('input_paths', nargs='+', help='input paths to data')
13 | 
14 |     args = parser.parse_args()
15 | 
16 |     sp = SerialPickler(open(args.output_path, 'wb'))
17 |     for input_path in args.input_paths:
18 |         su = SerialUnpickler(open(input_path, 'rb'))
19 |         for paragraph in tqdm(su):
20 |             sp.add(paragraph)
21 |     sp.close()
22 | 


--------------------------------------------------------------------------------
/join_dicts.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import pickle
 4 | from argparse import ArgumentParser
 5 | 
 6 | if __name__ == '__main__':
 7 |     parser = ArgumentParser(description='Join features dicts')
 8 |     parser.add_argument('output_path', help='output path to features dict')
 9 |     parser.add_argument('input_paths', nargs='+', help='input paths to features dicts')
10 | 
11 |     parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
12 | 
13 |     args = parser.parse_args()
14 | 
15 |     if args.reproducible:
16 |         from numpy.random import seed
17 | 
18 |         seed(1337)
19 |         import random as rn
20 | 
21 |         rn.seed(1337)
22 | 
23 |     print(args.input_paths)
24 |     joined_unique_features_dict = None
25 |     for input_path in args.input_paths:
26 |         unique_features_dict = pickle.load(open(input_path, 'rb'))
27 | 
28 |         if joined_unique_features_dict is None:
29 |             joined_unique_features_dict = unique_features_dict
30 |         else:
31 |             for feature_name, dict2 in unique_features_dict.items():
32 | 
33 |                 if feature_name not in joined_unique_features_dict:
34 |                     joined_index = 0
35 |                 else:
36 |                     joined_index = max(joined_unique_features_dict[feature_name].values()) + 1
37 |                     assert joined_index == len(joined_unique_features_dict[feature_name])
38 | 
39 |                 for value, index in sorted(dict2.items(), key=lambda x: x[1]):
40 |                     if value not in joined_unique_features_dict[feature_name]:
41 |                         joined_unique_features_dict[feature_name][value] = joined_index
42 |                         joined_index += 1
43 | 
44 |     with open(args.output_path, 'wb') as file:
45 |         pickle.dump(joined_unique_features_dict, file)
46 | 


--------------------------------------------------------------------------------
/krnnt/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logging.getLogger(__name__).addHandler(logging.NullHandler())


--------------------------------------------------------------------------------
/krnnt/additional_format.py:
--------------------------------------------------------------------------------
1 | def additional_format(data, krnntx, morf):
2 |     raise NotImplementedError()
3 | 


--------------------------------------------------------------------------------
/krnnt/aglt.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | 
  4 | def startswith(token, prefixes):
  5 |     for prefix in prefixes:
  6 |         if token.lower().startswith(prefix.lower()):
  7 |             return True
  8 |     return False
  9 | 
 10 | 
 11 | def praet_or_winien(tag):
 12 |     return startswith(tag, ['praet', 'winien'])
 13 | 
 14 | 
 15 | def rule1(sentence):
 16 |     """
 17 |     Find immediate aglt after praet.
 18 |     """
 19 |     result = []
 20 | 
 21 |     for i, token in enumerate(sentence):
 22 | 
 23 |         separator = token['sep']
 24 |         tag = token['tag']
 25 | 
 26 |         if tag.startswith('aglt') and separator == 'none':
 27 |             if praet_or_winien(sentence[i - 1]['tag']):
 28 |                 result.append((i, i - 1, None))
 29 |             elif praet_or_winien(sentence[i - 2]['tag']) and sentence[i - 1]['token'] == 'by':
 30 |                 if sentence[i - 1]['sep'] == 'none':
 31 |                     result.append((i, i - 2, i - 1))
 32 |                 else:
 33 |                     print('błąd?')
 34 |     return result
 35 | 
 36 | def rule1b(sentence):
 37 |     """
 38 |     Find immediate aglt after praet.
 39 |     """
 40 |     result = []
 41 | 
 42 |     for i, token in enumerate(sentence):
 43 | 
 44 | 
 45 |         tag = token['tag']
 46 | 
 47 |         if praet_or_winien(tag):
 48 |             try:
 49 |                 next_token=sentence[i+1]
 50 |                 if next_token['tag'].startswith('aglt') and next_token['sep'] == 'none':
 51 |                     result.append((i+1, i, None))
 52 |                 elif next_token['tag']=='qub' and next_token['token'] == 'by' and next_token['sep'] == 'none':
 53 |                     try:
 54 |                         next_next_token=sentence[i+2]
 55 |                         if next_next_token['tag'].startswith('aglt') and next_next_token['sep'] == 'none':
 56 |                             result.append((i+2, i , i + 1))
 57 |                         else:
 58 |                             result.append((None, i, i + 1))
 59 |                     except IndexError:
 60 |                         result.append((None, i, i + 1))
 61 |             except IndexError:
 62 |                 pass
 63 |     return result
 64 | 
 65 | def rule3(sentence):
 66 |     """
 67 |     Find aglt and then praet as successor.
 68 |     """
 69 |     result = []
 70 | 
 71 |     for i, token in enumerate(sentence):
 72 |         tag = token['tag']
 73 |         if tag.startswith('aglt'):
 74 |             for j in range(i + 1, len(sentence)):
 75 |                 token2 = sentence[j]
 76 |                 if praet_or_winien(token2['tag']):
 77 |                     by_index=None
 78 |                     try:
 79 |                         by_token = sentence[i-1]
 80 |                         if by_token['tag']=='qub' and by_token['token']=='by':
 81 |                             by_index=i-1
 82 |                     except IndexError:
 83 |                         pass
 84 |                     result.append((i, j, by_index))
 85 |                     # print(sentence[i - 2:j + 2])
 86 |                     break
 87 |         elif tag == 'qub' and token['token']=='by':
 88 |             try:
 89 |                 if not sentence[i+1]['tag'].startswith('aglt'):
 90 |                     for j in range(i + 1, len(sentence)):
 91 |                         token2 = sentence[j]
 92 |                         if praet_or_winien(token2['tag']):
 93 |                             result.append((None, j, i))
 94 |                             break
 95 |             except IndexError:
 96 |                 pass
 97 | 
 98 |     return result
 99 | 
100 | 
101 | def rewrite_praet(aglt_token, praet_token, by_token=None):
102 |     """
103 |     Copy person from aglt to praet and change praet to cond.
104 |     """
105 |     praet_tags = list(praet_token['tag'].split(':'))
106 | 
107 |     # praet i aglt mają tę samą liczbę
108 |     if aglt_token is not None:
109 |         aglt_person = aglt_token['tag'].split(':')[2]
110 |         if aglt_token['tag'].split(':')[1] != praet_tags[1]:
111 |             logging.warning(
112 |                 'DIFFERENT NUMBER: %s %s %s %s' % (aglt_token['tag'].split(':')[1], praet_tags[1], aglt_token, praet_token))
113 |             return
114 |         praet_tags.insert(3, aglt_person)
115 | 
116 |     if by_token:
117 |         praet_tags[0] = 'cond'
118 |         if aglt_token is None:
119 |             praet_tags.insert(3, 'ter')
120 | 
121 |     praet_token['tag'] = ':'.join(praet_tags)
122 | 
123 | 
124 | def remove_tokens(sentence, aglt_indexes):
125 |     for i in sorted(aglt_indexes, reverse=True):
126 |         token = sentence[i]
127 | 
128 | 
129 |         #dołącz do formy poprzedzającego tokenu i popraw offsety
130 |         if token['sep']=='none':
131 |             previous_token = sentence[i-1]
132 |             previous_token['end']=token['end']
133 |             previous_token['token'] += token['token']
134 |             sentence.pop(i)
135 | 
136 | def remove_aglt(sentence, rules):
137 |     for rule_index, rule in enumerate(rules):
138 |         pairs = rule(sentence)
139 | 
140 |         for aglt_index, praet_index, by_index in pairs:
141 |             if by_index is None:
142 |                 by_token = None
143 |             else:
144 |                 by_token = sentence[by_index]
145 | 
146 |             if aglt_index is None:
147 |                 aglt_token = None
148 |             else:
149 |                 aglt_token = sentence[aglt_index]
150 |             rewrite_praet(aglt_token, sentence[praet_index], by_token)
151 | 
152 |         aglt_indexes = [aglt_index for aglt_index, praet_index, by_index in pairs] + [by_index for
153 |                                                                                       aglt_index, praet_index, by_index
154 |                                                                                       in pairs]
155 |         aglt_indexes = [x for x in aglt_indexes if x is not None]
156 |         remove_tokens(sentence, aglt_indexes)
157 | 
158 | 
159 | def remove_aglt_from_results(results, rules):
160 |     for paragraph in results:
161 |         for sentence in paragraph:
162 |             remove_aglt(sentence, rules)
163 |     return results
164 | 
165 | 
166 | def remove_aglt_from_results_rule1_3(results):
167 |     return remove_aglt_from_results(results, [rule1b, rule3])
168 | 


--------------------------------------------------------------------------------
/krnnt/aligner.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import logging
  3 | 
  4 | from krnnt.structure import Token, Sentence, Paragraph
  5 | 
  6 | 
  7 | def text(buffer):
  8 |     return ''.join([' '+token.form if (token.space_before is True or (token.space_before is not False and token.space_before!='none')) else token.form for token in buffer])
  9 | 
 10 | def align(pred, ref, ref_text_old=''):
 11 |     pred_buffer = [pred.pop(0)]
 12 |     ref_buffer = [ref.pop(0)]
 13 |     if ref_text_old:
 14 |         t = Token()
 15 |         t.form = ref_text_old
 16 |         t.space_before=False
 17 |         ref_buffer.insert(0, t)
 18 | 
 19 |     while pred_buffer or ref_buffer:
 20 |         pred_text = text(pred_buffer)
 21 |         ref_text = text(ref_buffer)
 22 |         # print("BUFFERS: [%s] [%s]" % ([str(x) for x in pred_buffer], [str(x) for x in ref_buffer]))
 23 |         # print("BUFFERS: [%s] [%s]" % (pred_text, ref_text))
 24 |         if len(pred_text) == len(ref_text):  # aligned
 25 |             if pred_text != ref_text:
 26 |                 print('alignment ERROR', pred_text, ref_text, ref, pred, file=sys.stderr)
 27 |                 logging.error("alignment ERROR")
 28 |             yield (pred_buffer, ref_buffer, ref_text[len(pred_text):])
 29 | 
 30 |             pred_buffer=[]
 31 |             ref_buffer = []
 32 | 
 33 |             #print(pred)
 34 |             if not pred or not ref:
 35 |                 #print('break', pred)
 36 |                 break
 37 | 
 38 |             pred_buffer = [pred.pop(0)]
 39 |             ref_buffer = [ref.pop(0)]
 40 |         elif len(pred_text) < len(ref_text):
 41 |             if pred:
 42 |                 pred_buffer.append(pred.pop(0))
 43 |             else:
 44 | 
 45 |                 print('break2', pred_text, ref_text, ref_text[len(pred_text):], file=sys.stderr)
 46 | 
 47 |                 # print([x.form for x in pred_buffer])
 48 |                 # print([x.space_before for x in pred_buffer])
 49 |                 # print([x.form for x in ref_buffer])
 50 |                 # print([x.space_before for x in ref_buffer])
 51 | 
 52 |                 #skroc ref_buffer
 53 |                 asd=[]
 54 |                 for x in ref_buffer:
 55 |                     asd.append(x)
 56 |                     if len(pred_text) >= len(text(asd)):
 57 | 
 58 |                         break
 59 |                 ref_buffer=asd
 60 |                 if len(pred_text) < len(text(asd)):
 61 |                     print('RRRR', asd[-1].form, file=sys.stderr)
 62 |                     asd[-1].form = asd[-1].form[:len(pred_text)-1]
 63 |                     print('RRRR', asd[-1].form, file=sys.stderr)
 64 |                 print(text(ref_buffer), 'XXX', text(ref), file=sys.stderr)
 65 | 
 66 | 
 67 |                 break
 68 |         else:
 69 |             if ref:
 70 |                 ref_buffer.append(ref.pop(0))
 71 |             else:
 72 |                 print('break3', file=sys.stderr)
 73 |                 break
 74 | 
 75 |     rest = ref_buffer # + ref
 76 |     if rest:
 77 |         yield (pred_buffer+pred, rest, ref_text[len(pred_text):])
 78 |     # print('rest', pred, ref)
 79 | 
 80 | def align_paragraphs(paragraph_reanalyzed: Paragraph, paragraph_gold: Paragraph) -> Paragraph:
 81 |     tokens_gold = []
 82 |     for sentence_gold in paragraph_gold:
 83 |         for token_gold in sentence_gold:
 84 |             tokens_gold.append(token_gold)
 85 |             token_gold.form = token_gold.form.replace('\xa0', ' ')  # "a j e n t a"
 86 | 
 87 | 
 88 |     ref_text_old = ''
 89 |     paragraph_reanalyzed.concraft = []
 90 |     for sentence_reanalyzed in paragraph_reanalyzed:
 91 |         # print('XXXXXXXXXXXXXXXXXXXXXXXXXXXNEW')
 92 |         sentence_reanalyzed_gold = Sentence()
 93 |         paragraph_reanalyzed.concraft.append(sentence_reanalyzed_gold)
 94 |         for p, r, ref_text_old in align([token for token in sentence_reanalyzed.tokens], tokens_gold, ref_text_old):
 95 | 
 96 |             if p:
 97 |                 for r1 in r:
 98 |                     sentence_reanalyzed_gold.add_token(r1)
 99 |                 if text(p) != text(r):
100 |                     print('ERR', [t.form for t in p], [t.form for t in r], file=sys.stderr)
101 |                 # if len(p)!=len(r):
102 |                 # print(text(p),'_____', text(r))
103 |                 # print(len(tokens_gold))
104 |                 if len(p) == len(r):
105 |                     for p1, r1 in zip(p, r):
106 |                         p1.gold_form = r1.gold_form
107 |     return paragraph_reanalyzed
108 | 
109 | def align_paragraphs2(paragraph_reanalyzed: Paragraph, paragraph_gold: Paragraph) -> Paragraph:
110 |     tokens_gold = []
111 |     for sentence_gold in paragraph_gold:
112 |         for token_gold in sentence_gold:
113 |             tokens_gold.append(token_gold)
114 |             token_gold.form = token_gold.form.replace('\xa0', ' ')  # "a j e n t a"
115 | 
116 | 
117 |     ref_text_old = ''
118 |     paragraph_reanalyzed.concraft = []
119 |     for sentence_reanalyzed in paragraph_reanalyzed:
120 |         # print('XXXXXXXXXXXXXXXXXXXXXXXXXXXNEW')
121 |         sentence_reanalyzed_gold = Sentence()
122 |         paragraph_reanalyzed.concraft.append(sentence_reanalyzed_gold)
123 |         for p, r, ref_text_old in align([token for token in sentence_reanalyzed.tokens], tokens_gold, ref_text_old):
124 | 
125 |             if p:
126 |                 for r1 in r:
127 |                     sentence_reanalyzed_gold.add_token(r1)
128 |                 if text(p) != text(r):
129 |                     print('ERR', [t.form for t in p], [t.form for t in r], file=sys.stderr)
130 |                 # if len(p)!=len(r):
131 |                 # print(text(p),'_____', text(r))
132 |                 # print(len(tokens_gold))
133 |                 if len(p) == len(r):
134 |                     for p1, r1 in zip(p, r):
135 |                         p1.interpretations = r1.interpretations
136 |     return paragraph_reanalyzed


--------------------------------------------------------------------------------
/krnnt/analyzers.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import re
  3 | import sys
  4 | from subprocess import PIPE, Popen
  5 | from typing import Iterable, Generator, List, Tuple
  6 | 
  7 | from krnnt.structure import Form, Token, Sentence, Paragraph
  8 | from krnnt.utils import uniq
  9 | 
 10 | try:
 11 |     from maca_analyse import maca_analyse
 12 | except ImportError:
 13 |     pass
 14 | 
 15 | 
 16 | # TODO morfeusz analyzer for pretokenized?
 17 | 
 18 | class MacaAnalyzer:
 19 |     def __init__(self, maca_config: str, toki_config_path: str = ''):
 20 |         self.maca_config = maca_config
 21 |         self.toki_config_path = toki_config_path
 22 |         self.configure()
 23 | 
 24 |     def _maca(self, text: str) -> Generator[str, None, None]:
 25 |         """
 26 |         Yields output of Maca by sentences,
 27 |         """
 28 |         raise NotImplementedError()
 29 | 
 30 |     def configure(self):
 31 |         if 'maca_analyse' in sys.modules:
 32 |             self._maca = self._maca_wrapper
 33 |         else:
 34 |             self._maca = self._maca_process
 35 | 
 36 |     def analyze(self, text: str) -> Paragraph:
 37 |         results = self._maca(text)
 38 | 
 39 |         paragraph_reanalyzed = Paragraph()
 40 |         for i, res in enumerate(results):
 41 |             result = self._parse(res)
 42 |             sentence_reanalyzed = Sentence()
 43 |             paragraph_reanalyzed.add_sentence(sentence_reanalyzed)
 44 |             for form, space_before, interpretations, start, end in result:
 45 |                 token_reanalyzed = Token()
 46 |                 sentence_reanalyzed.add_token(token_reanalyzed)
 47 |                 token_reanalyzed.form = form
 48 |                 token_reanalyzed.space_before = space_before  # != 'none'
 49 |                 interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t) for l, t in
 50 |                                    interpretations]  # remove senses
 51 |                 token_reanalyzed.interpretations = [Form(l.replace('_', ' '), t) for l, t in uniq(interpretations)]
 52 |                 token_reanalyzed.start = start
 53 |                 token_reanalyzed.end = end
 54 |         return paragraph_reanalyzed
 55 | 
 56 |     def _maca_process(self, text: str) -> Generator[str, None, None]:
 57 |         cmd = ['maca-analyse', '-c', self.maca_config, '-l'] #TODO: -l ?
 58 |         if self.toki_config_path:
 59 |             cmd.extend(['--toki-config-path', self.toki_config_path])
 60 |         p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
 61 | 
 62 |         self.text = text
 63 |         # self.text = '\n'.join(batch)
 64 |         self.last_offset = 0
 65 | 
 66 |         stdout = p.communicate(input=self.text.encode('utf-8'))[0]
 67 |         try:
 68 |             p.stdin.close()
 69 |         except BrokenPipeError:
 70 |             pass
 71 |         p.wait()
 72 |         if p.returncode != 0:
 73 |             raise Exception('Maca is not working properly')
 74 |         for i in stdout.decode('utf-8').split('\n\n'):
 75 |             if len(i) > 0:
 76 |                 yield i
 77 | 
 78 |     def _maca_wrapper(self, text: str) -> Generator[str, None, None]:
 79 |         # self.text = '\n'.join(batch)
 80 |         self.text = text
 81 |         self.last_offset = 0
 82 | 
 83 |         output_text = maca_analyse(self.maca_config, self.toki_config_path, self.text, False, False)
 84 | 
 85 |         for i in output_text.split('\n\n'):
 86 |             if len(i) > 0:
 87 |                 yield i
 88 | 
 89 |     def _parse(self, output: str) -> List[Tuple[str, str, List[Tuple[str, str]], int, int]]:
 90 |         """
 91 |         Parses one sentence output of Maca.
 92 |         """
 93 |         data = []
 94 |         lemma_lines = []
 95 |         token_line = None
 96 |         for line in output.split("\n"):
 97 |             if line.startswith("\t"):
 98 |                 lemma_lines.append(line)
 99 |             else:
100 |                 if token_line is not None:
101 |                     data.append((token_line, lemma_lines))
102 |                     lemma_lines = []
103 |                 token_line = line
104 |         data.append((token_line, lemma_lines))
105 | 
106 |         tokens = []
107 | 
108 |         for index, (token_line, lemma_lines) in enumerate(data):
109 |             token = self._construct(token_line, lemma_lines)  # 80%
110 |             if token is None: continue
111 |             form, space_before, interpretations = token
112 |             start = self.text.index(form, self.last_offset)
113 |             end = start + len(form)
114 |             self.last_offset = end
115 |             tokens.append((form, space_before, interpretations, start, end))
116 | 
117 |         return tokens
118 | 
119 |     def _construct(self, token_line: str, lemma_lines: Iterable[str]) -> Tuple[str, str, List[Tuple[str, str]]]:
120 |         try:
121 |             if token_line == '': return None
122 |             form, space_before = token_line.split("\t")
123 |         except ValueError:
124 |             logging.exception("Probably Maca is not working.")
125 |             raise Exception('Probably Maca is not working.')
126 | 
127 |         interpretations = []
128 | 
129 |         for lemma_line in lemma_lines:
130 |             row = lemma_line.strip().split("\t")
131 |             try:
132 |                 lemma, tags, _ = row  # 30%
133 |                 # disamb = True
134 |             except ValueError:
135 |                 lemma, tags = row  # 16%
136 |                 # disamb = False
137 |             interpretation = (lemma, tags)
138 |             # lemma.disamb=disamb
139 |             interpretations.append(interpretation)
140 | 
141 |         return form, space_before, interpretations
142 | 


--------------------------------------------------------------------------------
/krnnt/blanks.py:
--------------------------------------------------------------------------------
 1 | def remove_blanks_from_results(results):
 2 |     for paragraph in results:
 3 |         for sentence in paragraph:
 4 |             remove_blanks(sentence)
 5 |     return results
 6 | 
 7 | def remove_blanks(sentence):
 8 |     """
 9 | 
10 |     """
11 |     result = []
12 | 
13 |     i=1
14 |     while i<len(sentence):
15 |         token=sentence[i]
16 |         tag = token['tag']
17 |         if tag=='blank':
18 |             join_token_to_previous(sentence, i)
19 |         else:
20 |             i+=1
21 |     return result
22 | 
23 | def join_token_to_previous(sentence, token_id):
24 |     previous_token = sentence[token_id - 1]
25 |     token = sentence[token_id]
26 | 
27 |     previous_token['end'] = token['end']
28 |     previous_token['token'] += token['token']
29 | 
30 |     #TODO: lemmas?
31 | 
32 |     sentence.pop(token_id)


--------------------------------------------------------------------------------
/krnnt/features.py:
--------------------------------------------------------------------------------
  1 | from typing import Iterable, List
  2 | 
  3 | from krnnt.utils import uniq, flatten, shape
  4 | 
  5 | try:
  6 |     import krnnt_utils
  7 |     from krnnt_utils import shape
  8 | except:
  9 |     pass
 10 | 
 11 | class FeaturePreprocessor:
 12 |     qubs = {'a', 'abo', 'aby', 'akurat', 'albo', 'ale', 'amen', 'ani', 'aż', 'aza', 'bądź', 'blisko', 'bo', 'bogać',
 13 |             'by', 'byle', 'byleby', 'choć', 'choćby', 'chociaż', 'chociażby', 'chyba', 'ci', 'co', 'coś', 'czy',
 14 |             'czyli', 'czyż', 'dalibóg', 'dobra', 'dokładnie', 'doprawdy', 'dość', 'dosyć', 'dziwna', 'dziwniejsza',
 15 |             'gdyby', 'gdzie', 'gdzieś', 'hale', 'i', 'ino', 'istotnie', 'jakby', 'jakoby', 'jednak', 'jedno', 'jeno',
 16 |             'koło', 'kontra', 'lada', 'ledwie', 'ledwo', 'li', 'maksimum', 'minimum', 'może', 'najdziwniejsza',
 17 |             'najmniej', 'najwidoczniej', 'najwyżej', 'naturalnie', 'nawzajem', 'ni', 'niby', 'nie', 'niechaj',
 18 |             'niejako', 'niejakoś', 'no', 'nuż', 'oczywiście', 'oczywista', 'okay', 'okej', 'około', 'oto', 'pewnie',
 19 |             'pewno', 'podobno', 'ponad', 'ponoś', 'prawda', 'prawie', 'przecie', 'przeszło', 'raczej', 'skąd',
 20 |             'skądinąd', 'skądże', 'szlus', 'ta', 'taj', 'tak', 'tam', 'też', 'to', 'toż', 'tuż', 'tylko', 'tylo',
 21 |             'widocznie', 'właśnie', 'wprost', 'wręcz', 'wszakże', 'wszelako', 'za', 'zaledwie', 'zaledwo', 'żali',
 22 |             'zaliż', 'zaraz', 'że', 'żeby', 'zwłaszcza'}
 23 |     safe_chars = {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', u'?', u'-', u'a', u'ą', u'c', u'ć', u'b', u'e',
 24 |                   u'ę', u'd', u'g', u'f', u'i', u'h', u'k', u'j', u'm', u'l', u'ł', u'o', u'ó', u'n', u'ń', u'q', u'p',
 25 |                   u's', u'ś', u'r', u'u', u't', u'w', u'y', u'x', u'z', u'ź', u'ż'}
 26 | 
 27 |     @staticmethod
 28 |     def nic(form: str, features: dict=None) -> List[str]:
 29 |         return ['NIC']
 30 | 
 31 |     @staticmethod
 32 |     def interps(form, features) -> List[str]:
 33 |         if 'interp' in features['tags'] and len(form) == 1:
 34 |             return [form]
 35 |         else:
 36 |             return []
 37 | 
 38 |     @staticmethod
 39 |     def qubliki(form, features=None) -> List[str]:
 40 |         if form.lower() in FeaturePreprocessor.qubs:
 41 |             return [form.lower()] #TODO: form.lower()
 42 |         else:
 43 |             return []
 44 | 
 45 |     @staticmethod
 46 |     def shape(form, features=None) -> List[str]:
 47 |         # print(form, shape(form))
 48 |         return [shape(form)]
 49 | 
 50 |     @staticmethod
 51 |     def prefix(n, form, features=None) -> List[str]:
 52 |         try:
 53 |             char = form[n].lower()
 54 |             if char not in FeaturePreprocessor.safe_chars:
 55 |                 char = '??'
 56 |         except IndexError:
 57 |             char = 'xx'
 58 | 
 59 |         return ['P' + str(n) + char]
 60 | 
 61 |     @staticmethod
 62 |     def prefix1(form, features=None) -> List[str]:
 63 |         return FeaturePreprocessor.prefix(0, form, features)
 64 | 
 65 |     @staticmethod
 66 |     def prefix2(form, features=None) -> List[str]:
 67 |         return FeaturePreprocessor.prefix(1, form, features)
 68 | 
 69 |     @staticmethod
 70 |     def prefix3(form, features=None) -> List[str]:
 71 |         return FeaturePreprocessor.prefix(2, form, features)
 72 | 
 73 |     @staticmethod
 74 |     def suffix(n, form, features=None) -> List[str]:
 75 |         try:
 76 |             char = form[-n].lower()
 77 |             if char not in FeaturePreprocessor.safe_chars:
 78 |                 char = '??'
 79 |         except IndexError:
 80 |             char = 'xx'
 81 | 
 82 |         return ['S' + str(n) + char]
 83 | 
 84 |     @staticmethod
 85 |     def suffix1(form, features=None) -> List[str]:
 86 |         return FeaturePreprocessor.suffix(1, form, features)
 87 | 
 88 |     @staticmethod
 89 |     def suffix2(form, features=None) -> List[str]:
 90 |         return FeaturePreprocessor.suffix(2, form, features)
 91 | 
 92 |     @staticmethod
 93 |     def suffix3(form, features=None) -> List[str]:
 94 |         return FeaturePreprocessor.suffix(3, form, features)
 95 | 
 96 | 
 97 | class TagsPreprocessorCython:
 98 |     @staticmethod
 99 |     def create_tags4_without_guesser(tags, features=None) -> List[str]:
100 |         return krnnt_utils.create_tags4_without_guesser(tags)
101 | 
102 |     @staticmethod
103 |     def create_tags5_without_guesser(tags, features=None) -> List[str]:
104 |         return krnnt_utils.create_tags5_without_guesser(tags)
105 | 
106 | 
107 | class TagsPreprocessor:
108 |     cas = ['nom', 'gen', 'dat', 'acc', 'inst', 'loc', 'voc']
109 |     per = ['pri', 'sec', 'ter']
110 |     nmb = ['sg', 'pl']
111 |     gnd = ['m1', 'm2', 'm3', 'f', 'n']
112 | 
113 |     @staticmethod
114 |     def create_tags4(tags, features=None, keep_guesser=True) -> List[str]:  # concraft
115 |         if not keep_guesser and 'ign' in tags:
116 |             return ['ign']
117 |             # return ['1ign','2ign','1subst:nom','2subst:sg:f','1adj:nom','1subst:gen','2subst:sg:n','2subst:sg:m1','2adj:sg:m3:pos','2subst:sg:m3','1num:acc','2num:pl:m3:rec','1brev','2adj:sg:n:pos','2num:pl:m3:congr','1num:nom','1adj:gen','1adj:loc']
118 |         return uniq(flatten(map(lambda tag: TagsPreprocessor.create_tag4(tag), tags)))
119 | 
120 |     @staticmethod
121 |     def create_tags4_without_guesser(tags, features=None) -> List[str]:
122 |         return TagsPreprocessor.create_tags4(tags, features=features, keep_guesser=False)
123 | 
124 |     @staticmethod
125 |     def create_tag4(otag, features=None) -> List[str]:
126 |         tags = flatten(map(lambda x: x.split('.'), otag.split(':')))
127 |         pos = tags[0]
128 |         tags = tags[1:]
129 |         tags2 = []
130 | 
131 |         first = None
132 |         for tag in tags:
133 |             if tag in TagsPreprocessor.cas or tag in TagsPreprocessor.per:
134 |                 first = tag
135 |                 break
136 | 
137 |         if first:
138 |             tags.remove(first)
139 |             tags2.append('1' + pos + ':' + first)
140 |         else:
141 |             tags2.append('1' + pos)  # TODO sprawdzic
142 | 
143 |         tags2.append('2' + (':'.join([pos] + tags)))
144 | 
145 |         # print otag, tags2
146 |         return uniq(tags2)
147 | 
148 |     @staticmethod
149 |     def create_tags5(tags, features=None, keep_guesser=True) -> List[str]:  # concraft
150 |         if not keep_guesser and 'ign' in tags:
151 |             return ['ign']
152 |             # return ['ign','sg:loc:m3','sg:nom:n','pl:nom:m3','pl:acc:m3','loc','sg:gen:m3','pl:gen:m3','sg:nom:m1','sg:nom:m3','gen','nom','acc','sg:nom:f']
153 | 
154 |         return uniq(flatten(map(lambda tag: TagsPreprocessor.create_tag5(tag), tags)))
155 | 
156 |     @staticmethod
157 |     def create_tags5_without_guesser(tags, features=None) -> List[str]:
158 |         return TagsPreprocessor.create_tags5(tags, features=features, keep_guesser=False)
159 | 
160 |     @staticmethod
161 |     def create_tag5(otag, features=None) -> List[str]:
162 | 
163 |         tags = flatten(map(lambda x: x.split('.'), otag.split(':')))
164 | 
165 |         tags_out = []
166 |         tags2 = []
167 |         tags3 = []
168 |         for tag in tags:
169 |             if tag in TagsPreprocessor.nmb:
170 |                 tags2.append(tag)
171 |             elif tag in TagsPreprocessor.cas:
172 |                 tags2.append(tag)
173 |                 tags3.append(tag)
174 |             elif tag in TagsPreprocessor.gnd:
175 |                 tags2.append(tag)
176 | 
177 |         for tagsX in [tags2, tags3]:
178 |             if tagsX:
179 |                 tags_out.append(':'.join(tagsX))
180 | 
181 |         return uniq(tags_out)
182 | 
183 | def create_token_features(token, tags, space_before) -> List[str]: #TODO
184 |     f = []
185 |     f+=FeaturePreprocessor.interps(token, {'tags':tags})
186 |     f+=FeaturePreprocessor.qubliki(token)
187 |     f+=FeaturePreprocessor.shape(token)  # 90%
188 |     f+=FeaturePreprocessor.prefix1(token)
189 |     f+=FeaturePreprocessor.prefix2(token)
190 |     f+=FeaturePreprocessor.prefix3(token)
191 |     f+=FeaturePreprocessor.suffix1(token)
192 |     f+=FeaturePreprocessor.suffix2(token)
193 |     f+=FeaturePreprocessor.suffix3(token)
194 |     f+=TagsPreprocessorCython.create_tags4_without_guesser(
195 |         tags)  # 3% moze cache dla wszystkich tagów
196 |     f+=TagsPreprocessorCython.create_tags5_without_guesser(tags)  # 3%
197 |     f+=space_before
198 | 
199 |     return f


--------------------------------------------------------------------------------
/krnnt/keras_models.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import uuid
 3 | from typing import Dict
 4 | 
 5 | import keras
 6 | from keras.layers import Dense, Dropout, Input, GRU, TimeDistributed, \
 7 |     Masking
 8 | from keras.layers.wrappers import Bidirectional
 9 | from keras.models import Model
10 | 
11 | 
12 | class ExperimentParameters:
13 |     def __init__(self, pref: Dict, testing=False):
14 |         self.pref = pref.copy()
15 |         if testing:
16 |             pass  # TODO self.h
17 |         else:
18 |             if 'h' not in self.pref:
19 |                 self.pref['h'] = str(uuid.uuid1())
20 |             self.h = self.pref['h']
21 |             self.pref['weight_path'] = 'weight_' + self.h + '.hdf5'
22 |             self.pref['lemmatisation_path'] = 'lemmatisation_' + self.h + '.pkl'
23 | 
24 |     def save_prefs(self):
25 |         # TODO
26 |         print(self.pref)
27 | 
28 | 
29 | class KerasModel:
30 |     model: Model
31 | 
32 |     def __init__(self, parameters: ExperimentParameters):
33 |         self.parameters = parameters
34 | 
35 |     def compile(self):
36 |         logging.info('Model compiling')
37 |         self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
38 |         logging.info('Model compiled')
39 | 
40 |     def make_predict_func(self):
41 |         self.model._make_predict_function()
42 | 
43 |     def load_weights(self, path):
44 |         self.model.load_weights(path)
45 |         logging.info('Weights loaded')
46 | 
47 |     def load_model(self, path):
48 |         self.model = keras.models.load_model(path)
49 | 
50 |     def yaml_model(self):
51 |         model_yaml = self.model.to_yaml()
52 |         # TODO
53 |         return model_yaml
54 | 
55 |     def create_model(self):
56 |         raise NotImplementedError
57 | 
58 | 
59 | class BEST(KerasModel):
60 |     def __init__(self, parameters):
61 |         super().__init__(parameters)
62 | 
63 |     def create_model(self):
64 |         features_length = self.parameters.pref['features_length']
65 | 
66 |         inputs = Input(shape=(None, features_length))
67 |         x = inputs
68 |         x = Masking(mask_value=0., input_shape=(None, features_length))(x)
69 |         x = Bidirectional(
70 |             GRU(self.parameters.pref['internal_neurons'], return_sequences=True, dropout=0.0, recurrent_dropout=0.5,
71 |                 implementation=1), input_shape=(None, features_length))(x)
72 |         x = Bidirectional(
73 |             GRU(self.parameters.pref['internal_neurons'], return_sequences=True, dropout=0.0, recurrent_dropout=0.5,
74 |                 implementation=1), input_shape=(None, features_length))(x)
75 |         x = Dropout(0.5)(x)
76 |         x = TimeDistributed(Dense(self.parameters.pref['output_length'], activation='softmax'))(x)
77 | 
78 |         self.model = Model(inputs=inputs, outputs=x)
79 | 
80 |         self.loss = 'categorical_crossentropy'
81 |         self.optimizer = keras.optimizers.Nadam()
82 | 


--------------------------------------------------------------------------------
/krnnt/pipeline.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import math
  3 | import pickle
  4 | import re
  5 | import sys
  6 | from typing import List, Iterable, Generator, Union
  7 | 
  8 | from krnnt.analyzers import MacaAnalyzer
  9 | from krnnt.structure import Paragraph
 10 | 
 11 | from .keras_models import ExperimentParameters, KerasModel
 12 | from krnnt.utils import uniq
 13 | from .new import k_hot, UniqueFeaturesValues, Lemmatisation, Lemmatisation2
 14 | from krnnt.features import create_token_features
 15 | 
 16 | sys.setrecursionlimit(10000)
 17 | 
 18 | from keras.preprocessing import sequence
 19 | import numpy as np
 20 | import krnnt_utils
 21 | 
 22 | 
 23 | class KRNNTSingle:
 24 |     def __init__(self, pref):
 25 |         self.pref = pref
 26 |         self.unique_features_dict = pickle.load(open(pref['UniqueFeaturesValues'], 'rb'))
 27 |         self.km = KerasThread.create_model(pref, testing=True)
 28 |         self.lemmatisation = pref['lemmatisation_class']()
 29 |         self.lemmatisation.load(pref['lemmatisation_path'])
 30 | 
 31 |         self.configure()
 32 | 
 33 |     def tag_sentence(self, sentence: str, preana=False):
 34 |         return self.__tag([sentence], preana)
 35 | 
 36 |     def tag_sentences(self, sentences: List[str], preana=False):
 37 |         return self.__tag(sentences, preana)
 38 | 
 39 |     def tag_sentences_preana(self, sentences: List[Paragraph]):
 40 |         return self.__tag(sentences, preana=True)
 41 | 
 42 |     def tag_paragraphs(self, paragraphs: Iterable[str], preana=False):
 43 |         return self.__tag_paragraphs(paragraphs, preana)
 44 | 
 45 |     def __tag_paragraphs(self, paragraphs: Iterable[str], preana):
 46 | 
 47 | 
 48 |         if preana:
 49 |             sequences = Preprocess.process_batch_preana(enumerate(paragraphs))
 50 |         else:
 51 |             sequences = Preprocess.process_batch(paragraphs, self.pref['maca_config'], self.pref['toki_config_path'])
 52 | 
 53 |         # batch_size=math.ceil(len_sequences/max(math.floor(len_sequences/self.pref['keras_batch_size']), 1)) # dynamic batch
 54 | 
 55 |         result = []
 56 |         for batch in chunk(sequences, self.pref['keras_batch_size']):
 57 |             pad_batch = self.pad(batch, self.unique_features_dict, 'tags4e3')
 58 |             preds = self.km.model.predict_on_batch(pad_batch)
 59 |             for plain in KerasThread.return_results(batch, preds, self.km.classes, self.lemmatisation):
 60 |                 result.append(plain)
 61 | 
 62 | 
 63 |         # podziel na paragrafy
 64 |         result2=[]
 65 |         result_paragraph=[]
 66 |         for sentence in result:
 67 |             if not result_paragraph or sentence[0]['document_id']==result_paragraph[-1][0]['document_id']:
 68 |                 result_paragraph+= (sentence, )
 69 |             else:
 70 |                 result2+=(result_paragraph,)
 71 |                 result_paragraph=[sentence]
 72 | 
 73 |         if result_paragraph:
 74 |             result2 += (result_paragraph,)
 75 | 
 76 |         return result2
 77 | 
 78 |     def configure(self):
 79 |         if 'krnnt_utils' in sys.modules:
 80 |             self.pad = krnnt_utils.pad
 81 |         else:
 82 |             self.pad = Preprocess.pad
 83 | 
 84 |     def __tag(self, sentences: List[str], preana: bool):
 85 |         if preana:
 86 |             sequences = Preprocess.process_batch_preana(enumerate(sentences))
 87 |         else:
 88 |             sequences = Preprocess.process_batch(sentences, self.pref['maca_config'], self.pref['toki_config_path'])
 89 | 
 90 |         # batch_size=math.ceil(len_sequences/max(math.floor(len_sequences/self.pref['keras_batch_size']), 1)) # dynamic batch
 91 | 
 92 |         result = []
 93 |         for batch in chunk(sequences, self.pref['keras_batch_size']):
 94 |             pad_batch = self.pad(batch, self.unique_features_dict, 'tags4e3')
 95 |             preds = self.km.model.predict_on_batch(pad_batch)
 96 |             for plain in KerasThread.return_results(batch, preds, self.km.classes, self.lemmatisation):
 97 |                 result.append(plain)
 98 | 
 99 |         return result
100 | 
101 | 
102 | class Sample:
103 |     def __init__(self):
104 |         self.features = {}
105 | 
106 | 
107 | class Preprocess:
108 |     @staticmethod
109 |     def create_features(sequence: List[Sample]):
110 |         for sample in sequence:
111 |             sample.features['tags4e3'] = create_token_features(sample.features['token'], sample.features['tags'],
112 |                                                                sample.features['space_before'])
113 | 
114 |     @staticmethod
115 |     def process_batch(documents: Iterable[str], maca_config: str, toki_config_path: str) -> Generator[
116 |         List[Sample], None, None]:
117 |         maca_analyzer = MacaAnalyzer(maca_config, toki_config_path)
118 | 
119 |         for document_id, document in enumerate(documents):
120 |             results = maca_analyzer._maca(document)
121 | 
122 |             for res in results:
123 |                 result = maca_analyzer._parse(res)
124 | 
125 |                 sequence = []
126 |                 for form, space_before, interpretations, start, end in result:
127 |                     sample = Sample()
128 |                     sequence.append(sample)
129 |                     sample.features['token'] = form
130 |                     sample.features['tags'] = uniq([t for l, t in interpretations])
131 |                     interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t) for l, t in
132 |                                        interpretations]
133 |                     sample.features['maca_lemmas'] = [(l.replace('_', ' '), t) for l, t in uniq(interpretations)]
134 | 
135 |                     # TODO: cleanup space before
136 |                     sample.features['space_before'] = ['space_before'] if space_before !='none' else [
137 |                         'no_space_before']
138 |                     sample.features['space_before'].append(space_before)
139 |                     sample.features['start'] = start
140 |                     sample.features['end'] = end
141 |                     sample.features['document_id'] = document_id
142 |                 Preprocess.create_features(sequence)
143 | 
144 |                 if sequence:
145 |                     yield sequence
146 | 
147 |     @staticmethod
148 |     def process_batch_preana(batch: Iterable[Paragraph]) -> Generator[List[Sample], None, None]:
149 |         for document_id, paragraph in batch:
150 |             for sentence in paragraph:
151 |                 sequence = []
152 |                 for token in sentence:
153 |                     sample = Sample()
154 |                     sequence.append(sample)
155 |                     sample.features['token'] = token.form
156 |                     sample.features['tags'] = uniq([form.tags for form in token.interpretations])
157 |                     sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations])
158 |                     sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before']
159 |                     sample.features['space_before'].append(token.space_before)
160 |                     sample.features['document_id'] = document_id
161 |                 Preprocess.create_features(sequence)
162 | 
163 |                 if sequence:
164 |                     yield sequence
165 | 
166 |     @staticmethod
167 |     def pad(batch: List[List[Sample]], unique_features_dict, feature_name: str):
168 |         if not batch:
169 |             return []
170 | 
171 |         result_batchX = []
172 |         for sentence in batch:
173 |             X_sentence = []
174 |             for sample in sentence:
175 |                 X_sentence.append(np.array(k_hot(sample.features[feature_name], unique_features_dict[feature_name])))
176 | 
177 |             result_batchX.append(X_sentence)
178 | 
179 |         return sequence.pad_sequences(result_batchX)
180 | 
181 | 
182 | def chunk(l: Iterable, batch_size: int) -> List:
183 |     batch = []
184 |     for element in l:
185 |         batch.append(element)
186 |         if len(batch) == batch_size:
187 |             yield batch
188 |             batch = []
189 |     if batch:
190 |         yield batch
191 | 
192 | 
193 | class KerasThread():
194 | 
195 |     @staticmethod
196 |     def create_model(pref, testing=False) -> KerasModel:
197 |         keras_model_class = pref['keras_model_class']
198 | 
199 |         parameters = ExperimentParameters(pref, testing)
200 | 
201 |         km = keras_model_class(parameters)
202 | 
203 |         if 'UniqueFeaturesValues' in pref:
204 |             km.unique_features_dict = pickle.load(open(pref['UniqueFeaturesValues'], 'rb'))
205 |         else:
206 |             # data_path = 'nkjp_paragraphs_shuffled_concraft.spickle_FormatData_PreprocessData'
207 |             data_path = pref['data_path']
208 |             km.unique_features_dict = UniqueFeaturesValues(data_path).get()
209 | 
210 |         unique_tags_dict = km.unique_features_dict[pref['label_name']]
211 |         km.classes = list(map(lambda k: k[0], sorted(unique_tags_dict.items(), key=lambda k: k[1])))
212 |         pref = km.parameters.pref
213 |         pref['features_length'] = len(km.unique_features_dict[pref['feature_name']])
214 |         pref['output_length'] = len(km.unique_features_dict[pref['label_name']])
215 | 
216 |         km.create_model()
217 |         # self.km.load_weights('weight_7471898792961270266.hdf5')
218 |         # km.load_weights('weight_7471898792961270266.hdf5')
219 |         # km.load_weights('../artykul/compare/train_on_all.weights')
220 |         km.load_weights(pref['weight_path'])
221 |         km.compile()
222 | 
223 |         return km
224 | 
225 |     @staticmethod
226 |     def return_results(sentences: List[List[Sample]], preds, classes: List[str],
227 |                        lemmatisation: Union[Lemmatisation, Lemmatisation2]):
228 |         for sentence, preds2 in zip(sentences, preds):  # TODO sentences
229 |             # print(preds2.shape)
230 |             # print(preds2)
231 | 
232 |             response = []
233 | 
234 |             preds3 = preds2.argmax(axis=-1)
235 |             preds3max = preds2.max(axis=-1)
236 |             # print(len(sentence), len(preds3))
237 |             first = True
238 |             for sample, max_index, prob in zip(sentence, list(preds3)[-len(sentence):],
239 |                                                list(preds3max)[-len(sentence):]):
240 |                 # print(sample.features, max_index)
241 |                 # max_index, max_value = max(enumerate(d), key=lambda x: x[1])
242 | 
243 |                 token_response = {}
244 |                 response.append(token_response)
245 |                 predicted_tag = classes[max_index]
246 | 
247 |                 # TODO
248 |                 if sample.features['space_before'] == ['space_before']:
249 |                     sep = 'space'
250 |                 else:
251 |                     sep = 'none'
252 | 
253 |                 if 'newline' in sample.features['space_before'] or 'newlines' in sample.features['space_before']:
254 |                     sep = 'newline'
255 |                 elif 'space' in sample.features['space_before'] or 'spaces' in sample.features['space_before']:
256 |                     sep = 'space'
257 |                 elif 'none' in sample.features['space_before']:
258 |                     sep = 'none'
259 | 
260 |                 # print(sample.features['token']+'\t'+sep)
261 |                 # response.append(sample.features['token']+'\t'+sep)
262 |                 token_response['token'] = sample.features['token']
263 |                 token_response['sep'] = sep
264 |                 token_response['prob'] = float(prob)
265 |                 token_response['document_id'] = sample.features['document_id']
266 | 
267 |                 lemmas = [x for x in sample.features['maca_lemmas']]
268 |                 token_response['tag'] = predicted_tag
269 |                 token_response['lemmas'] = []
270 |                 try:
271 |                     token_response['start'] = sample.features['start']
272 |                     token_response['end'] = sample.features['end']
273 |                 except KeyError:
274 |                     token_response['start'] = None
275 |                     token_response['end'] = None
276 | 
277 |                 # if not lemmas:
278 |                 #    lemmas.append((sample.features['token'], predicted_tag))
279 |                 lemma = lemmatisation.disambiguate(token_response['token'], lemmas, predicted_tag)
280 | 
281 |                 token_response['lemmas'].append(lemma)
282 | 
283 |                 # if lemmas:
284 |                 #     for l, t in lemmas:
285 |                 #         #print('\t'+l+'\t'+t+'\tdisamb')
286 |                 #         #response.append('\t'+l+'\t'+t+'\tdisamb')
287 |                 #         token_response['lemmas'].append(l)
288 |                 # else:
289 |                 #     #print('\t'+sample.features['token']+'\t'+predicted_tag+'\tdisamb')
290 |                 #     #response.append('\t'+sample.features['token']+'\t'+predicted_tag+'\tdisamb')
291 |                 #     token_response['lemmas'].append(sample.features['token'])
292 | 
293 |                 first = False
294 |             # print()
295 |             # response.append('')
296 | 
297 |             yield response
298 | 


--------------------------------------------------------------------------------
/krnnt/readers.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import sys
  3 | from typing import Generator
  4 | from xml.etree import ElementTree as ET
  5 | 
  6 | import jsonlines
  7 | 
  8 | from krnnt.structure import Paragraph, Sentence, Token, Form
  9 | 
 10 | 
 11 | def read_xces(file_path: str) -> Paragraph:
 12 |     paragraphs_defined = True
 13 |     ns=False #no separator
 14 |     first_chunk=True
 15 | 
 16 |     for event, elem in ET.iterparse(file_path, events=("start","end",)):
 17 |         if first_chunk and event=="start" and elem.tag in ('chunk','sentence'):
 18 |             if elem.get('type') == 's' or elem.tag =='sentence':
 19 |                 paragraphs_defined = False
 20 |             first_chunk=False
 21 |         elif event=="end" and elem.tag in ('chunk','sentence'):
 22 |             xml_sentences=[]
 23 |             paragraph=Paragraph()
 24 |             if paragraphs_defined and elem.tag == 'chunk' and elem.get('type')!='s':
 25 |                 xml_sentences = elem.getchildren()
 26 |             elif (not paragraphs_defined) and ((elem.tag == 'chunk' and elem.get('type')=='s') or elem.tag == 'sentence'):
 27 |                 xml_sentences = [elem]
 28 |             else:
 29 |                 continue
 30 | 
 31 |             for sentence_index, xml_sentence in enumerate(xml_sentences):
 32 |                 sentence=Sentence()
 33 |                 paragraph.add_sentence(sentence)
 34 |                 for token_index, xml_token in enumerate(xml_sentence.getchildren()):
 35 |                     if xml_token.tag=='ns':
 36 |                         if token_index>0 or sentence_index>0: #omit first ns in paragraph
 37 |                             ns=True
 38 |                     elif xml_token.tag=='tok':
 39 |                         token=Token()
 40 |                         token.space_before=not ns
 41 | 
 42 |                         for xml_node in xml_token.getchildren():
 43 |                             if xml_node.tag=='orth':
 44 |                                 orth=xml_node.text
 45 |                                 token.form=orth
 46 |                             elif xml_node.tag=='lex':
 47 |                                 if xml_node.get('disamb')=='1':
 48 |                                     disamb=True
 49 |                                 else:
 50 |                                     disamb=False
 51 | 
 52 |                                 base=xml_node.find('base').text
 53 |                                 ctag=xml_node.find('ctag').text
 54 | 
 55 |                                 form = Form(base, ctag)
 56 |                                 if disamb:
 57 |                                     if token.gold_form is not None:
 58 |                                         logging.warning(f'More than 1 disamb {file_path} {orth}')
 59 |                                     token.gold_form=form
 60 |                                 else:
 61 |                                     token.interpretations.append(form)
 62 |                             elif xml_node.tag=='ann':
 63 |                                 continue
 64 |                             else:
 65 |                                 logging.error('Error 1 {xml_token}')
 66 |                         if token.form:
 67 |                             sentence.add_token(token)
 68 |                         ns=False
 69 |                     else:
 70 |                         logging.error(f'Error 2 {xml_token}')
 71 |             yield paragraph
 72 |             elem.clear()
 73 | 
 74 | 
 75 | def read_jsonl(file_path: str) -> Generator[Paragraph,None,None]:
 76 |     with jsonlines.Reader(file_path) as reader:
 77 |         for obj in reader:
 78 |             a = _list_to_paragraph(obj)
 79 |             yield a
 80 | 
 81 | 
 82 | def _list_to_paragraph(l) -> Paragraph:
 83 |     paragraph = Paragraph()
 84 |     for s in l:
 85 |         sentence = Sentence()
 86 |         paragraph.add_sentence(sentence)
 87 |         for t in s:
 88 |             token = Token()
 89 |             form=t[0]
 90 |             token.form = form
 91 | 
 92 |             # print(t)
 93 |             try:
 94 |                 space=t[1]
 95 |                 token.space_before = (space == 1)
 96 |             except IndexError:
 97 |                 token.space_before = True # ?
 98 | 
 99 |             interpretations = t[2:]
100 |             token.interpretations.extend([Form(base, ctag) for (base, ctag) in interpretations])
101 | 
102 |             sentence.add_token(token)
103 |     return paragraph
104 | 
105 | 
106 | def json_to_objects(data):
107 |     paragraphs = []
108 |     for input_paragraph in data['documents']:
109 |         paragraph = Paragraph()
110 |         paragraphs.append(paragraph)
111 |         for input_sentence in input_paragraph['sentences']:
112 |             sentence = Sentence()
113 |             paragraph.add_sentence(sentence)
114 |             for input_token in input_sentence['tokens']:
115 |                 token = Token()
116 |                 token.form = input_token['form']
117 |                 if len(input_token)>=2:
118 |                     separator=input_token['separator']
119 |                     if separator is not None:
120 |                         token.space_before=separator
121 |                     elif len(input_token)>=4:
122 |                         token.start=input_token['start']
123 |                         token.end = input_token['end']
124 |                         #infer separator before from positions
125 |                         if len(sentence.tokens)==0:
126 |                             token.space_before='space'
127 |                         else:
128 |                             if sentence.tokens[-1].end==token.start:
129 |                                 token.space_before = 'none'
130 |                             else:
131 |                                 token.space_before = 'space'
132 |                 else:
133 |                     token.space_before = 'space'  # TODO ?
134 |                 sentence.add_token(token)
135 |     return paragraphs
136 | 
137 | 
138 | def json_compact_to_objects(data):
139 |     paragraphs = []
140 |     for input_paragraph in data:
141 |         paragraph = Paragraph()
142 |         paragraphs.append(paragraph)
143 |         for input_sentence in input_paragraph:
144 |             sentence = Sentence()
145 |             paragraph.add_sentence(sentence)
146 |             for input_token in input_sentence:
147 |                 token = Token()
148 |                 token.form = input_token[0]
149 |                 if len(input_token) >= 2:
150 |                     separator = input_token[1]
151 |                     if separator is not None:
152 |                         token.space_before = separator
153 |                     elif len(input_token) >= 4:
154 |                         token.start = input_token[2]
155 |                         token.end = input_token[3]
156 |                         # infer separator before from positions
157 |                         if len(sentence.tokens) == 0:
158 |                             token.space_before = 'space'
159 |                         else:
160 |                             if sentence.tokens[-1].end == token.start:
161 |                                 token.space_before = 'none'
162 |                             else:
163 |                                 token.space_before = 'space'
164 |                 else:
165 |                     token.space_before = 'space'  # TODO ?
166 |                 sentence.add_token(token)
167 |     return paragraphs
168 | 
169 | 


--------------------------------------------------------------------------------
/krnnt/serial_pickle.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | from typing import BinaryIO, Iterable
 3 | 
 4 | 
 5 | class SerialPickler:
 6 |     def __init__(self, file: BinaryIO, mode=3):  # don't work with protocol 4
 7 |         self.file = file
 8 |         self.p = pickle.Pickler(file, mode)
 9 | 
10 |     def add(self, obj):
11 |         self.p.dump(obj)
12 |         self.p.memo.clear()
13 | 
14 |     def extend(self, objs: Iterable):
15 |         for obj in objs:
16 |             self.p.dump(obj)
17 |             self.p.memo.clear()
18 | 
19 |     def close(self):
20 |         self.file.close()
21 | 
22 | 
23 | class SerialUnpickler:
24 |     def __init__(self, file: BinaryIO, stop: int=-1, start: int =0, ids: Iterable = None):
25 |         """
26 | 
27 |         :param file:
28 |         :param start: unpickle objects starting from index start
29 |         :param stop: unpickle objects ending with index stop
30 |         :param ids: unpickle objects with indexes in ids
31 |         """
32 |         if ids is None:
33 |             ids = []
34 |         self.file = file
35 |         self.p = pickle.Unpickler(file)
36 |         self.c = 0
37 |         self.stop = stop
38 |         self.start = start
39 |         self.ids = set(ids)
40 | 
41 |     def __iter__(self):
42 |         if self.ids:
43 |             return self.__iter2()
44 |         else:
45 |             return self.__iter1()
46 | 
47 |     def __iter1(self):
48 |         while True:
49 |             try:
50 |                 if self.c == self.stop:
51 |                     break
52 |                 self.c += 1
53 |                 x = self.p.load()
54 |                 if self.c - 1 < self.start:
55 |                     continue
56 | 
57 |                 # print self.c
58 |                 yield x
59 |             except EOFError:
60 |                 break
61 | 
62 |     def __iter2(self):
63 |         while True:
64 |             try:
65 |                 x = self.p.load()
66 |                 if self.c in self.ids:
67 |                     yield x
68 |                 self.c += 1
69 |             except EOFError:
70 |                 break
71 | 
72 | 
73 | def count_samples(path: str) -> int:
74 |     """
75 |     Return number of items in serial pickle file.
76 |     """
77 |     with open(path, 'rb') as file:
78 |         su = SerialUnpickler(file)
79 | 
80 |         count = 0
81 |         for paragraph in su:
82 |             count += 1
83 | 
84 |         return count


--------------------------------------------------------------------------------
/krnnt/structure.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from typing import List
 5 | 
 6 | 
 7 | class Paragraph:
 8 |     sentences: List['Sentences']
 9 | 
10 |     __slots__ = ['sentences', 'concraft']
11 | 
12 |     def __init__(self):
13 |         self.sentences = []
14 | 
15 |     def add_sentence(self, sentence: 'Sentence'):
16 |         self.sentences.append(sentence)
17 | 
18 |     def __iter__(self):
19 |         return self.sentences.__iter__()
20 | 
21 |     def text(self) -> str:
22 |         raw = ''.join([sentence.text() for sentence in self.sentences])
23 |         try:
24 |             if self.sentences[0].tokens[0].space_before:
25 |                 return raw[1:]
26 |             else:
27 |                 return raw
28 |         except:
29 |             return raw
30 | 
31 |     def __str__(self):
32 |         return 'Paragraph([%s])' % ','.join([str(x) for x in self.sentences])
33 | 
34 | 
35 | class Sentence:
36 |     tokens: List['Token']
37 | 
38 |     __slots__ = ['tokens']
39 | 
40 |     def __init__(self):
41 |         self.tokens = []
42 | 
43 |     def add_token(self, token: 'Token'):
44 |         self.tokens.append(token)
45 | 
46 |     def text(self) -> str:
47 |         return ''.join(map(lambda token: ' ' + token.form if token.space_before else token.form, self.tokens))
48 | 
49 |     def __iter__(self):
50 |         return self.tokens.__iter__()
51 | 
52 |     def __str__(self):
53 |         return 'Sentence([%s])' % ','.join([str(x) for x in self.tokens])
54 | 
55 | class Token:
56 |     form: str
57 |     interpretations: List['Form']
58 |     gold_form: 'Form'
59 | 
60 |     __slots__ = ['form', 'space_before', 'interpretations', 'gold_form', 'start', 'end']
61 | 
62 |     def __init__(self):
63 |         self.form = None
64 |         self.space_before = None
65 |         self.interpretations = []
66 |         self.gold_form = None
67 | 
68 |     def add_interpretation(self, interpretation: 'Form'):
69 |         self.interpretations.append(interpretation)
70 | 
71 |     def __str__(self):
72 |         return 'Token(%s, %s, %s, %s)' % (self.form, ','.join([str(x) for x in self.interpretations]), self.space_before, str(self.gold_form))
73 | 
74 | 
75 | class Form:
76 |     def __init__(self, lemma: str, tags: str):
77 |         self.lemma = lemma
78 |         self.tags = tags
79 | 
80 |     def __str__(self):
81 |         return 'Form(%s, %s)' % (self.lemma, self.tags)
82 | 
83 |     def __eq__(self, y):
84 |         return self.lemma == y.lemma and self.tags == y.tags
85 | 
86 |     def __hash__(self):
87 |         return hash((self.lemma, self.tags))
88 | 


--------------------------------------------------------------------------------
/krnnt/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Iterable, List
 2 | 
 3 | import regex
 4 | 
 5 | 
 6 | def unix_uniq(l: str) -> str:
 7 |     packed = []
 8 | 
 9 |     for el in l:
10 |         if not packed or packed[-1] != el:
11 |             packed.append(el)
12 |     return ''.join(packed)
13 | 
14 | 
15 | def uniq(seq: Iterable) -> List:
16 |     seen = set()
17 |     return [x for x in seq if not (x in seen or seen.add(x))]
18 | 
19 | 
20 | def flatten(l: Iterable) -> List:
21 |     return [item for sublist in l for item in sublist]
22 | 
23 | 
24 | def shape(word: str) -> str:  # TODO zredukowac czas
25 |     word = regex.sub(r'(?V1)\p{Lowercase}', 'l', word, flags=regex.U)  # 80%
26 |     word = regex.sub(r'(?V1)\p{Uppercase}', 'u', word, flags=regex.U)
27 |     word = regex.sub(r'\p{gc=Decimal_Number}', 'd', word, flags=regex.U)
28 |     word = regex.sub(r'[^A-Za-z0-9]', 'x', word, flags=regex.LOCALE)
29 |     return unix_uniq(word)


--------------------------------------------------------------------------------
/krnnt/writers.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import json
  3 | import logging
  4 | import sys
  5 | from typing import Callable
  6 | 
  7 | import jsonlines
  8 | 
  9 | 
 10 | def results_to_txt_str(result_paragraphs):
 11 |     result_str = []
 12 |     for paragraph in result_paragraphs:
 13 |         for sentence in paragraph:
 14 |             for i, token in enumerate(sentence):
 15 |                 # print(token['sep'])
 16 |                 if i > 0 and token['sep'] != 'none':
 17 |                     result_str += (' ',)
 18 |                 result_str += (token['token'],)
 19 |             result_str += ("\n",)
 20 |         result_str += ("\n",)
 21 |     return ''.join(result_str)
 22 | 
 23 | 
 24 | def results_to_conll_str(result_paragraphs):
 25 |     result_str = []
 26 |     for paragraph in result_paragraphs:
 27 |         for sentence in paragraph:
 28 |             for token in sentence:
 29 |                 try:
 30 |                     start = token['start']
 31 |                 except KeyError:
 32 |                     start = ''
 33 | 
 34 |                 try:
 35 |                     end = token['end']
 36 |                 except KeyError:
 37 |                     end = ''
 38 | 
 39 |                 result_str += ('%s\t%s\t%s\t%s\t%s\t%s' % (
 40 |                     token['token'], token['lemmas'][0], 0 if token['sep'] == 'none' else 1, token['tag'], start, end),)
 41 |             result_str += ("",)
 42 |         result_str += ("",)
 43 |     return '\n'.join(result_str)
 44 | 
 45 | 
 46 | def results_to_jsonl_str(result_paragraphs):
 47 |     fp = io.StringIO()
 48 |     with jsonlines.Writer(fp) as writer:
 49 |         for paragraph in result_paragraphs:
 50 |             output_paragraph=[]
 51 |             for sentence in paragraph:
 52 |                 ss = [(token['token'], token['lemmas'][0], token['tag']) for token in sentence]
 53 |                 output_paragraph+=(ss,)
 54 |             writer.write(output_paragraph)
 55 |     return fp.getvalue()
 56 | 
 57 | def results_to_json_str(result_paragraphs):
 58 |     return json.dumps(result_paragraphs)
 59 | 
 60 | 
 61 | def results_to_conllu_str(result_paragraphs):
 62 |     result_str = []
 63 |     for paragraph in result_paragraphs:
 64 |         for sentence in paragraph:
 65 |             for i, token in enumerate(sentence):
 66 |                 result_str += ('%s\t%s\t%s\t_\t%s\t_\t_\t_\t_\t_' % (
 67 |                     i + 1, token['token'], token['lemmas'][0], token['tag']),)
 68 |             result_str += ("",)
 69 |         result_str += ("",)
 70 |     return '\n'.join(result_str)
 71 | 
 72 | 
 73 | def results_to_plain_str(result_paragraphs):
 74 |     result_str = []
 75 |     for paragraph in result_paragraphs:
 76 |         for sentence in paragraph:
 77 |             for token in sentence:
 78 |                 result_str += ('%s\t%s' % (token['token'], token['sep']),)
 79 |                 for lemma in token['lemmas']:
 80 |                     result_str += ('\t%s\t%s\tdisamb' % (lemma, token['tag']),)
 81 |             result_str += ("",)
 82 |         result_str += ("",)
 83 |     return '\n'.join(result_str)
 84 | 
 85 | 
 86 | def results_to_xces_str(result_paragraphs):
 87 |     result_str = []
 88 |     result_str += ('<?xml version="1.0" encoding="UTF-8"?>',
 89 |                    '<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">',
 90 |                    '<cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb">',
 91 |                    '<chunkList>')
 92 |     for paragraph in result_paragraphs:
 93 |         result_str += (' <chunk type="p">', )
 94 |         for sentence in paragraph:
 95 |             result_str += ('  <chunk type="s">',)
 96 |             for token in sentence:
 97 |                 if token['sep'] == 'none':
 98 |                     result_str += ('   <ns/>',)
 99 |                 result_str += ('   <tok>',)
100 |                 result_str += ('    <orth>%s</orth>' % escape_xml(token['token']),)
101 |                 for lemma in token['lemmas']:
102 |                     result_str += ('    <lex disamb="1"><base>%s</base><ctag>%s</ctag></lex>' % (escape_xml(lemma),
103 |                                                                                                  token['tag']),)
104 |                 result_str += ('   </tok>',)
105 |             result_str += ('  </chunk>',)
106 |         result_str += (' </chunk>',)
107 | 
108 |     result_str += ('</chunkList>',
109 |                    '</cesAna>')
110 |     return '\n'.join(result_str)
111 | 
112 | 
113 | def escape_xml(s):
114 |     return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace('\'',
115 |                                                                                                             '&apos;')
116 | 
117 | 
118 | def get_output_converter(output_format: str) -> Callable:
119 |     output_format=output_format.lower()
120 |     if output_format == 'xces':
121 |         conversion = results_to_xces_str
122 |     elif output_format == 'plain':
123 |         conversion = results_to_plain_str
124 |     elif output_format in ('conll','tsv'):
125 |         conversion = results_to_conll_str
126 |     elif output_format == 'conllu':
127 |         conversion = results_to_conllu_str
128 |     elif output_format == 'jsonl':
129 |         conversion = results_to_jsonl_str
130 |     elif output_format == 'json':
131 |         conversion = results_to_json_str
132 |     elif output_format in ('txt','text'):
133 |         conversion = results_to_txt_str
134 |     else:
135 |         logging.error('Wrong output format.')
136 |         sys.exit(1)
137 | 
138 |     return conversion


--------------------------------------------------------------------------------
/krnnt_run.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | import logging
  4 | import sys
  5 | 
  6 | from argparse import ArgumentParser
  7 | 
  8 | from krnnt.aglt import remove_aglt_from_results_rule1_3
  9 | from krnnt.blanks import remove_blanks_from_results
 10 | from krnnt.keras_models import BEST
 11 | from krnnt.new import Lemmatisation, Lemmatisation2, get_morfeusz, analyze_tokenized
 12 | from krnnt.pipeline import KRNNTSingle, chunk
 13 | from krnnt.readers import read_xces, read_jsonl
 14 | from krnnt.writers import results_to_jsonl_str, results_to_conll_str, results_to_conllu_str, \
 15 |     results_to_xces_str, results_to_plain_str
 16 | 
 17 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
 18 | 
 19 | 
 20 | if __name__ == '__main__':
 21 |     parser = ArgumentParser(description='Run tagger')
 22 |     parser.add_argument('weight_path', help='path to weights, lemmatisation data and dictionary')
 23 |     parser.add_argument('lemmatisation_data', help='path to lemmatisation data')
 24 |     parser.add_argument('dictionary', help='path to dictionary')
 25 |     parser.add_argument('-p', '--preanalyzed', action='store_false',
 26 |                       default=True, dest='reanalyzed',
 27 |                       help='training data have not been reanalyzed')
 28 |     parser.add_argument('-i', '--input-format', default='xces', dest='input_format',
 29 |                       help='input format of preanalyzed data: xces, jsonl')
 30 |     parser.add_argument('-o', '--output-format',
 31 |                       default='xces', dest='output_format',
 32 |                       help='output format: xces, plain, conll, conllu, jsonl')
 33 |     parser.add_argument('--maca_config',
 34 |                       default='morfeusz2-nkjp',
 35 |                       help='Maca config')
 36 |     parser.add_argument('--toki_config_path',
 37 |                       default='',
 38 |                       help='Toki config path (directory)')
 39 |     parser.add_argument('--lemmatisation',
 40 |                       default='sgjp',
 41 |                       help='lemmatization mode (sgjp, simple)')
 42 |     parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode')  # TODO
 43 |     parser.add_argument('--tokenized', action='store_true',
 44 |                       help='input data are tokenized, but not analyzed')
 45 |     parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
 46 |     parser.add_argument('--chunk_size',
 47 |                         default=100000, type=int,
 48 |                         help='chunk size')
 49 |     parser.add_argument('--remove_aglt', action='store_true')
 50 |     parser.add_argument('--dont_remove_blank', action='store_false')
 51 |     args = parser.parse_args()
 52 | 
 53 |     if args.reproducible:
 54 |         from numpy.random import seed
 55 |         seed(1337)
 56 |         import random as rn
 57 |         rn.seed(1337)
 58 |         import tensorflow as tf
 59 |         session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
 60 |                                       inter_op_parallelism_threads=1)
 61 |         from keras import backend as K
 62 |         tf.set_random_seed(1337)
 63 |         sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
 64 |         K.set_session(sess)
 65 | 
 66 |     pref = {'keras_batch_size': 32, 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
 67 |             'keras_model_class': BEST, 'maca_config':args.maca_config, 'toki_config_path':args.toki_config_path}
 68 | 
 69 |     if args.lemmatisation== 'simple':
 70 |         pref['lemmatisation_class'] = Lemmatisation2
 71 |     else:
 72 |         pref['lemmatisation_class'] = Lemmatisation
 73 | 
 74 |     pref['reanalyze'] = args.reanalyzed
 75 |     # pref['input_format'] = options.input_format
 76 |     pref['output_format'] = args.output_format
 77 | 
 78 |     pref['weight_path'] = args.weight_path
 79 |     pref['lemmatisation_path'] = args.lemmatisation_data
 80 |     pref['UniqueFeaturesValues'] = args.dictionary
 81 | 
 82 |     krnnt = KRNNTSingle(pref)
 83 | #time python3 -m cProfile -o gpu_run_train2.profil  krnnt_run.py ../krnnt/data/weights.hdf5 ../krnnt/data/lemmatisation.pkl ../krnnt/data/dictionary.pkl -o xces  > /tmp/out.xces < ../krnnt-refactor/tests/data/full/train-raw.txt
 84 | 
 85 |     if args.tokenized:
 86 |         if args.input_format == 'jsonl':
 87 |             corpus = read_jsonl(sys.stdin)
 88 |         else:
 89 |             print('Wrong input format.')
 90 |             sys.exit(1)
 91 | 
 92 |         morf=get_morfeusz()
 93 |         corpus = analyze_tokenized(morf, corpus)
 94 |         results = krnnt.tag_paragraphs(corpus, preana=True)
 95 |     elif args.reanalyzed:
 96 |         data=sys.stdin.read().split('\n\n')
 97 |         results=[]
 98 |         for batch in chunk(data, args.chunk_size):
 99 |             results += krnnt.tag_paragraphs(batch) # ['Ala ma kota.', 'Ale nie ma psa.']
100 |             #TODO: print here
101 |     else:
102 |         #f = io.StringIO(sys.stdin.read())
103 |         if args.input_format== 'xces':
104 |             corpus = read_xces(sys.stdin)
105 |         elif args.input_format== 'jsonl':
106 |             corpus = read_jsonl(sys.stdin)
107 |         else:
108 |             print('Wrong input format.')
109 |             sys.exit(1)
110 | 
111 |         results = krnnt.tag_paragraphs(corpus, preana=True)
112 | 
113 |     # print(results)
114 | 
115 |     if args.output_format == 'xces':
116 |         conversion = results_to_xces_str
117 |     elif args.output_format == 'plain':
118 |         conversion = results_to_plain_str
119 |     elif args.output_format == 'conll':
120 |         conversion = results_to_conll_str
121 |     elif args.output_format == 'conllu':
122 |         conversion = results_to_conllu_str
123 |     elif args.output_format == 'jsonl':
124 |         conversion = results_to_jsonl_str
125 |     else:
126 |         print('Wrong output format.')
127 |         sys.exit(1)
128 | 
129 | 
130 |     if args.remove_aglt:
131 |         remove_aglt_from_results_rule1_3(results)
132 | 
133 |     if args.dont_remove_blank:
134 |         remove_blanks_from_results(results)
135 | 
136 |     print(conversion(results), end='')
137 | 


--------------------------------------------------------------------------------
/krnnt_serve.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import sys
  5 | from argparse import ArgumentParser
  6 | 
  7 | from flask import Flask
  8 | from flask import request
  9 | from krnnt.additional_format import additional_format
 10 | from krnnt.aglt import remove_aglt_from_results_rule1_3
 11 | from krnnt.analyzers import MacaAnalyzer
 12 | from krnnt.blanks import remove_blanks_from_results
 13 | from krnnt.keras_models import BEST
 14 | from krnnt.new import Lemmatisation, Lemmatisation2, get_morfeusz, analyze_tokenized
 15 | from krnnt.writers import get_output_converter
 16 | from krnnt.readers import json_to_objects, json_compact_to_objects
 17 | from krnnt.pipeline import KRNNTSingle
 18 | 
 19 | app = Flask(__name__)
 20 | app.config['JSON_AS_ASCII'] = False
 21 | application = app
 22 | 
 23 | global krnntx, conversion, maca_analyzer, morfeusz
 24 | 
 25 | 
 26 | def render(text='', str_results=''):
 27 |     return """
 28 | <html>
 29 | <head>
 30 | <meta charset="utf-8">
 31 | <title>KRNNT</title>
 32 | </head>
 33 | <body>
 34 | <h1>KRNNT: Polish Recurrent Neural Network Tagger</h1>
 35 | <form action="/" method="post">
 36 | <textarea name="text" rows=10 cols=100>%s</textarea><br>
 37 | <input type="submit">
 38 | </form>
 39 | <pre>%s</pre>
 40 | <p>The tagset is described here: <a href="http://nkjp.pl/poliqarp/help/ense2.html">http://nkjp.pl/poliqarp/help/ense2.html</a></p>
 41 | <p>Wróbel Krzysztof, <a href="https://www.researchgate.net/publication/333566748_KRNNT_Polish_Recurrent_Neural_Network_Tagger">KRNNT: Polish Recurrent Neural Network Tagger</a></p>
 42 | <p>Source code: <a href="https://github.com/kwrobel-nlp/krnnt">https://github.com/kwrobel-nlp/krnnt</a></p>
 43 | </body>
 44 | </html>""" % (text, str_results)
 45 | 
 46 | 
 47 | @app.route('/', methods=['GET'])
 48 | def gui():
 49 |     return render()
 50 | 
 51 | 
 52 | @app.route('/', methods=['POST'])
 53 | def tag_raw():
 54 |     request.get_data()
 55 | 
 56 |     input_format = request.args.get('input_format', default=None, type=str)
 57 |     output_format = request.args.get('output_format', default='plain', type=str)
 58 |     remove_aglt = request.args.get('remove_aglt', default='0', type=str)
 59 |     remove_blank = request.args.get('remove_blank', default='1', type=str)
 60 | 
 61 |     conversion2 = get_output_converter(output_format)
 62 | 
 63 |     if remove_aglt!='0':
 64 |         conversionx=conversion2
 65 |         conversion2=lambda x: conversionx(remove_aglt_from_results_rule1_3(x))
 66 | 
 67 |     if remove_blank!='0':
 68 |         conversionx2=conversion2
 69 |         conversion2=lambda x: conversionx2(remove_blanks_from_results(x))
 70 | 
 71 |     if request.is_json:
 72 |         data = request.get_json()
 73 | 
 74 |         if 'docs' in data:
 75 |             return additional_format(data, krnntx, morfeusz)
 76 |         else:
 77 |             if 'documents' in data:
 78 |                 paragraphs = json_to_objects(data)
 79 |             else:
 80 |                 paragraphs = json_compact_to_objects(data)
 81 | 
 82 |             corpus = analyze_tokenized(morfeusz, paragraphs)
 83 |             results = krnntx.tag_paragraphs(corpus, preana=True)
 84 | 
 85 |             return conversion2(results)
 86 |     elif 'text' in request.form:
 87 |         text = request.form['text']
 88 | 
 89 | 
 90 | 
 91 |         results = krnntx.tag_paragraphs([text])  # ['Ala ma kota.', 'Ale nie ma psa.']
 92 |         return render(text, conversion(results))
 93 |     else:
 94 |         text = request.get_data()
 95 | 
 96 |         if input_format == 'lines':
 97 |             data = text.decode('utf-8').split('\n\n') #TODO
 98 |         else:
 99 |             data = [text.decode('utf-8')]
100 | 
101 |         results = krnntx.tag_paragraphs(data)
102 | 
103 |         return conversion2(results)
104 | 
105 | 
106 | @app.route('/tag/', methods=['POST'])
107 | def tag():
108 |     text = request.form['text']
109 |     results = krnntx.tag_sentences(text.split('\n\n'))  # ['Ala ma kota.', 'Ale nie ma psa.']
110 |     return render(text, conversion(results))
111 | 
112 | @app.route('/maca/', methods=['POST'])
113 | def maca():
114 |     text = request.get_data()
115 |     # print(text.decode('utf-8').split('\n\n'))
116 | 
117 |     results = maca_analyzer._maca(text.decode('utf-8').split('\n\n'))
118 |     results = list(results)
119 |     return str(results)
120 | 
121 | 
122 | def main(argv=sys.argv[1:]):
123 |     print(argv)
124 |     global conversion,krnntx,maca_analyzer, morfeusz
125 | 
126 |     parser = ArgumentParser(usage='HTTP Tagger server')
127 |     parser.add_argument('model_path', help='path to directory woth weights, lemmatisation data and dictionary')
128 |     parser.add_argument('-p', '--port',
129 |                         default=9003,
130 |                         help='server port (defaults to 9003)')
131 |     parser.add_argument('-t', '--host',
132 |                         default='0.0.0.0',
133 |                         help='server host (defaults to localhost)')
134 |     parser.add_argument('--maca_config',
135 |                         default='morfeusz-nkjp-official',
136 |                         help='Maca config')
137 |     parser.add_argument('--toki_config_path',
138 |                         default='',
139 |                         help='Toki config path (directory)')
140 |     parser.add_argument('--lemmatisation',
141 |                         default='sgjp',
142 |                         help='lemmatization mode (sgjp, simple)')
143 |     parser.add_argument('-o', '--output-format',
144 |                         default='plain', dest='output_format',
145 |                         help='output format: xces, plain, conll, conllu, jsonl')
146 |     parser.add_argument('-b', '--batch_size',
147 |                         default=32, type=int,
148 |                         help='batch size')
149 |     parser.add_argument('--remove_aglt', action='store_true')
150 |     parser.add_argument('--dont_remove_blank', action='store_false')
151 |     args = parser.parse_args(argv)
152 | 
153 |     pref = {'keras_batch_size': args.batch_size, 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
154 |             'keras_model_class': BEST, 'maca_config': args.maca_config, 'toki_config_path': args.toki_config_path}
155 | 
156 |     if args.lemmatisation == 'simple':
157 |         pref['lemmatisation_class'] = Lemmatisation2
158 |     else:
159 |         pref['lemmatisation_class'] = Lemmatisation
160 | 
161 |     pref['reanalyze'] = True
162 | 
163 |     pref['weight_path'] = args.model_path + "/weights.hdf5"
164 |     pref['lemmatisation_path'] = args.model_path + "/lemmatisation.pkl"
165 |     pref['UniqueFeaturesValues'] = args.model_path + "/dictionary.pkl"
166 | 
167 |     morfeusz = get_morfeusz()
168 |     maca_analyzer = MacaAnalyzer(args.maca_config)
169 |     krnntx = KRNNTSingle(pref)
170 | 
171 |     krnntx.tag_sentences(['Ala'])
172 | 
173 |     conversion= get_output_converter(args.output_format)
174 | 
175 |     if args.remove_aglt:
176 |         conversionx = conversion
177 |         conversion=lambda x: conversionx(remove_aglt_from_results_rule1_3(x))
178 | 
179 |     if args.dont_remove_blank:
180 |         conversionx2 = conversion
181 |         conversion=lambda x: conversionx2(remove_blanks_from_results(x))
182 | 
183 | 
184 |     return app, args.host, args.port
185 | 
186 | 
187 | 
188 | if __name__ == '__main__':
189 |     app,host,port = main()
190 |     # from werkzeug.middleware.profiler import ProfilerMiddleware
191 |     # app.config['PROFILE'] = True
192 |     # app = ProfilerMiddleware(app)
193 |     # app.wsgi_app = ProfilerMiddleware(
194 |     #     app.wsgi_app, profile_dir="."
195 |     # )
196 |     app.run(host=host, port=port, debug=False) # threaded=False on GPU
197 | 
198 | def start(*args, **kwargs):
199 |     app, host, port = main(args)
200 |     return app
201 | 
202 | #gunicorn -b 127.0.0.1:9003 -w 4 -k gevent -t 3600 --threads 4 'krnnt_serve:start("model_data","--maca_config","morfeusz2-nkjp","--toki_config_path","/home/krnnt/")'


--------------------------------------------------------------------------------
/krnnt_train.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from argparse import ArgumentParser
 5 | 
 6 | from krnnt.keras_models import BEST, ExperimentParameters
 7 | from krnnt.new import UnalignedSimpleEvaluator
 8 | from krnnt.tagger_exps import RunFolds2, KerasData, RunExperiment
 9 | 
10 | 
11 | if __name__ == '__main__':
12 |     parser = ArgumentParser()
13 |     parser.add_argument('corpus_path', help='path to corpus')
14 |     parser.add_argument('-p', '--preanalyzed', action='store_false',
15 |                         default=True, dest='reanalyzed',
16 |                         help='training data have not been reanalyzed')
17 |     parser.add_argument('-c', '--cv', action='store_true',
18 |                         default=False, dest='cv',
19 |                         help='run 10-fold cross-validation')
20 |     parser.add_argument('-t', '--train_ratio',
21 |                         default=1.0, dest='train_ratio', type=float,
22 |                         help='percentage of data for training')
23 |     parser.add_argument('-d', '--dev_ratio',
24 |                         default=0.0, dest='dev_ratio', type=float,
25 |                         help='percentage of training data for development')
26 |     parser.add_argument('-e', '--epochs',
27 |                         default=100, dest='epochs', type=int,
28 |                         help='number of epochs')
29 |     parser.add_argument('--patience',
30 |                         default=10, dest='patience', type=int,
31 |                         help='patience')
32 |     parser.add_argument('--maca_config',
33 |                         default='morfeusz-nkjp-official',
34 |                         help='Maca config')
35 |     parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode')  # TODO
36 |     parser.add_argument('--hash', action='store', default=None, dest='hash')
37 |     parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
38 |     parser.add_argument('-f', '--fold', default=None, dest='fold')
39 |     args = parser.parse_args()
40 | 
41 |     if args.reproducible:
42 |         from numpy.random import seed
43 |         seed(1337)
44 |         import random as rn
45 |         rn.seed(1337)
46 |         import tensorflow as tf
47 |         session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
48 |                                       inter_op_parallelism_threads=1)
49 |         from keras import backend as K
50 |         tf.set_random_seed(1337)
51 |         sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
52 |         K.set_session(sess)
53 | 
54 |     pref = {'nb_epoch': 100, 'batch_size': 256,
55 |             'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
56 |             'evaluator': UnalignedSimpleEvaluator, 'patience': 10,
57 |             'weight_path': 'weights.hdf5', 'samples_per_epoch': 10000, 'keras_model_class': BEST,
58 |             'corpus_path': 'data/train-reanalyzed.spickle', 'reanalyze': True, 'train_data_ratio': 0.9,
59 |             'dev_data_ratio': 0.1}
60 | 
61 |     pref['reanalyze'] = args.reanalyzed
62 |     pref['train_data_ratio'] = float(args.train_ratio)
63 |     pref['dev_data_ratio'] = float(args.dev_ratio)
64 |     pref['nb_epoch'] = int(args.epochs)
65 |     pref['corpus_path'] = args.corpus_path
66 |     pref['patience'] = args.patience
67 |     pref['maca_config'] = args.maca_config
68 |     if args.hash is not None:
69 |         pref['h'] = args.hash
70 |     if args.fold is not None:
71 |         pref['fold'] = int(args.fold)
72 | 
73 |     keras_model_class = pref['keras_model_class']
74 | 
75 |     if args.cv:
76 |         rf = RunFolds2(keras_model_class, pref)
77 |         rf.run()
78 |     else:
79 |         parameters = ExperimentParameters(pref)
80 |         km = keras_model_class(parameters)
81 | 
82 |         print('Model will be saved under: %s.final' % parameters.pref['weight_path'])
83 |         print('Lemmatisation model will be saved under: %s' % parameters.pref['lemmatisation_path'])
84 | 
85 |         kd = KerasData(pref['corpus_path'], pref['reanalyze'])
86 |         re = RunExperiment(kd, km)
87 |         re.run()
88 | 
89 |         print('Model is saved under: %s' % parameters.pref['weight_path'])
90 |         print('Lemmatisation model is saved under: %s' % parameters.pref['lemmatisation_path'])
91 |         if pref['reanalyze']:
92 |             print('Dictionary is saved under: %s' % parameters.pref[
93 |                 'corpus_path'] + '_FormatData2_PreprocessData_UniqueFeaturesValues')
94 |         else:
95 |             print('Dictionary is saved under: %s' % parameters.pref[
96 |                 'corpus_path'] + '_FormatDataPreAnalyzed_PreprocessData_UniqueFeaturesValues')
97 | 


--------------------------------------------------------------------------------
/merge_analyzed_gold.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from argparse import ArgumentParser
 4 | 
 5 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
 6 | 
 7 | if __name__ == '__main__':
 8 |     parser = ArgumentParser(
 9 |         description='Combines analyzed corpus with gold. Analyzed corpus must be with gold segmentation.')
10 |     parser.add_argument('gold_path', help='')
11 |     parser.add_argument('analyzed_path', help='')
12 |     parser.add_argument('output_path', help='')
13 |     args = parser.parse_args()
14 | 
15 |     file_path1 = args.gold_path
16 |     file_path2 = args.analyzed_path
17 |     output_path = args.output_path
18 | 
19 |     file1 = open(file_path1, 'rb')
20 |     su_gold = SerialUnpickler(file1)
21 | 
22 |     file2 = open(file_path2, 'rb')
23 |     su_analyzed = SerialUnpickler(file2)
24 | 
25 |     file3 = open(output_path, 'wb')
26 |     sp = SerialPickler(file3)
27 | 
28 |     for paragraph_gold in su_gold:
29 |         for sentence_gold in paragraph_gold:
30 |             paragraph_analyzed = next(su_analyzed.__iter__())
31 |             assert len(paragraph_analyzed.sentences), 1
32 |             sentence_analyzed = paragraph_analyzed.sentences[0]
33 |             assert len(sentence_analyzed.tokens), len(sentence_gold.tokens)
34 |             for token_gold, token_analyzed in zip(sentence_gold, sentence_analyzed):
35 |                 token_gold.interpretations = token_analyzed.interpretations
36 |         sp.add(paragraph_gold)
37 | 
38 |     file3.close()
39 | 


--------------------------------------------------------------------------------
/preprocess_data.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | 
 3 | from tqdm import tqdm
 4 | 
 5 | from krnnt.new import preprocess_paragraph_preanalyzed, \
 6 |     preprocess_paragraph_reanalyzed
 7 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
 8 | from krnnt.structure import Paragraph
 9 | 
10 | if __name__ == '__main__':
11 |     parser = ArgumentParser(description='Create features for neural network.')
12 |     parser.add_argument('input_path', type=str, help='path to re/preanalyzed data')
13 |     parser.add_argument('output_path', type=str, help='save path')
14 |     parser.add_argument('-p', '--preanalyzed', action='store_false',
15 |                         default=True, dest='reanalyzed',
16 |                         help='training data have not been reanalyzed')
17 |     args = parser.parse_args()
18 | 
19 |     file = open(args.input_path, 'rb')
20 |     su = SerialUnpickler(file)
21 | 
22 |     file2 = open(args.output_path, 'wb')
23 |     sp = SerialPickler(file2)
24 | 
25 |     if args.reanalyzed:
26 |         preprocess_method = preprocess_paragraph_reanalyzed
27 |     else:
28 |         preprocess_method = preprocess_paragraph_preanalyzed
29 | 
30 |     paragraph: Paragraph
31 |     for paragraph in tqdm(su, total=18484):
32 |         paragraph_sequence = preprocess_method(paragraph)
33 | 
34 |         sp.add(paragraph_sequence)
35 | 
36 |     file.close()
37 |     file2.close()
38 | 
39 | 


--------------------------------------------------------------------------------
/process_xces.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import glob
 4 | 
 5 | from krnnt.serial_pickle import SerialPickler
 6 | from argparse import ArgumentParser
 7 | 
 8 | from krnnt.readers import read_xces
 9 | 
10 | usage = """%prog CORPUS SAVE_PATH
11 | 
12 | Converts XCES corpus to internal KRNNT representation and saves it to file.
13 | 
14 | E.g. %prog train-analyzed.xml train-analyzed.spickle
15 | """
16 | 
17 | if __name__ == '__main__':
18 |     parser = ArgumentParser(usage="usage")
19 |     parser.add_argument('file_path', type=str, help='path to XCES corpus (or path with wildcard)')
20 |     parser.add_argument('output_path', type=str, help='save path')
21 |     args = parser.parse_args()
22 | 
23 |     with open(args.output_path, 'wb') as file:
24 |         sp = SerialPickler(file)
25 | 
26 |         for path in glob.iglob(args.file_path):
27 |             print(path)
28 |             for paragraph in read_xces(path):
29 |                 sp.add(paragraph)
30 | 


--------------------------------------------------------------------------------
/reanalyze.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | from argparse import ArgumentParser
 4 | 
 5 | from tqdm import tqdm
 6 | 
 7 | from krnnt.aligner import align_paragraphs
 8 | from krnnt.analyzers import MacaAnalyzer
 9 | from krnnt.structure import Paragraph
10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
11 | 
12 | usage = """prog CORPUS_GOLD CORPUS_SAVE
13 | 
14 | Reanalyze corpus with Maca.
15 | 
16 | E.g. prog train-gold.spickle train-reanalyzed.spickle
17 | """
18 | 
19 | if __name__ == '__main__':
20 |     parser = ArgumentParser(usage=usage)
21 |     parser.add_argument('file_path', type=str, help='paths to corpus')
22 |     parser.add_argument('output_path', type=str, help='save path')
23 |     parser.add_argument('--maca_config', default='morfeusz2-nkjp', help='Maca config')
24 |     parser.add_argument('--toki_config_path', default='', help='Toki config path (directory)')
25 |     args = parser.parse_args()
26 | 
27 |     file1 = open(args.file_path, 'rb')
28 |     su_gold = SerialUnpickler(file1)
29 | 
30 |     file2 = open(args.output_path, 'wb')
31 |     sp = SerialPickler(file2)
32 | 
33 |     maca_analyzer = MacaAnalyzer(args.maca_config)
34 | 
35 |     paragraph_gold: Paragraph
36 |     for j, paragraph_gold in tqdm(enumerate(su_gold), total=18484, desc='Morphological analysis'):
37 |         paragraph_raw = paragraph_gold.text()
38 | 
39 |         paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw)
40 | 
41 |         print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph_gold.sentences))
42 | 
43 |         paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph_gold)
44 | 
45 |         sp.add(paragraph_reanalyzed)
46 | 
47 |     file2.close()
48 | 
49 |     # TODO: count mismatched sentences
50 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | Cython
 2 | scikit-learn
 3 | flask
 4 | tqdm
 5 | h5py==2.9.0
 6 | Keras==2.2.4
 7 | numpy==1.16.4
 8 | regex==2019.6.8
 9 | requests==2.22.0
10 | jsonlines==1.2.0
11 | tensorflow-gpu==1.12.0
12 | pytest
13 | gunicorn
14 | git+https://github.com/djstrong/pytest-shell.git#egg=pytest-shell
15 | git+https://github.com/djstrong/krnnt_text_utils.git@cython
16 | pytest-benchmark
17 | 


--------------------------------------------------------------------------------
/run_test.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #check if server is running
 4 | SERVER_STARTED=0
 5 | if [ `ps aux | grep krnnt_serve -c` -eq 1 ]; then
 6 |     echo 'Starting server'
 7 |     ./start_flask_server.sh > /dev/null 2>&1 &
 8 |     PID=$!
 9 |     echo "PID: $PID"
10 |     SERVER_STARTED=1
11 |     sleep 5
12 | fi
13 | 
14 | 
15 | cd tests
16 | python3 -m pytest
17 | 
18 | 
19 | if [ $SERVER_STARTED -eq 1 ]; then
20 |     echo 'Killing server'
21 |     pkill -P "$PID"
22 | fi


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='krnnt',
 4 |       version='1.0.0',
 5 |       description='Part of speech tagger for Polish',
 6 |       url='https://github.com/kwrobel-nlp/krnnt',
 7 |       author='Krzysztof Wróbel',
 8 |       author_email='Wrobel.Krzysztof@gmail.com',
 9 |       packages=['krnnt'],
10 |       license='LGPL',
11 |       python_requires='>=3, <4',
12 |       install_requires=[
13 |           'Cython', 'h5py', 'Keras==2.2.5', 'numpy', 'regex', 'requests', 'jsonlines', 'tqdm', 'flask', 'gunicorn',
14 |           'krnnt_utils @ git+https://github.com/Zhylkaaa/krnnt_text_utils@cython'
15 |       ],
16 |       extras_require={
17 |           'train': ['scikit-learn'],
18 |           'pytest': ['pytest', 'pytest-benchmark',
19 |                      'pytest-shell @ https://api.github.com/repos/djstrong/pytest-shell/tarball/'],
20 |           'tfcpu': ['tensorflow==1.14.0'],
21 |           'tfgpu': ['tensorflow-gpu==1.12.0']
22 |       },
23 |       zip_safe=False)


--------------------------------------------------------------------------------
/shuffle.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import random
 5 | from argparse import ArgumentParser
 6 | 
 7 | from tqdm import tqdm
 8 | 
 9 | from krnnt.structure import Paragraph
10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
11 | 
12 | usage = """%prog CORPUS SAVE_PATH
13 | 
14 | Shuffle training data.
15 | 
16 | E.g. %prog train-merged.spickle train-merged.shuf.spickle
17 | """
18 | 
19 | if __name__ == '__main__':
20 |     parser = ArgumentParser(usage=usage)
21 |     parser.add_argument('file_path', type=str, help='paths to corpus')
22 |     parser.add_argument('output_path', type=str, help='save path')
23 |     parser.add_argument('--seed', '-s', type=int, default=1337, help='seed')
24 |     args = parser.parse_args()
25 | 
26 |     file_path1 = args.file_path
27 |     file_path2 = args.output_path
28 | 
29 |     file = open(file_path1, 'rb')
30 |     su = SerialUnpickler(file)
31 | 
32 |     paragraphs = []
33 |     paragraph: Paragraph
34 |     for paragraph in tqdm(su, desc='Loading', total=18484):
35 |         paragraphs.append(paragraph)
36 |     file.close()
37 | 
38 |     random.seed(args.seed)
39 |     random.shuffle(paragraphs)
40 | 
41 |     file2 = open(file_path2, 'wb')
42 |     sp = SerialPickler(file2)
43 | 
44 |     for paragraph in tqdm(paragraphs, desc='Saving'):
45 |         sp.add(paragraph)
46 | 
47 |     file2.close()
48 | 


--------------------------------------------------------------------------------
/split_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | import math
 4 | from argparse import ArgumentParser
 5 | 
 6 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler, count_samples
 7 | 
 8 | if __name__ == '__main__':
 9 |     parser = ArgumentParser(description='Split data')
10 |     parser.add_argument('input_path', help='input path to data')
11 |     parser.add_argument('output_path1', help='output path to data')
12 |     parser.add_argument('output_path2', help='output path to data')
13 |     parser.add_argument('ratio', type=float, help='ratio of data to write to the first output')
14 | 
15 |     args = parser.parse_args()
16 | 
17 |     num_data = count_samples(args.input_path)
18 |     first_part = math.ceil(num_data * args.ratio)
19 | 
20 |     sp1 = SerialPickler(open(args.output_path1, 'wb'))
21 |     sp2 = SerialPickler(open(args.output_path2, 'wb'))
22 | 
23 |     su = SerialUnpickler(open(args.input_path, 'rb'))
24 |     for i, paragraph in enumerate(su):
25 |         if i < first_part:
26 |             sp1.add(paragraph)
27 |         else:
28 |             sp2.add(paragraph)
29 |     sp1.close()
30 |     sp2.close()
31 | 


--------------------------------------------------------------------------------
/start_flask_server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | PORT=${PORT:-9003}
4 | 
5 | export CUDA_VISIBLE_DEVICES=""
6 | 
7 | python3 krnnt_serve.py model_data --maca_config morfeusz2-nkjp -p $PORT


--------------------------------------------------------------------------------
/start_gunicorn_server.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | PORT=${PORT:-9003}
 4 | WORKERS=${WORKERS:-1}
 5 | 
 6 | echo "Starting server with $WORKERS workers."
 7 | 
 8 | export CUDA_VISIBLE_DEVICES=""
 9 | 
10 | gunicorn -b 0.0.0.0:$PORT -w $WORKERS -k sync -t 3600 --threads 1 'krnnt_serve:start("model_data","--maca_config","morfeusz2-nkjp")'


--------------------------------------------------------------------------------
/tests/benchmark/test_maca.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from krnnt.analyzers import MacaAnalyzer
 4 | 
 5 | paragraph_raw = 'Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.'
 6 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
 7 | MACA_CONFIG2 = 'morfeusz2-nkjp'
 8 | 
 9 | 
10 | @pytest.fixture
11 | def get_maca_wrapper():
12 |     try:
13 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
14 |         list(maca_analyzer._maca_wrapper(paragraph_raw))
15 |     except:
16 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
17 |         list(maca_analyzer._maca_wrapper(paragraph_raw))
18 | 
19 |     return maca_analyzer
20 | 
21 | 
22 | @pytest.fixture
23 | def get_maca_process():
24 |     try:
25 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
26 |         list(maca_analyzer._maca_process(paragraph_raw))
27 |     except:
28 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
29 |         list(maca_analyzer._maca_process(paragraph_raw))
30 | 
31 |     return maca_analyzer
32 | 
33 | 
34 | def analyze_process(maca_analyzer, data):
35 |     results = maca_analyzer._maca_process(data)
36 |     return list(results)
37 | 
38 | 
39 | def analyze_wrapper(maca_analyzer, data):
40 |     results = maca_analyzer._maca_wrapper(data)
41 |     return list(results)
42 | 
43 | 
44 | @pytest.mark.slow
45 | def test_maca_process_speed(benchmark, get_maca_process):
46 |     maca_analyzer = get_maca_process
47 |     benchmark(analyze_process, maca_analyzer, paragraph_raw)
48 | 
49 | 
50 | @pytest.mark.slow
51 | def test_maca_wrapper_speed(benchmark, get_maca_wrapper):
52 |     maca_analyzer = get_maca_wrapper
53 |     benchmark(analyze_wrapper, maca_analyzer, paragraph_raw)
54 | 


--------------------------------------------------------------------------------
/tests/benchmark/test_maca_analyze.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from krnnt.analyzers import MacaAnalyzer
 4 | 
 5 | paragraph_raw = 'Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.'
 6 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
 7 | MACA_CONFIG2 = 'morfeusz2-nkjp'
 8 | 
 9 | 
10 | @pytest.fixture
11 | def get_maca_wrapper():
12 |     try:
13 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
14 |         list(maca_analyzer._maca_wrapper(paragraph_raw))
15 |         maca_analyzer._maca = maca_analyzer._maca_wrapper
16 |     except:
17 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
18 |         list(maca_analyzer._maca_wrapper(paragraph_raw))
19 |         maca_analyzer._maca = maca_analyzer._maca_wrapper
20 | 
21 |     return maca_analyzer
22 | 
23 | 
24 | @pytest.fixture
25 | def get_maca_process():
26 |     try:
27 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
28 |         list(maca_analyzer._maca_process(paragraph_raw))
29 |         maca_analyzer._maca = maca_analyzer._maca_process
30 |     except:
31 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
32 |         list(maca_analyzer._maca_process(paragraph_raw))
33 |         maca_analyzer._maca = maca_analyzer._maca_process
34 | 
35 |     return maca_analyzer
36 | 
37 | 
38 | def analyze_process(maca_analyzer, data):
39 |     results = maca_analyzer.analyze(data)
40 |     return list(results)
41 | 
42 | 
43 | def analyze_wrapper(maca_analyzer, data):
44 |     results = maca_analyzer.analyze(data)
45 |     return list(results)
46 | 
47 | 
48 | @pytest.mark.slow
49 | def test_maca_process_speed(benchmark, get_maca_process):
50 |     maca_analyzer = get_maca_process
51 |     benchmark(analyze_process, maca_analyzer, paragraph_raw)
52 | 
53 | 
54 | @pytest.mark.slow
55 | def test_maca_wrapper_speed(benchmark, get_maca_wrapper):
56 |     maca_analyzer = get_maca_wrapper
57 |     benchmark(analyze_wrapper, maca_analyzer, paragraph_raw)
58 | 


--------------------------------------------------------------------------------
/tests/benchmark/test_shape.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from krnnt.utils import shape
 4 | import krnnt_utils
 5 | 
 6 | @pytest.fixture
 7 | def word():
 8 |     return "ljhbasjk8f5IYTVIGHVaisftityvfiouyfO*86f97f697"
 9 | 
10 | @pytest.mark.slow
11 | def test_shape_regex(word, benchmark):
12 |     benchmark(shape,word)
13 | 
14 | @pytest.mark.slow
15 | def test_shape_cython(word, benchmark):
16 |     benchmark(krnnt_utils.shape,word)


--------------------------------------------------------------------------------
/tests/benchmark/test_tags.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from krnnt.features import TagsPreprocessor, TagsPreprocessorCython
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def tags():
 8 |     return ['fin:sg:ter:imperf', 'subst:sg:nom:f']
 9 | 
10 | 
11 | @pytest.mark.slow
12 | def test_tags4(tags, benchmark):
13 |     benchmark(TagsPreprocessor.create_tags4_without_guesser, tags)
14 | 
15 | 
16 | @pytest.mark.slow
17 | def test_tags4_cython(tags, benchmark):
18 |     benchmark(TagsPreprocessorCython.create_tags4_without_guesser, tags)
19 | 
20 | 
21 | @pytest.mark.slow
22 | def test_tags5(tags, benchmark):
23 |     benchmark(TagsPreprocessor.create_tags5_without_guesser, tags)
24 | 
25 | 
26 | @pytest.mark.slow
27 | def test_tags5_cython(tags, benchmark):
28 |     benchmark(TagsPreprocessorCython.create_tags5_without_guesser, tags)
29 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | 
4 | @pytest.fixture
5 | def rootdir():
6 |     return os.path.dirname(os.path.abspath(__file__))


--------------------------------------------------------------------------------
/tests/data/reference/gold-task-c_evaluation.txt:
--------------------------------------------------------------------------------
 1 | ### FOLD  1: tests/data/small/gold-task-c.xml (tag) v. /tmp/out.xces (ref)
 2 | PolEval 2017 competition scores
 3 | -------------------------------
 4 | POS accuracy (Subtask A score): 	33.1343%
 5 | POS accuracy (known words): 	33.1343%
 6 | POS accuracy (unknown words): 	0.0000%
 7 | Lemmatization accuracy (Subtask B score): 	51.3433%
 8 | Lemmatization accuracy (known words): 	51.3433%
 9 | Lemmatization accuracy (unknown words): 	0.0000%
10 | Overall accuracy (Subtask C score): 	42.2388%
11 | ----
12 | REF-toks	335
13 | KN	100.0000%
14 | KN_POS_SC_LOWER	53.1343%
15 | KN_SC_LOWER	33.1343%
16 | KN_SEG_CHANGE	0.8955%
17 | KN_SL_LOWER	51.3433%
18 | KN_WC_LOWER	34.0299%
19 | POS_SC_LOWER	53.1343%
20 | POS_WC_LOWER	53.1343%
21 | SC_LOWER	33.1343%
22 | SEG_CHANGE	0.8955%
23 | SEG_NOCHANGE	99.1045%
24 | SL_CASE_CAT_HEUR	51.3433%
25 | SL_LOWER	51.3433%
26 | SL_NOCASE_CAT_HEUR	54.3284%
27 | SL_NOCASE_LOWER	54.3284%
28 | UNK	0.0000%
29 | UNK_POS_SC_LOWER	0.0000%
30 | UNK_SC_LOWER	0.0000%
31 | UNK_SEG_CHANGE	0.0000%
32 | UNK_SL_LOWER	0.0000%
33 | UNK_WC_LOWER	0.0000%
34 | WC_LOWER	34.0299%
35 | WL_LOWER	51.3433%
36 | WC_UPPER	34.9254%
37 | AVG weak lemma lower bound	51.3433%
38 | AVG KN strong lemma lower bound	51.3433%
39 | AVG UNK strong lemma lower bound	0.0000%
40 | AVG strong lemma lower bound	51.3433%
41 | AVG strong lemma nocase lower bound	54.3284%
42 | AVG strong lemma case concat heur	51.3433%
43 | AVG strong lemma nocase concat heur	54.3284%
44 | AVG weak corr lower bound	34.0299%
45 | AVG weak corr upper bound	34.9254%
46 | AVG UNK weak corr lower bound	0.0000%
47 | AVG UNK weak corr upper bound	0.0000%
48 | AVG KN  weak corr lower bound	34.0299%
49 | AVG KN  weak corr upper bound	34.9254%
50 | AVG POS strong corr lower bound	53.1343%
51 | AVG percentage UNK	0.0000%
52 | AVG percentage seg change	0.8955%
53 | 


--------------------------------------------------------------------------------
/tests/data/reference/in_raw.txt:
--------------------------------------------------------------------------------
1 | Lubię placki. Ala ma kota.
2 | 
3 | Raz dwa trzy.


--------------------------------------------------------------------------------
/tests/data/reference/lemmatisation_test.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/lemmatisation_test.pkl


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.spickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.spickle


--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2.spickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2.spickle


--------------------------------------------------------------------------------
/tests/data/reference/out.conll:
--------------------------------------------------------------------------------
 1 | Lubię	Lubię	1	adj:pl:nom:m1:pos	0	5
 2 | placki	placka	1	subst:pl:acc:f	6	12
 3 | .	.	0	interp	12	13
 4 | 
 5 | Ala	Ala	1	subst:sg:nom:f	14	17
 6 | ma	ma	1	subst:sg:nom:f	18	20
 7 | kota	kota	1	subst:sg:nom:f	21	25
 8 | .	.	0	interp	25	26
 9 | 
10 | 
11 | Raz	Raz	1	subst:sg:nom:f	0	3
12 | dwa	dwa	1	adj:pl:acc:f:pos	4	7
13 | trzy	trzy	1	subst:pl:acc:f	8	12
14 | .	.	0	interp	12	13
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/tests/data/reference/out.conllu:
--------------------------------------------------------------------------------
 1 | 1	Lubię	Lubię	_	adj:pl:nom:m1:pos	_	_	_	_	_
 2 | 2	placki	placka	_	subst:pl:acc:f	_	_	_	_	_
 3 | 3	.	.	_	interp	_	_	_	_	_
 4 | 
 5 | 1	Ala	Ala	_	subst:sg:nom:f	_	_	_	_	_
 6 | 2	ma	ma	_	subst:sg:nom:f	_	_	_	_	_
 7 | 3	kota	kota	_	subst:sg:nom:f	_	_	_	_	_
 8 | 4	.	.	_	interp	_	_	_	_	_
 9 | 
10 | 
11 | 1	Raz	Raz	_	subst:sg:nom:f	_	_	_	_	_
12 | 2	dwa	dwa	_	adj:pl:acc:f:pos	_	_	_	_	_
13 | 3	trzy	trzy	_	subst:pl:acc:f	_	_	_	_	_
14 | 4	.	.	_	interp	_	_	_	_	_
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/tests/data/reference/out.jsonl:
--------------------------------------------------------------------------------
1 | [[["Lubię", "Lubię", "adj:pl:nom:m1:pos"], ["placki", "placka", "subst:pl:acc:f"], [".", ".", "interp"]], [["Ala", "Ala", "subst:sg:nom:f"], ["ma", "ma", "subst:sg:nom:f"], ["kota", "kota", "subst:sg:nom:f"], [".", ".", "interp"]]]
2 | [[["Raz", "Raz", "subst:sg:nom:f"], ["dwa", "dwa", "adj:pl:acc:f:pos"], ["trzy", "trzy", "subst:pl:acc:f"], [".", ".", "interp"]]]
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/data/reference/out.plain:
--------------------------------------------------------------------------------
 1 | Lubię	newline
 2 | 	Lubię	adj:pl:nom:m1:pos	disamb
 3 | placki	space
 4 | 	placka	subst:pl:acc:f	disamb
 5 | .	none
 6 | 	.	interp	disamb
 7 | 
 8 | Ala	space
 9 | 	Ala	subst:sg:nom:f	disamb
10 | ma	space
11 | 	ma	subst:sg:nom:f	disamb
12 | kota	space
13 | 	kota	subst:sg:nom:f	disamb
14 | .	none
15 | 	.	interp	disamb
16 | 
17 | 
18 | Raz	newline
19 | 	Raz	subst:sg:nom:f	disamb
20 | dwa	space
21 | 	dwa	adj:pl:acc:f:pos	disamb
22 | trzy	space
23 | 	trzy	subst:pl:acc:f	disamb
24 | .	none
25 | 	.	interp	disamb
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/tests/data/reference/out.xces:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">
 3 | <cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb">
 4 | <chunkList>
 5 |  <chunk type="p">
 6 |   <chunk type="s">
 7 |    <tok>
 8 |     <orth>Lubię</orth>
 9 |     <lex disamb="1"><base>Lubię</base><ctag>adj:pl:nom:m1:pos</ctag></lex>
10 |    </tok>
11 |    <tok>
12 |     <orth>placki</orth>
13 |     <lex disamb="1"><base>placka</base><ctag>subst:pl:acc:f</ctag></lex>
14 |    </tok>
15 |    <ns/>
16 |    <tok>
17 |     <orth>.</orth>
18 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
19 |    </tok>
20 |   </chunk>
21 |   <chunk type="s">
22 |    <tok>
23 |     <orth>Ala</orth>
24 |     <lex disamb="1"><base>Ala</base><ctag>subst:sg:nom:f</ctag></lex>
25 |    </tok>
26 |    <tok>
27 |     <orth>ma</orth>
28 |     <lex disamb="1"><base>ma</base><ctag>subst:sg:nom:f</ctag></lex>
29 |    </tok>
30 |    <tok>
31 |     <orth>kota</orth>
32 |     <lex disamb="1"><base>kota</base><ctag>subst:sg:nom:f</ctag></lex>
33 |    </tok>
34 |    <ns/>
35 |    <tok>
36 |     <orth>.</orth>
37 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
38 |    </tok>
39 |   </chunk>
40 |  </chunk>
41 |  <chunk type="p">
42 |   <chunk type="s">
43 |    <tok>
44 |     <orth>Raz</orth>
45 |     <lex disamb="1"><base>Raz</base><ctag>subst:sg:nom:f</ctag></lex>
46 |    </tok>
47 |    <tok>
48 |     <orth>dwa</orth>
49 |     <lex disamb="1"><base>dwa</base><ctag>adj:pl:acc:f:pos</ctag></lex>
50 |    </tok>
51 |    <tok>
52 |     <orth>trzy</orth>
53 |     <lex disamb="1"><base>trzy</base><ctag>subst:pl:acc:f</ctag></lex>
54 |    </tok>
55 |    <ns/>
56 |    <tok>
57 |     <orth>.</orth>
58 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
59 |    </tok>
60 |   </chunk>
61 |  </chunk>
62 | </chunkList>
63 | </cesAna>
64 | 


--------------------------------------------------------------------------------
/tests/data/reference/weight_test.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5


--------------------------------------------------------------------------------
/tests/data/reference/weight_test.hdf5.final:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5.final


--------------------------------------------------------------------------------
/tests/data/reference/weight_test.hdf5.new:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5.new


--------------------------------------------------------------------------------
/tests/data/server/in_raw.txt:
--------------------------------------------------------------------------------
1 | Lubię placki. Ala ma kota.
2 | 
3 | Raz dwa trzy.


--------------------------------------------------------------------------------
/tests/data/server/in_tokenized.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "documents": [
 3 |     {
 4 |       "text": "Lubię placki. Ala ma kota.",
 5 |       "sentences": [
 6 |         {
 7 |           "tokens": [
 8 |             {
 9 |               "form": "Lubię",
10 |               "separator": "newline",
11 |               "start": 0,
12 |               "end": 0
13 |             },
14 |             {
15 |               "form": "placki",
16 |               "separator": "space",
17 |               "start": 0,
18 |               "end": 0
19 |             },
20 |             {
21 |               "form": ".",
22 |               "separator": "none",
23 |               "start": 0,
24 |               "end": 0
25 |             }
26 |           ]
27 |         },
28 |         {
29 |           "tokens": [
30 |             {
31 |               "form": "Ala",
32 |               "separator": "space",
33 |               "start": 0,
34 |               "end": 0
35 |             },
36 |             {
37 |               "form": "ma",
38 |               "separator": "space",
39 |               "start": 0,
40 |               "end": 0
41 |             },
42 |             {
43 |               "form": "kota",
44 |               "separator": "space",
45 |               "start": 0,
46 |               "end": 0
47 |             },
48 |             {
49 |               "form": ".",
50 |               "separator": "none",
51 |               "start": 0,
52 |               "end": 0
53 |             }
54 |           ]
55 |         }
56 |       ]
57 |     },
58 |     {
59 |       "text": "Raz dwa trzy.",
60 |       "sentences": [
61 |         {
62 |           "tokens": [
63 |             {
64 |               "form": "Raz",
65 |               "separator": "newline",
66 |               "start": 0,
67 |               "end": 0
68 |             },
69 |             {
70 |               "form": "dwa",
71 |               "separator": "space",
72 |               "start": 0,
73 |               "end": 0
74 |             },
75 |             {
76 |               "form": "trzy",
77 |               "separator": "space",
78 |               "start": 0,
79 |               "end": 0
80 |             },
81 |             {
82 |               "form": ".",
83 |               "separator": "none",
84 |               "start": 0,
85 |               "end": 0
86 |             }
87 |           ]
88 |         }
89 |       ]
90 |     }
91 |   ]
92 | }


--------------------------------------------------------------------------------
/tests/data/server/in_tokenized_compact.json:
--------------------------------------------------------------------------------
1 | [
2 |   [
3 |     [["Lubię","newline"],["placki","space"],[".","none"]],
4 |     [["Ala","space"],["ma","space"],["kota","space"],[".","none"]]
5 |   ],
6 |   [
7 |     [["Raz","newline"],["dwa","space"],["trzy","space"],[".","none"]]
8 |   ]
9 | ]


--------------------------------------------------------------------------------
/tests/data/server/out_raw.conll:
--------------------------------------------------------------------------------
 1 | Lubię	lubić	1	fin:sg:pri:imperf	0	5
 2 | placki	placek	1	subst:pl:acc:m3	6	12
 3 | .	.	0	interp	12	13
 4 | 
 5 | Ala	Ala	1	subst:sg:nom:f	14	17
 6 | ma	mieć	1	fin:sg:ter:imperf	18	20
 7 | kota	kot	1	subst:sg:acc:m2	21	25
 8 | .	.	0	interp	25	26
 9 | 
10 | 
11 | Raz	raz	1	subst:sg:nom:m3	0	3
12 | dwa	dwa	1	num:pl:nom:m3:congr	4	7
13 | trzy	trzy	1	num:pl:nom:m3:congr	8	12
14 | .	.	0	interp	12	13
15 | 
16 | 


--------------------------------------------------------------------------------
/tests/data/server/out_raw.conllu:
--------------------------------------------------------------------------------
 1 | 1	Lubię	lubić	_	fin:sg:pri:imperf	_	_	_	_	_
 2 | 2	placki	placek	_	subst:pl:acc:m3	_	_	_	_	_
 3 | 3	.	.	_	interp	_	_	_	_	_
 4 | 
 5 | 1	Ala	Ala	_	subst:sg:nom:f	_	_	_	_	_
 6 | 2	ma	mieć	_	fin:sg:ter:imperf	_	_	_	_	_
 7 | 3	kota	kot	_	subst:sg:acc:m2	_	_	_	_	_
 8 | 4	.	.	_	interp	_	_	_	_	_
 9 | 
10 | 
11 | 1	Raz	raz	_	subst:sg:nom:m3	_	_	_	_	_
12 | 2	dwa	dwa	_	num:pl:nom:m3:congr	_	_	_	_	_
13 | 3	trzy	trzy	_	num:pl:nom:m3:congr	_	_	_	_	_
14 | 4	.	.	_	interp	_	_	_	_	_
15 | 
16 | 


--------------------------------------------------------------------------------
/tests/data/server/out_raw.jsonl:
--------------------------------------------------------------------------------
1 | [[["Lubię", "lubić", "fin:sg:pri:imperf"], ["placki", "placek", "subst:pl:acc:m3"], [".", ".", "interp"]], [["Ala", "Ala", "subst:sg:nom:f"], ["ma", "mieć", "fin:sg:ter:imperf"], ["kota", "kot", "subst:sg:acc:m2"], [".", ".", "interp"]]]
2 | [[["Raz", "raz", "subst:sg:nom:m3"], ["dwa", "dwa", "num:pl:nom:m3:congr"], ["trzy", "trzy", "num:pl:nom:m3:congr"], [".", ".", "interp"]]]
3 | 


--------------------------------------------------------------------------------
/tests/data/server/out_raw.plain:
--------------------------------------------------------------------------------
 1 | Lubię	newline
 2 | 	lubić	fin:sg:pri:imperf	disamb
 3 | placki	space
 4 | 	placek	subst:pl:acc:m3	disamb
 5 | .	none
 6 | 	.	interp	disamb
 7 | 
 8 | Ala	space
 9 | 	Ala	subst:sg:nom:f	disamb
10 | ma	space
11 | 	mieć	fin:sg:ter:imperf	disamb
12 | kota	space
13 | 	kot	subst:sg:acc:m2	disamb
14 | .	none
15 | 	.	interp	disamb
16 | 
17 | Raz	newline
18 | 	raz	subst:sg:nom:m3	disamb
19 | dwa	space
20 | 	dwa	num:pl:nom:m3:congr	disamb
21 | trzy	space
22 | 	trzy	num:pl:nom:m3:congr	disamb
23 | .	none
24 | 	.	interp	disamb
25 | 
26 | 


--------------------------------------------------------------------------------
/tests/data/server/out_raw.xces:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">
 3 | <cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb">
 4 | <chunkList>
 5 |  <chunk type="p">
 6 |   <chunk type="s">
 7 |    <tok>
 8 |     <orth>Lubię</orth>
 9 |     <lex disamb="1"><base>lubić</base><ctag>fin:sg:pri:imperf</ctag></lex>
10 |    </tok>
11 |    <tok>
12 |     <orth>placki</orth>
13 |     <lex disamb="1"><base>placek</base><ctag>subst:pl:acc:m3</ctag></lex>
14 |    </tok>
15 |    <ns/>
16 |    <tok>
17 |     <orth>.</orth>
18 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
19 |    </tok>
20 |   </chunk>
21 |   <chunk type="s">
22 |    <tok>
23 |     <orth>Ala</orth>
24 |     <lex disamb="1"><base>Ala</base><ctag>subst:sg:nom:f</ctag></lex>
25 |    </tok>
26 |    <tok>
27 |     <orth>ma</orth>
28 |     <lex disamb="1"><base>mieć</base><ctag>fin:sg:ter:imperf</ctag></lex>
29 |    </tok>
30 |    <tok>
31 |     <orth>kota</orth>
32 |     <lex disamb="1"><base>kot</base><ctag>subst:sg:acc:m2</ctag></lex>
33 |    </tok>
34 |    <ns/>
35 |    <tok>
36 |     <orth>.</orth>
37 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
38 |    </tok>
39 |   </chunk>
40 |  </chunk>
41 |  <chunk type="p">
42 |   <chunk type="s">
43 |    <tok>
44 |     <orth>Raz</orth>
45 |     <lex disamb="1"><base>raz</base><ctag>subst:sg:nom:m3</ctag></lex>
46 |    </tok>
47 |    <tok>
48 |     <orth>dwa</orth>
49 |     <lex disamb="1"><base>dwa</base><ctag>num:pl:nom:m3:congr</ctag></lex>
50 |    </tok>
51 |    <tok>
52 |     <orth>trzy</orth>
53 |     <lex disamb="1"><base>trzy</base><ctag>num:pl:nom:m3:congr</ctag></lex>
54 |    </tok>
55 |    <ns/>
56 |    <tok>
57 |     <orth>.</orth>
58 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
59 |    </tok>
60 |   </chunk>
61 |  </chunk>
62 | </chunkList>
63 | </cesAna>


--------------------------------------------------------------------------------
/tests/data/small/00132482.ann.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE chunkList SYSTEM "ccl.dtd">
 3 | <chunkList>
 4 |  <chunk id="ch1">
 5 |   <sentence id="sent1">
 6 |    <tok>
 7 |     <orth>Gdzie</orth>
 8 |     <lex disamb="1"><base>gdzie</base><ctag>adv</ctag></lex>
 9 |    </tok>
10 |    <tok>
11 |     <orth>kupiła</orth>
12 |     <lex disamb="1"><base>kupić</base><ctag>praet:sg:f:perf</ctag></lex>
13 |    </tok>
14 |    <ns/>
15 |    <tok>
16 |     <orth>ś</orth>
17 |     <lex disamb="1"><base>być</base><ctag>aglt:sg:sec:imperf:nwok</ctag></lex>
18 |    </tok>
19 |    <tok>
20 |     <orth>łańcuszek</orth>
21 |     <lex disamb="1"><base>łańcuszek</base><ctag>subst:sg:acc:m3</ctag></lex>
22 |    </tok>
23 |    <ns/>
24 |    <tok>
25 |     <orth>?</orth>
26 |     <lex disamb="1"><base>?</base><ctag>interp</ctag></lex>
27 |    </tok>
28 |   </sentence>
29 |   <sentence id="sent2">
30 |    <tok>
31 |     <orth>:</orth>
32 |     <lex disamb="1"><base>:)</base><ctag>interj</ctag></lex>
33 |    </tok>
34 |    <ns/>
35 |    <tok>
36 |     <orth>)</orth>
37 |     <lex disamb="1"><base>)</base><ctag>blank</ctag></lex>
38 |    </tok>
39 |   </sentence>
40 |  </chunk>
41 | </chunkList>


--------------------------------------------------------------------------------
/tests/data/small/gold-task-c.txt:
--------------------------------------------------------------------------------
1 | Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.
2 | Przestrzeń dzielącą je od kolejnego skłonu schodów pokonało, kolebiąc się na boki, rozkołysanym kaczym chodem. Najdziwniejsze jednak było to, co nastąpiło potem. Jego wspinanie się na stopień. Mianowicie najpierw przed nim stanęło, niemal doń przywarło. Samo zresztą było niewiele od niego wyższe. A potem z olbrzymim wysiłkiem zaczęło się nań wspinać, a kiedy betonowa krawędź była już w połowie jego wysokości, ostrożnie się pochylając powoli przeważyło ciężar ciała na poziomą płaszczyznę stopnia. Jakby nie mogło się zginać, jakby kręgosłup miało całkiem zesztywniały. W końcu udało się. Z lekkim stukotem opadło na brzuch. Leżąc tak, wydało z siebie właśnie to jedyne w swoim rodzaju cichutkie jęknięcie. Osiągnąwszy tę fazę wspinaczki, przeszło po chwili do następnego etapu. Ciągle leżąc, zaczęło się czołgać dalej, aż do chwili kiedy środek ciężkości, w ogóle całe ciało, całkowicie znalazło się na stopniu.


--------------------------------------------------------------------------------
/tests/download_model.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | cd ..
 3 | mkdir model_data -p
 4 | cd model_data
 5 | 
 6 | if [ ! -f "weights.hdf5" ]; then
 7 |     wget "https://github.com/kwrobel-nlp/krnnt/releases/download/poleval/reanalyze_150epochs_train1.0.zip"
 8 |     unzip reanalyze_150epochs_train1.0.zip
 9 |     mv lemmatisation_reana150_1.0.pkl lemmatisation.pkl
10 |     mv weights_reana150_1.0.hdf5 weights.hdf5
11 | fi


--------------------------------------------------------------------------------
/tests/test.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #TODO pytest-shell
 4 | 
 5 | #version of morfeusz dictionary may influence results
 6 | 
 7 | MACA_CONFIG=morfeusz2-nkjp
 8 | 
 9 | cd ..
10 | 
11 | python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle
12 | echo $?
13 | diff /tmp/nkjp.spickle tests/data/reference/nkjp1m-1.2.spickle
14 | 
15 | python3 reanalyze.py --maca_config $MACA_CONFIG /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle
16 | echo $?
17 | diff /tmp/nkjp-reanalyzed.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.spickle
18 | 
19 | python3 shuffle.py /tmp/nkjp-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle
20 | echo $?
21 | diff /tmp/nkjp-reanalyzed.shuf.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle
22 | 
23 | rm /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues
24 | CUDA_VISIBLE_DEVICES="" PYTHONHASHSEED=0 python3 krnnt_train.py --maca_config $MACA_CONFIG /tmp/nkjp-reanalyzed.shuf.spickle -e 2 --reproducible --hash test
25 | echo $?
26 | h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5
27 | h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final
28 | diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl
29 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2
30 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData
31 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues
32 | 
33 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces
34 | echo $?
35 | diff /tmp/out.xces tests/data/reference/out.xces
36 | 
37 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o plain > /tmp/out.plain
38 | echo $?
39 | diff /tmp/out.plain tests/data/reference/out.plain
40 | 
41 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conll > /tmp/out.conll
42 | echo $?
43 | diff /tmp/out.conll tests/data/reference/out.conll
44 | 
45 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conllu > /tmp/out.conllu
46 | echo $?
47 | diff /tmp/out.conllu tests/data/reference/out.conllu
48 | 
49 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o jsonl > /tmp/out.jsonl
50 | echo $?
51 | diff /tmp/out.jsonl tests/data/reference/out.jsonl
52 | 


--------------------------------------------------------------------------------
/tests/test_aglt.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | 
  3 | from krnnt.aglt import rewrite_praet, remove_aglt, rule1, rule3, rule1b
  4 | 
  5 | paragraph = [
  6 |     [
  7 |         {'token': 'Zrobił', 'sep': 'newline', 'tag': 'praet:sg:m1:perf',
  8 |          'lemmas': ['zrobić'], 'start': 0, 'end': 6},
  9 |         {'token': 'by', 'sep': 'none', 'tag': 'qub', 'lemmas': ['by'],
 10 |          'start': 6, 'end': 8},
 11 |         {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok',
 12 |          'lemmas': ['być'], 'start': 8, 'end': 9},
 13 |         {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
 14 |          'lemmas': ['to'], 'start': 10, 'end': 12},
 15 |         {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'],
 16 |          'start': 12, 'end': 13}
 17 |     ],
 18 |     [
 19 |         {'token': 'Czy', 'sep': 'space', 'tag': 'qub', 'lemmas': ['czy'],
 20 |          'start': 14, 'end': 17},
 21 |         {'token': 'by', 'sep': 'space', 'tag': 'qub', 'lemmas': ['by'],
 22 |          'start': 18, 'end': 20},
 23 |         {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok',
 24 |          'lemmas': ['być'], 'start': 20, 'end': 21},
 25 |         {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
 26 |          'lemmas': ['to'], 'start': 22, 'end': 24},
 27 |         {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
 28 |          'lemmas': ['zrobić'], 'start': 25, 'end': 31},
 29 |         {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'],
 30 |          'start': 31, 'end': 32}
 31 |     ],
 32 |     [
 33 |         {'token': 'Zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
 34 |          'lemmas': ['zrobić'], 'start': 33, 'end': 39},
 35 |         {'token': 'em', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:wok',
 36 |          'lemmas': ['być'], 'start': 39, 'end': 41},
 37 |         {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
 38 |          'lemmas': ['to'], 'start': 42, 'end': 44},
 39 |         {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'],
 40 |          'start': 44, 'end': 45}
 41 |     ],
 42 |     [
 43 |         {'token': 'Aby', 'sep': 'space', 'tag': 'comp', 'lemmas': ['aby'],
 44 |          'start': 46, 'end': 49},
 45 |         {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok',
 46 |          'lemmas': ['być'], 'start': 49, 'end': 50},
 47 |         {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
 48 |          'lemmas': ['to'], 'start': 51, 'end': 53},
 49 |         {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
 50 |          'lemmas': ['zrobić'], 'start': 54, 'end': 60},
 51 |         {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'],
 52 |          'start': 60, 'end': 61}
 53 |     ],
 54 |     [
 55 |         {'token': 'Zrobił', 'sep': 'newline', 'tag': 'praet:sg:m1:perf',
 56 |          'lemmas': ['zrobić'], 'start': 0, 'end': 6},
 57 |         {'token': 'by', 'sep': 'none', 'tag': 'qub', 'lemmas': ['by'],
 58 |          'start': 6, 'end': 8},
 59 |         {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
 60 |          'lemmas': ['to'], 'start': 9, 'end': 11},
 61 |         {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'],
 62 |          'start': 11, 'end': 12}
 63 |     ],
 64 | [
 65 |         {'token': 'Czy', 'sep': 'space', 'tag': 'qub', 'lemmas': ['czy'],
 66 |          'start': 14, 'end': 17},
 67 |         {'token': 'by', 'sep': 'space', 'tag': 'qub', 'lemmas': ['by'],
 68 |          'start': 18, 'end': 20},
 69 |         {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
 70 |          'lemmas': ['to'], 'start': 21, 'end': 23},
 71 |         {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
 72 |          'lemmas': ['zrobić'], 'start': 24, 'end': 30},
 73 |         {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'],
 74 |          'start': 30, 'end': 31}
 75 |     ]
 76 | ]
 77 | 
 78 | 
 79 | def test_rewrite_praet():
 80 |     sentence1 = copy.deepcopy(paragraph[2])
 81 | 
 82 |     rewrite_praet(sentence1[1], sentence1[0])
 83 |     assert sentence1[0]['tag'] == 'praet:sg:m1:pri:perf'
 84 | 
 85 | 
 86 | def test_rewrite_cond():
 87 |     sentence1 = copy.deepcopy(paragraph[0])
 88 |     rewrite_praet(sentence1[2], sentence1[0], sentence1[1])
 89 |     assert sentence1[0]['tag'] == 'cond:sg:m1:pri:perf'
 90 | 
 91 | def test_rewrite_cond2():
 92 |     sentence1 = copy.deepcopy(paragraph[4])
 93 |     rewrite_praet(None, sentence1[0], sentence1[1])
 94 |     assert sentence1[0]['tag'] == 'cond:sg:m1:ter:perf'
 95 | 
 96 | def test_rule1_cond():
 97 |     sentence1 = copy.deepcopy(paragraph[0])
 98 | 
 99 |     remove_aglt(sentence1, [rule1])
100 |     print(sentence1)
101 |     assert sentence1[0]['tag'] == 'cond:sg:m1:pri:perf'
102 |     assert sentence1[1]['token'] != 'by'
103 |     assert sentence1[2]['token'] != 'm'
104 |     assert sentence1[0]['token'] == 'Zrobiłbym'
105 |     assert sentence1[0]['end'] == 9
106 | 
107 | 
108 | def test_rule1_praet():
109 |     sentence1 = copy.deepcopy(paragraph[2])
110 | 
111 |     remove_aglt(sentence1, [rule1])
112 |     print(sentence1)
113 |     assert sentence1[0]['tag'] == 'praet:sg:m1:pri:perf'
114 |     assert sentence1[1]['token'] != 'm'
115 |     assert sentence1[0]['token'] == 'Zrobiłem'
116 |     assert sentence1[0]['end'] == 41
117 | 
118 | def test_rule3_1():
119 |     sentence1 = copy.deepcopy(paragraph[1])
120 | 
121 |     print(sentence1)
122 |     remove_aglt(sentence1, [rule1, rule3])
123 |     print(sentence1)
124 |     assert sentence1[3]['tag'] == 'cond:sg:m1:pri:perf'
125 |     assert sentence1[1]['token'] == 'bym'
126 |     assert sentence1[1]['end'] == 21
127 | 
128 | def test_rule3_2():
129 |     sentence1 = copy.deepcopy(paragraph[3])
130 | 
131 |     remove_aglt(sentence1, [rule1, rule3])
132 |     print(sentence1)
133 |     assert sentence1[2]['tag'] == 'praet:sg:m1:pri:perf'
134 |     assert sentence1[0]['token'] == 'Abym'
135 |     assert sentence1[0]['end'] == 50
136 | 
137 | def test_rule3_3():
138 |     sentence1 = copy.deepcopy(paragraph[4])
139 | 
140 |     remove_aglt(sentence1, [rule1b, rule3])
141 |     print(sentence1)
142 |     assert sentence1[0]['tag'] == 'cond:sg:m1:ter:perf'
143 |     assert sentence1[0]['token'] == 'Zrobiłby'
144 |     assert sentence1[0]['end'] == 8
145 |     assert sentence1[1]['token'] != 'by'
146 | 
147 | def test_rule3_4():
148 |     sentence1 = copy.deepcopy(paragraph[5])
149 | 
150 |     remove_aglt(sentence1, [rule1b, rule3])
151 |     print(sentence1)
152 |     assert sentence1[3]['tag'] == 'cond:sg:m1:ter:perf'
153 |     assert sentence1[3]['token'] == 'zrobił'
154 | 


--------------------------------------------------------------------------------
/tests/test_analyzers.py:
--------------------------------------------------------------------------------
  1 | from krnnt.analyzers import MacaAnalyzer
  2 | from krnnt.structure import Form
  3 | 
  4 | reference_maca_output = \
  5 | '''Lubię	newline
  6 | 	lubić	fin:sg:pri:imperf
  7 | pociągi	space
  8 | 	pociąg	subst:pl:nom:m3
  9 | 	pociąg	subst:pl:acc:m3
 10 | 	pociąg	subst:pl:voc:m3
 11 | .	none
 12 | 	.	interp'''
 13 | 
 14 | paragraph_raw = 'Lubię pociągi.'
 15 | 
 16 | MACA_CONFIG1='morfeusz-nkjp-official'
 17 | MACA_CONFIG2='morfeusz2-nkjp'
 18 | 
 19 | def test_maca():
 20 |     try:
 21 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
 22 |         results = maca_analyzer._maca(paragraph_raw)
 23 |         results = list(results)
 24 |     except:
 25 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
 26 |         results = maca_analyzer._maca(paragraph_raw)
 27 |         results = list(results)
 28 | 
 29 |     assert len(results) == 1
 30 |     assert results[0] == reference_maca_output
 31 | 
 32 | def test_maca_process():
 33 |     try:
 34 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
 35 |         results = maca_analyzer._maca_process(paragraph_raw)
 36 |         results = list(results)
 37 |     except:
 38 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
 39 |         results = maca_analyzer._maca_process(paragraph_raw)
 40 |         results = list(results)
 41 | 
 42 |     assert len(results) == 1
 43 |     assert results[0] == reference_maca_output
 44 | 
 45 | def test_maca_wrapper():
 46 |     try:
 47 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
 48 |         results = maca_analyzer._maca_wrapper(paragraph_raw)
 49 |         results = list(results)
 50 |     except:
 51 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
 52 |         results = maca_analyzer._maca_wrapper(paragraph_raw)
 53 |         results = list(results)
 54 | 
 55 |     assert len(results) == 1
 56 |     assert results[0] == reference_maca_output
 57 | 
 58 | def test_parse():
 59 |     maca_analyzer = MacaAnalyzer('')
 60 |     maca_analyzer.text = paragraph_raw
 61 |     maca_analyzer.last_offset = 0
 62 |     result = maca_analyzer._parse(reference_maca_output)
 63 | 
 64 |     reference = [
 65 |         ('Lubię', 'newline',
 66 |          [('lubić', 'fin:sg:pri:imperf')],0,5),
 67 |         ('pociągi', 'space',
 68 |          [('pociąg', 'subst:pl:nom:m3'),
 69 |           ('pociąg', 'subst:pl:acc:m3'),
 70 |           ('pociąg', 'subst:pl:voc:m3')],6,13),
 71 |         ('.', 'none',
 72 |          [('.', 'interp')], 13,14)]
 73 | 
 74 |     assert result == reference
 75 | 
 76 | def test_maca_analyzer():
 77 |     try:
 78 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
 79 |         result = maca_analyzer.analyze(paragraph_raw)
 80 |     except:
 81 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
 82 |         result = maca_analyzer.analyze(paragraph_raw)
 83 | 
 84 |     assert len(result.sentences)==1
 85 |     assert len(result.sentences[0].tokens) == 3
 86 | 
 87 |     assert result.sentences[0].tokens[0].form == 'Lubię'
 88 |     assert result.sentences[0].tokens[0].space_before == 'newline'
 89 |     assert len(result.sentences[0].tokens[0].interpretations) == 1
 90 | 
 91 |     assert result.sentences[0].tokens[1].form == 'pociągi'
 92 |     assert result.sentences[0].tokens[1].space_before == 'space'
 93 |     assert len(result.sentences[0].tokens[1].interpretations) == 3
 94 | 
 95 |     assert result.sentences[0].tokens[2].form == '.'
 96 |     assert result.sentences[0].tokens[2].space_before == 'none'
 97 |     assert len(result.sentences[0].tokens[2].interpretations) == 1
 98 | 
 99 |     assert result.sentences[0].tokens[1].interpretations[0] == Form('pociąg', 'subst:pl:nom:m3')
100 |     assert result.sentences[0].tokens[1].interpretations[1] == Form('pociąg', 'subst:pl:acc:m3')
101 |     assert result.sentences[0].tokens[1].interpretations[2] == Form('pociąg', 'subst:pl:voc:m3')
102 | 
103 | 
104 | def test_maca_analyzer_lemmas():
105 |     paragraph_raw='Ala ma kota.'
106 |     try:
107 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
108 |         result = maca_analyzer.analyze(paragraph_raw)
109 |     except:
110 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
111 |         result = maca_analyzer.analyze(paragraph_raw)
112 | 
113 |     lemmas =[form.lemma for form in result.sentences[0].tokens[2].interpretations]
114 |     assert 'kot' in lemmas
115 |     assert 'kot:s1' not in lemmas
116 |     assert 'kot:s2' not in lemmas
117 | 
118 | 


--------------------------------------------------------------------------------
/tests/test_blank.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | 
 3 | from krnnt.aglt import rewrite_praet, remove_aglt, rule1, rule3, rule1b
 4 | from krnnt.blanks import remove_blanks
 5 | 
 6 | sentence = [
 7 | 
 8 |         {'token': '200', 'sep': 'newline', 'tag': 'num:pl:nom:m2:rec',
 9 |          'lemmas': ['200'], 'start': 0, 'end': 3},
10 |         {'token': '.', 'sep': 'none', 'tag': 'blank', 'lemmas': ['.'],
11 |          'start': 3, 'end': 4},
12 |         {'token': '000', 'sep': 'none', 'tag': 'blank',
13 |          'lemmas': ['000'], 'start': 4, 'end': 7},
14 |         {'token': 'zł', 'sep': 'space', 'tag': 'brev:npun',
15 |          'lemmas': ['złoty'], 'start': 8, 'end': 10}
16 | ]
17 | 
18 | 
19 | def test_remove_blanks():
20 |     sentence1 = copy.deepcopy(sentence)
21 |     remove_blanks(sentence1)
22 |     print(sentence1)
23 | 
24 |     assert len(sentence1)==2
25 | 
26 | 
27 |     assert sentence1[0]['tag'] == 'num:pl:nom:m2:rec'
28 |     assert sentence1[0]['token'] == '200.000'
29 |     assert sentence1[0]['start'] == 0
30 |     assert sentence1[0]['end'] == 7
31 | 
32 |     assert sentence1[1] == sentence[-1]


--------------------------------------------------------------------------------
/tests/test_features.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | from krnnt.features import FeaturePreprocessor, TagsPreprocessorCython, TagsPreprocessor, create_token_features
  4 | 
  5 | 
  6 | @pytest.fixture
  7 | def token():
  8 |     return 'asd'
  9 | 
 10 | 
 11 | def test_nic(token):
 12 |     assert ["NIC"] == FeaturePreprocessor.nic(token)
 13 | 
 14 | 
 15 | def test_interps():
 16 |     assert ["."] == FeaturePreprocessor.interps('.', {'tags': ['interp']})
 17 |     assert [] == FeaturePreprocessor.interps('.', {'tags': ['subst']})
 18 |     assert [] == FeaturePreprocessor.interps(':)', {'tags': ['interp']})
 19 | 
 20 | 
 21 | def test_prefix1():
 22 |     assert ["P0k"] == FeaturePreprocessor.prefix1('kot')
 23 |     assert ["P0??"] == FeaturePreprocessor.prefix1('©kot')
 24 |     assert ["P0k"] == FeaturePreprocessor.prefix1('KOT')
 25 | 
 26 | 
 27 | def test_prefix2():
 28 |     assert ["P1o"] == FeaturePreprocessor.prefix2('kot')
 29 |     assert ["P1xx"] == FeaturePreprocessor.prefix2('k')
 30 | 
 31 | 
 32 | def test_prefix3():
 33 |     assert ["P2t"] == FeaturePreprocessor.prefix3('kot')
 34 | 
 35 | 
 36 | def test_suffix1():
 37 |     assert ["S1t"] == FeaturePreprocessor.suffix1('kot')
 38 |     assert ["S1??"] == FeaturePreprocessor.suffix1('kot©')
 39 | 
 40 | 
 41 | def test_suffix2():
 42 |     assert ["S2o"] == FeaturePreprocessor.suffix2('kot')
 43 |     assert ["S2xx"] == FeaturePreprocessor.suffix2('k')
 44 | 
 45 | 
 46 | def test_suffix3():
 47 |     assert ["S3k"] == FeaturePreprocessor.suffix3('kot')
 48 | 
 49 | 
 50 | def test_qubliki():
 51 |     assert [] == FeaturePreprocessor.qubliki('kot')
 52 |     assert ['ale'] == FeaturePreprocessor.qubliki('ale')
 53 |     assert ['ale'] == FeaturePreprocessor.qubliki('Ale')
 54 | 
 55 | 
 56 | @pytest.mark.parametrize('token, expected', [('wrobel', 'l'),
 57 |                                              ('Wrobel', 'ul'),
 58 |                                              ('WROBEL', 'u'),
 59 |                                              ('2019', 'd'),
 60 |                                              ('Wrobel2019', 'uld'),
 61 |                                              ('Wrobel2019:)', 'uldx')])
 62 | def test_shape(token, expected):
 63 |     features = FeaturePreprocessor.shape(token)
 64 |     assert features[0] == expected
 65 |     assert len(features) == 1
 66 | 
 67 | 
 68 | @pytest.mark.parametrize('tags, expected', [
 69 |     (['fin:sg:ter:imperf', 'subst:sg:nom:f'], ['1fin:ter', '2fin:sg:imperf', '1subst:nom',
 70 |                                                '2subst:sg:f']),
 71 |     (['adjp:dat'], ['1adjp:dat', '2adjp']),
 72 |     (['interp'], ['1interp', '2interp']),
 73 |     ([''], ['1', '2']),
 74 |     ([], [])])
 75 | def test_tags4(tags, expected):
 76 |     assert TagsPreprocessor.create_tags4_without_guesser(tags) == expected
 77 |     assert TagsPreprocessorCython.create_tags4_without_guesser(tags) == expected
 78 | 
 79 | 
 80 | @pytest.mark.parametrize('tags, expected', [
 81 |     (['fin:sg:ter:imperf', 'subst:sg:nom:f'], ['sg', 'sg:nom:f', 'nom']),
 82 |     (['adjp:dat'], ['dat']),
 83 |     (['interp'], []),
 84 |     ([''], []),
 85 |     ([], [])])
 86 | def test_tags5(tags, expected):
 87 |     assert TagsPreprocessor.create_tags5_without_guesser(tags) == expected
 88 |     assert TagsPreprocessorCython.create_tags5_without_guesser(tags) == expected
 89 | 
 90 | 
 91 | def test_create_token_features(benchmark):
 92 |     token = 'obejmie'
 93 |     tags = ['subst:sg:loc:m3', 'subst:sg:voc:m3', 'subst:sg:dat:f', 'subst:sg:loc:f',
 94 |             'fin:sg:ter:perf']
 95 |     space_before = ['space_before']
 96 |     features=['l', 'P0o', 'P1b', 'P2e', 'S1e', 'S2i', 'S3m', '1subst:loc', '2subst:sg:m3',
 97 |                           '1subst:voc', '1subst:dat', '2subst:sg:f', '1fin:ter', '2fin:sg:perf', 'sg:loc:m3', 'loc',
 98 |                           'sg:voc:m3', 'voc', 'sg:dat:f', 'dat', 'sg:loc:f', 'sg', 'space_before']
 99 | 
100 |     result_features = create_token_features(token, tags, space_before)
101 |     assert result_features == features
102 | 


--------------------------------------------------------------------------------
/tests/test_morfeusz.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from krnnt.analyzers import MacaAnalyzer
 4 | from krnnt.new import get_morfeusz, analyze_tokenized, analyze_token
 5 | from krnnt.structure import Form
 6 | 
 7 | reference_maca_output = \
 8 | '''Lubię	newline
 9 | 	lubić	fin:sg:pri:imperf
10 | pociągi	space
11 | 	pociąg	subst:pl:nom:m3
12 | 	pociąg	subst:pl:acc:m3
13 | 	pociąg	subst:pl:voc:m3
14 | .	none
15 | 	.	interp'''
16 | 
17 | paragraph_raw = 'Lubię pociągi.'
18 | 
19 | MACA_CONFIG1='morfeusz-nkjp-official'
20 | MACA_CONFIG2='morfeusz2-nkjp'
21 | 
22 | def test_maca():
23 |     try:
24 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
25 |         results = maca_analyzer._maca(paragraph_raw)
26 |         results = list(results)
27 |     except:
28 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
29 |         results = maca_analyzer._maca(paragraph_raw)
30 |         results = list(results)
31 | 
32 |     assert len(results) == 1
33 |     assert results[0] == reference_maca_output
34 | 
35 | 
36 | def test_maca_analyzer(rootdir):
37 |     try:
38 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
39 |         result = maca_analyzer.analyze(paragraph_raw)
40 |     except:
41 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
42 |         result = maca_analyzer.analyze(paragraph_raw)
43 | 
44 |     lines = []
45 |     for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')):
46 |         line = line.strip()
47 |         if not line: continue
48 |         lines.append(line)
49 | 
50 |     morfeusz = get_morfeusz()
51 | 
52 | 
53 | 
54 |     for line in lines:
55 |         paragraph = maca_analyzer.analyze(line)
56 |         for sentence in paragraph:
57 |             for token in sentence:
58 | 
59 |                 maca_tags=[(form.lemma, form.tags) for form in token.interpretations]
60 |                 morfeusz_tags=analyze_token(morfeusz, token.form)
61 |                 maca_tags=set(maca_tags)
62 |                 morfeusz_tags=set(morfeusz_tags)
63 |                 if maca_tags!=morfeusz_tags:
64 |                     print(token)
65 |                     print(sorted(maca_tags-morfeusz_tags))
66 |                     print(sorted(morfeusz_tags-maca_tags))


--------------------------------------------------------------------------------
/tests/test_parallel_api_speed.py:
--------------------------------------------------------------------------------
 1 | import concurrent.futures
 2 | import os
 3 | 
 4 | import pytest
 5 | import requests
 6 | 
 7 | 
 8 | def test_api(rootdir):
 9 |     url = 'http://localhost:9003'
10 | 
11 |     for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')):
12 |         line=line.strip()
13 |         if not line: continue
14 | 
15 |         tag('http://localhost:9003', line)
16 | 
17 | def tag(url, data):
18 |     payload = data.encode('utf-8')
19 |     r = requests.post(url, data=payload)
20 |     return r
21 | 
22 | def chunk(l, batch_size):
23 |     batch = []
24 |     for element in l:
25 |         batch.append(element)
26 |         if len(batch) == batch_size:
27 |             yield batch
28 |             batch = []
29 |     if batch:
30 |         yield batch
31 | 
32 | @pytest.mark.slow
33 | @pytest.mark.parametrize('chunk_size', [100000, 10000, 1000, 100, 10, 4, 2,1])
34 | def test_parallel_api(rootdir, chunk_size):
35 |     print(rootdir, chunk_size)
36 | 
37 |     lines=[]
38 |     for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')):
39 |         line = line.strip()
40 |         if not line: continue
41 |         lines.append(line)
42 | 
43 |     batches = list(chunk(lines, chunk_size))
44 |     print(len(batches))
45 | 
46 |     with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
47 |         future_to_url = {executor.submit(tag, 'http://localhost:9003', "\n\n".join(batch)): "\n\n".join(batch) for batch in batches}
48 |         for future in concurrent.futures.as_completed(future_to_url):
49 |             r=future.result()
50 |             # print(r.text)
51 | 
52 | @pytest.mark.slow
53 | @pytest.mark.parametrize('chunk_size', [100000,10,1])
54 | def test_parallel_api_maca(rootdir, chunk_size):
55 |     lines=[]
56 |     for line in open(os.path.join(rootdir, 'data/full/train-raw.txt')):
57 |         line = line.strip()
58 |         if not line: continue
59 |         lines.append(line)
60 | 
61 |     batches = list(chunk(lines, chunk_size))
62 | 
63 |     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
64 |         future_to_url = {executor.submit(tag, 'http://localhost:9003/maca/', "\n\n".join(batch)): "\n\n".join(batch) for batch in batches}
65 |         for future in concurrent.futures.as_completed(future_to_url):
66 |             r=future.result()
67 | 


--------------------------------------------------------------------------------
/tests/test_process_xces.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from krnnt.aligner import align_paragraphs
 4 | from krnnt.analyzers import MacaAnalyzer
 5 | from krnnt.readers import read_xces
 6 | 
 7 | #TODO parametrize?
 8 | 
 9 | 
10 | 
11 | def test_different_xces_formats(rootdir):
12 |     data = {
13 |         os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7],
14 |         os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6],
15 |         os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12],
16 |         os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25],
17 |         os.path.join(rootdir, 'data/small/00130846.xml'): [25],
18 |         os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2],
19 |         os.path.join(rootdir, 'data/small/00132482.xml'): [2]
20 |     }
21 | 
22 |     for path, paragraph_lenghts in data.items():
23 |         assert paragraph_lenghts == [len(paragraph.sentences) for paragraph in read_xces(path)]
24 |         for paragraph in read_xces(path):
25 |             print(paragraph.text())
26 | 
27 |             for sentence in paragraph:
28 |                 for token in sentence:
29 |                     print(token)
30 |         print()
31 | 
32 | def test_reanalyze(rootdir):
33 |     data = {
34 |         os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7],
35 |         os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6],
36 |         os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12],
37 |         os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25],
38 |         os.path.join(rootdir, 'data/small/00130846.xml'): [25],
39 |         os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2],
40 |         os.path.join(rootdir, 'data/small/00132482.xml'): [2]
41 |     }
42 | 
43 |     for path, paragraph_lenghts in data.items():
44 |         # assert paragraph_lenghts == [len(paragraph.sentences) for paragraph in read_xces(path)]
45 |         maca_analyzer = MacaAnalyzer('morfeusz2-nkjp')
46 |         for paragraph in read_xces(path):
47 |             paragraph_raw = paragraph.text()
48 | 
49 |             paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw)
50 | 
51 |             print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph.sentences))
52 | 
53 |             paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph)
54 |             for sentence in paragraph_reanalyzed:
55 |                 for token in sentence:
56 |                     print(token)
57 |         print()
58 | 


--------------------------------------------------------------------------------
/tests/test_speed.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | MACA_CONFIG=morfeusz2-nkjp
 4 | 
 5 | 
 6 | time cat tests/data/full/test-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces  > /tmp/out.xces
 7 | #12s
 8 | 
 9 | time cat tests/data/full/train-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces  > /tmp/out.xces
10 | #7m16s
11 | 
12 | #one thread
13 | time cat tests/data/full/test-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces
14 | #22s
15 | 
16 | #GPU 1050GTX
17 | #train
18 | #5m12s
19 | 
20 | #time maca-analyse -c morfeusz2-nkjp < tests/data/full/train-raw.txt > /dev/null
21 | #35s
22 | 
23 | #time maca-analyse -c morfeusz2-nkjp < tests/data/full/test-raw.txt > /dev/null
24 | #0.9s
25 | 
26 | #maca per line test-raw.txt
27 | #45s
28 | 
29 | #i tak zrównolegla
30 | 
31 | # test-raw.txt API 1w1t GPU 44s
32 | # test-raw.txt API 1w2t GPU 44s
33 | # test-raw.txt API 2w1t GPU 44s
34 | 
35 | # test-raw.txt API 1w1t CPU 43s
36 | # test-raw.txt API 1w2t CPU 43s
37 | # test-raw.txt API 2w1t CPU 42s
38 | 
39 | # pool=2 test-raw.txt API 1w1t CPU 29s
40 | # pool=2 test-raw.txt API 1w2t CPU 28s
41 | # pool=2 test-raw.txt API 2w1t CPU 25s
42 | 
43 | # pool=2 test-raw.txt API 1w1t GPU 21s
44 | # pool=2 test-raw.txt API 1w2t GPU 30s
45 | # pool=2 test-raw.txt API 2w1t GPU 23s
46 | 
47 | # pool=10 test-raw.txt API 1w1t CPU 20s
48 | # pool=10 test-raw.txt API 1w2t CPU 20s
49 | # pool=10 test-raw.txt API 2w1t CPU 17s
50 | # pool=10 test-raw.txt API 4w1t CPU 15s
51 | # pool=10 test-raw.txt API 4w2t CPU 16s
52 | # pool=10 test-raw.txt API 8w1t CPU 16s
53 | # pool=100 test-raw.txt API 10w1t CPU 14s
54 | # pool=100 test-raw.txt API 20w1t CPU 14s
55 | 
56 | # pool=10 test-raw.txt API 1w1t GPU 21s
57 | # pool=10 test-raw.txt API 1w2t GPU 21s
58 | # pool=10 test-raw.txt API 2w1t GPU 14s
59 | # pool=10 test-raw.txt API 4w1t GPU OOM


--------------------------------------------------------------------------------
/tests/test_structure.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from krnnt.readers import read_xces
 4 | 
 5 | 
 6 | def test_paragraph_text(rootdir):
 7 |     data = {
 8 |         os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7],
 9 |         os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6],
10 |         os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12],
11 |         os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25],
12 |         os.path.join(rootdir, 'data/small/00130846.xml'): [25],
13 |         os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2],
14 |         os.path.join(rootdir, 'data/small/00132482.xml'): [2]
15 |     }
16 | 
17 |     for path, paragraph_lenghts in data.items():
18 |         print(path)
19 |         for paragraph in read_xces(path):
20 |             paragraph_raw = ''
21 |             for sentence_gold in paragraph:
22 |                 paragraph_raw += sentence_gold.text()
23 |             paragraph_raw = paragraph_raw[1:]
24 |             assert paragraph_raw == paragraph.text()
25 | 


--------------------------------------------------------------------------------
/tests/test_system.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | 
  3 | 
  4 | def test_download_model(bash, rootdir):
  5 |     commands = [
  6 |         'cd %s' % rootdir,
  7 |         './download_model.sh'
  8 |     ]
  9 | 
 10 |     with bash() as s:
 11 |         for command in commands:
 12 |             s.run_script_inline([command])
 13 | 
 14 | 
 15 | def test_process_xces(bash, rootdir):
 16 |     commands = [
 17 |         'cd %s' % rootdir,
 18 |         'cd ..',
 19 |         'python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle',
 20 |         'diff /tmp/nkjp.spickle tests/data/reference/nkjp1m-1.2.spickle']
 21 | 
 22 |     for command in commands:
 23 |         bash.run_script_inline([command])
 24 | 
 25 | 
 26 | @pytest.mark.xfail(reason="version of morfeusz dictionary may influence results")
 27 | def test_reanalyze(bash, rootdir):
 28 |     commands = [
 29 |         'cd %s' % rootdir,
 30 |         'cd ..',
 31 |         'python3 reanalyze.py --maca_config $MACA_CONFIG /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle',
 32 |         'diff /tmp/nkjp-reanalyzed.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.spickle'
 33 |     ]
 34 | 
 35 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
 36 |         for command in commands:
 37 |             print(command)
 38 |             s.run_script_inline([command])
 39 | 
 40 | 
 41 | def test_shuffle(bash, rootdir):
 42 |     commands = [
 43 |         'cd %s' % rootdir,
 44 |         'cd ..',
 45 |         'python3 shuffle.py tests/data/reference/nkjp1m-1.2-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle',
 46 |         'diff /tmp/nkjp-reanalyzed.shuf.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle'
 47 |     ]
 48 | 
 49 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
 50 |         for command in commands:
 51 |             s.run_script_inline([command])
 52 | 
 53 | 
 54 | def test_preprocess(bash, rootdir):
 55 |     commands = [
 56 |         'cd %s' % rootdir,
 57 |         'cd ..',
 58 |         'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData',
 59 |         'python3 preprocess_data.py /tmp/nkjp-reanalyzed.shuf.spickle /tmp/nkjp-reanalyzed.shuf.spickle.preprocess',
 60 |         'diff /tmp/nkjp-reanalyzed.shuf.spickle.preprocess tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess',
 61 |     ]
 62 | 
 63 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
 64 |         for command in commands:
 65 |             s.run_script_inline([command])
 66 | 
 67 | 
 68 | def test_create_dict(bash, rootdir):
 69 |     commands = [
 70 |         'cd %s' % rootdir,
 71 |         'cd ..',
 72 |         'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData',
 73 |         'python3 create_dict.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict',
 74 |         'diff /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict',
 75 |     ]
 76 | 
 77 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
 78 |         for command in commands:
 79 |             s.run_script_inline([command])
 80 | 
 81 | 
 82 | @pytest.mark.slow
 83 | def test_train2(bash, rootdir):
 84 |     commands = [
 85 |         'cd %s' % rootdir,
 86 |         'cd ..',
 87 |         'python3 train.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict --maca_config $MACA_CONFIG -e 2 --reproducible --hash test',
 88 |         'h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5.new',
 89 |         'h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final.new',
 90 |     ]
 91 | 
 92 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
 93 |         for command in commands:
 94 |             s.run_script_inline([command])
 95 | 
 96 | 
 97 | @pytest.mark.slow
 98 | def test_train_lemmatization(bash, rootdir):
 99 |     commands = [
100 |         'cd %s' % rootdir,
101 |         'cd ..',
102 |         'python3 train_lemmatization.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess --reproducible --hash test',
103 |         'diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl',
104 |     ]
105 | 
106 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
107 |         for command in commands:
108 |             s.run_script_inline([command])
109 | 
110 | 
111 | def test_join_dicts(bash, rootdir):
112 |     commands = [
113 |         'cd %s' % rootdir,
114 |         'cd ..',
115 |         'python3 join_dicts.py /tmp/joined_dicts.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict --reproducible',
116 |         'diff /tmp/joined_dicts.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict',
117 |     ]
118 | 
119 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
120 |         for command in commands:
121 |             s.run_script_inline([command])
122 | 
123 | 
124 | def test_split_data(bash, rootdir):
125 |     commands = [
126 |         'cd %s' % rootdir,
127 |         'cd ..',
128 |         'python3 split_data.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 0.2',
129 |         'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1',
130 |         'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2',
131 |     ]
132 | 
133 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
134 |         for command in commands:
135 |             s.run_script_inline([command])
136 | 
137 | 
138 | def test_join_data(bash, rootdir):
139 |     commands = [
140 |         'cd %s' % rootdir,
141 |         'cd ..',
142 |         'python3 join_data.py /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.joined tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2',
143 |         'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.joined tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle',
144 |     ]
145 | 
146 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
147 |         for command in commands:
148 |             s.run_script_inline([command])
149 | 
150 | 
151 | @pytest.mark.slow
152 | def test_train(bash, rootdir):
153 |     commands = [
154 |         'cd %s' % rootdir,
155 |         'cd ..',
156 |         'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues',
157 |         'python3 krnnt_train.py --maca_config $MACA_CONFIG /tmp/nkjp-reanalyzed.shuf.spickle -e 2 --reproducible --hash test',
158 | 
159 |         'h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5',
160 |         'h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final',
161 |         'diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl',
162 |         'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2',
163 |         'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData',
164 |         'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues',
165 |     ]
166 | 
167 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
168 |         for command in commands:
169 |             s.run_script_inline([command])
170 | 
171 | 
172 | def test_run_xces(bash, rootdir):
173 |     commands = [
174 |         'cd %s' % rootdir,
175 |         'cd ..',
176 |         'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces < tests/data/reference/in_raw.txt',
177 |         'diff /tmp/out.xces tests/data/reference/out.xces'
178 |     ]
179 | 
180 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
181 |         for command in commands:
182 |             s.run_script_inline([command])
183 | 
184 | 
185 | def test_run_xces_from_training(bash, rootdir):
186 |     commands = [
187 |         'cd %s' % rootdir,
188 |         'cd ..',
189 |         'python3 krnnt_run.py weight_test.hdf5.final lemmatisation_test.pkl /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces < tests/data/reference/in_raw.txt',
190 |         'diff /tmp/out.xces tests/data/reference/out.xces'
191 |     ]
192 | 
193 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
194 |         for command in commands:
195 |             s.run_script_inline([command])
196 | 
197 | 
198 | def test_run_plain(bash, rootdir):
199 |     commands = [
200 |         'cd %s' % rootdir,
201 |         'cd ..',
202 |         'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o plain > /tmp/out.plain < tests/data/reference/in_raw.txt',
203 |         'diff /tmp/out.plain tests/data/reference/out.plain'
204 |     ]
205 | 
206 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
207 |         for command in commands:
208 |             s.run_script_inline([command])
209 | 
210 | 
211 | def test_run_conll(bash, rootdir):
212 |     commands = [
213 |         'cd %s' % rootdir,
214 |         'cd ..',
215 |         'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conll > /tmp/out.conll < tests/data/reference/in_raw.txt',
216 |         'diff /tmp/out.conll tests/data/reference/out.conll'
217 |     ]
218 | 
219 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
220 |         for command in commands:
221 |             s.run_script_inline([command])
222 | 
223 | 
224 | def test_run_conllu(bash, rootdir):
225 |     commands = [
226 |         'cd %s' % rootdir,
227 |         'cd ..',
228 |         'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conllu > /tmp/out.conllu < tests/data/reference/in_raw.txt',
229 |         'diff /tmp/out.conllu tests/data/reference/out.conllu'
230 |     ]
231 | 
232 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
233 |         for command in commands:
234 |             s.run_script_inline([command])
235 | 
236 | 
237 | def test_run_jsonl(bash, rootdir):
238 |     commands = [
239 |         'cd %s' % rootdir,
240 |         'cd ..',
241 | 
242 |         'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o jsonl > /tmp/out.jsonl < tests/data/reference/in_raw.txt',
243 |         'diff /tmp/out.jsonl tests/data/reference/out.jsonl'
244 |     ]
245 | 
246 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
247 |         for command in commands:
248 |             s.run_script_inline([command])
249 | 
250 | 
251 | @pytest.mark.xfail(reason="non-deterministic lemmatisation?")
252 | def test_run_evaluation(bash, rootdir):
253 |     commands = [
254 |         'cd %s' % rootdir,
255 |         'cd ..',
256 |         'cat tests/data/small/gold-task-c.txt | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces',
257 |         'python2 tagger-eval.py  tests/data/small/gold-task-c.xml /tmp/out.xces > /tmp/out_evaluation.txt',
258 |         'diff /tmp/out_evaluation.txt tests/data/reference/gold-task-c_evaluation.txt '
259 |     ]
260 | 
261 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
262 |         for command in commands:
263 |             s.run_script_inline([command])
264 | 
265 | 
266 | @pytest.mark.xfail(reason="non-deterministic lemmatisation?")
267 | def test_run_evaluation_from_training(bash, rootdir):
268 |     commands = [
269 |         'cd %s' % rootdir,
270 |         'cd ..',
271 |         'cat tests/data/small/gold-task-c.txt | python3 krnnt_run.py weight_test.hdf5.final lemmatisation_test.pkl /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces',
272 |         'python2 tagger-eval.py  tests/data/small/gold-task-c.xml /tmp/out.xces > /tmp/out_evaluation.txt',
273 |         'diff /tmp/out_evaluation.txt tests/data/reference/gold-task-c_evaluation.txt '
274 |     ]
275 | 
276 |     with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
277 |         for command in commands:
278 |             s.run_script_inline([command])
279 | 


--------------------------------------------------------------------------------
/tests/test_system_server.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | 
  4 | def test_download_model(bash, rootdir):
  5 |     commands = [
  6 |         'cd %s' % rootdir,
  7 |         './download_model.sh'
  8 |     ]
  9 | 
 10 |     with bash() as s:
 11 |         for command in commands:
 12 |             s.run_script_inline([command])
 13 | 
 14 | #TODO: run server: python3 krnnt_serve.py model_data/ --maca_config morfeusz2-nkjp
 15 | 
 16 | def test_post_raw(bash,rootdir):
 17 |     commands = [
 18 |         'cd %s' % rootdir,
 19 |         'cd ..',
 20 |         'curl -X POST "http://localhost:9003" --data-binary @tests/data/server/in_raw.txt  > /tmp/out.txt',
 21 |         'diff /tmp/out.txt tests/data/server/out_raw.plain'
 22 |     ]
 23 | 
 24 |     with bash() as s:
 25 |         for command in commands:
 26 |             s.run_script_inline([command])
 27 | 
 28 | def test_post_raw_jsonl(bash,rootdir):
 29 |     commands = [
 30 |         'cd %s' % rootdir,
 31 |         'cd ..',
 32 |         'curl -X POST "http://localhost:9003/?output_format=jsonl&input_format=lines" --data-binary @tests/data/server/in_raw.txt  > /tmp/out.txt',
 33 |         'diff /tmp/out.txt tests/data/server/out_raw.jsonl'
 34 |     ]
 35 | 
 36 |     with bash() as s:
 37 |         for command in commands:
 38 |             s.run_script_inline([command])
 39 | 
 40 | def test_post_raw_conll(bash,rootdir):
 41 |     commands = [
 42 |         'cd %s' % rootdir,
 43 |         'cd ..',
 44 |         'curl -X POST "http://localhost:9003/?output_format=conll&input_format=lines" --data-binary @tests/data/server/in_raw.txt  > /tmp/out.txt',
 45 |         'diff /tmp/out.txt tests/data/server/out_raw.conll'
 46 |     ]
 47 | 
 48 |     with bash() as s:
 49 |         for command in commands:
 50 |             s.run_script_inline([command])
 51 | 
 52 | def test_post_raw_conllu(bash,rootdir):
 53 |     commands = [
 54 |         'cd %s' % rootdir,
 55 |         'cd ..',
 56 |         'curl -X POST "http://localhost:9003/?output_format=conllu&input_format=lines" --data-binary @tests/data/server/in_raw.txt  > /tmp/out.txt',
 57 |         'diff /tmp/out.txt tests/data/server/out_raw.conllu'
 58 |     ]
 59 | 
 60 |     with bash() as s:
 61 |         for command in commands:
 62 |             s.run_script_inline([command])
 63 | 
 64 | def test_post_raw_xces(bash,rootdir):
 65 |     commands = [
 66 |         'cd %s' % rootdir,
 67 |         'cd ..',
 68 |         'curl -X POST "http://localhost:9003/?output_format=xces&input_format=lines" --data-binary @tests/data/server/in_raw.txt  > /tmp/out.txt',
 69 |         'diff /tmp/out.txt tests/data/server/out_raw.xces'
 70 |     ]
 71 | 
 72 |     with bash() as s:
 73 |         for command in commands:
 74 |             s.run_script_inline([command])
 75 | 
 76 | def test_post_form(bash, rootdir):
 77 |     commands = [
 78 |         'cd %s' % rootdir,
 79 |         'cd ..',
 80 |         'curl -X POST "http://localhost:9003" --data-binary "text=Lubię placki. Ala ma kota.\n\nRaz dwa trzy." > /tmp/out.txt'
 81 |     ]
 82 | 
 83 |     with bash() as s:
 84 |         for command in commands:
 85 |             s.run_script_inline([command])
 86 | 
 87 |     generated = open('/tmp/out.txt').read()
 88 |     reference = open(os.path.join(rootdir,'data/server/out_raw.plain')).read()
 89 | 
 90 |     assert reference in generated
 91 | 
 92 | def test_post_tokenized_json(bash, rootdir):
 93 |     commands = [
 94 |         'cd %s' % rootdir,
 95 |         'cd ..',
 96 |         'curl -X POST -H "Content-Type: application/json" "http://localhost:9003" -d @tests/data/server/in_tokenized.json > /tmp/out.txt',
 97 |         'diff -B /tmp/out.txt tests/data/server/out_raw.plain'
 98 |     ]
 99 | 
100 |     with bash() as s:
101 |         for command in commands:
102 |             s.run_script_inline([command])
103 | 
104 | def test_post_tokenized_compact_json(bash, rootdir):
105 |     commands = [
106 |         'cd %s' % rootdir,
107 |         'cd ..',
108 |         'curl -X POST -H "Content-Type: application/json" "http://localhost:9003" -d @tests/data/server/in_tokenized_compact.json > /tmp/out.txt',
109 |         'diff -B /tmp/out.txt tests/data/server/out_raw.plain'
110 |     ]
111 | 
112 |     with bash() as s:
113 |         for command in commands:
114 |             s.run_script_inline([command])
115 | 
116 | def test_post_raw_poleval(bash, rootdir):
117 |     commands = [
118 |         'cd %s' % rootdir,
119 |         'cd ..',
120 |         'curl -X POST "http://localhost:9003" --data-binary @tests/data/full/test-raw.txt  > /tmp/out.txt'
121 |     ]
122 | 
123 |     with bash() as s:
124 |         for command in commands:
125 |             s.run_script_inline([command])


--------------------------------------------------------------------------------
/tests/test_tagset.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from krnnt.analyzers import MacaAnalyzer
 3 | from krnnt.new import get_morfeusz, analyze_token
 4 | 
 5 | MACA_CONFIG1='morfeusz-nkjp-official'
 6 | MACA_CONFIG2='morfeusz2-nkjp'
 7 | 
 8 | @pytest.fixture
 9 | def maca():
10 |     try:
11 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
12 |         list(maca_analyzer._maca("test"))
13 |     except:
14 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
15 |         list(maca_analyzer._maca("test"))
16 | 
17 |     return maca_analyzer
18 | 
19 | test_data = [
20 |     ('IV', '', 'num:::'),
21 |     ('IV', '', 'romandig'),
22 |     ('1', '', 'dig'),
23 |     ('prostu', 'adjp', 'adjp:gen'),
24 |     (':)', '', 'emo'),
25 |     ('godzien', 'adjc', ''),
26 |     ('oślep', 'burk', 'frag'),
27 |     ('obojga', 'numcol:pl:gen:m1:rec', ''),
28 |     ('dwoje', 'numcol:pl:acc:m1:rec', ''),
29 |     ('czworo', 'numcol:pl:nom:m1:rec', ''),
30 |     ('hej', 'interj', ''),
31 |     ('jeszcze', 'qub', 'part'),
32 |     ('czterem', 'num:pl:dat:m1:congr', ''),
33 |     ('czym', 'conj', 'comp'),
34 |     ('niedaleko', 'prep:gen', ''),
35 |     ('doprawdy', 'qub', 'adv'),
36 |     ('jak', 'qub', 'adv'),
37 |     ('pół', '', 'numcomp'),
38 |     ('pół', '', 'num:comp'),
39 |     ('pół', 'num:pl:acc:n:rec', ''),
40 |     ('słowa', 'subst:pl:acc:n', 'subst:sg:gen:n:ncol'),
41 |     ('rozklepywało', '', 'praet:sg:n1:ter:imperf'),
42 |     ('bardzo', 'adv:pos', 'adv'),
43 |     ('bardziej', 'adv:com', ''),
44 |     ('znacząco', 'adv:pos', 'pacta'),
45 |     ('my', '', 'ppron12:pl:nom:_:pri'),
46 |     ('sobie', 'siebie:dat', ''),
47 |     ('zł', 'brev:npun', 'brev'),
48 | ]
49 | 
50 | @pytest.mark.parametrize('form, exist, not_exist', test_data)
51 | @pytest.mark.xfail
52 | def test_maca(maca, form, exist, not_exist):
53 |     paragraph=maca.analyze(form)
54 |     sentence=paragraph.sentences[0]
55 |     token=sentence.tokens[0]
56 |     tags = [form.tags for form in token.interpretations]
57 |     print(tags)
58 |     if exist:
59 |         assert exist in tags
60 |     if not_exist:
61 |         assert not_exist not in tags
62 | 
63 | @pytest.mark.parametrize('form, exist, not_exist', test_data)
64 | @pytest.mark.xfail
65 | def test_morfeusz(maca, form, exist, not_exist):
66 |     morfeusz = get_morfeusz()
67 |     tags=[tag for form, tag in analyze_token(morfeusz, form)]
68 |     print(tags)
69 |     if exist:
70 |         assert exist in tags
71 |     if not_exist:
72 |         assert not_exist not in tags


--------------------------------------------------------------------------------
/tests/test_writers.py:
--------------------------------------------------------------------------------
  1 | from krnnt.writers import results_to_conll_str, results_to_conllu_str, results_to_txt_str, results_to_plain_str, \
  2 |     results_to_xces_str
  3 | 
  4 | results = [[[{'token': 'Lubię', 'sep': 'newline', 'prob': 0.37375012, 'tag': 'adj:pl:nom:m1:pos', 'lemmas': ['Lubię'],
  5 |              'start': 0, 'end': 5},
  6 |             {'token': 'placki', 'sep': 'space', 'prob': 0.38550463, 'tag': 'subst:pl:nom:m1', 'lemmas': ['placki'],
  7 |              'start': 6, 'end': 12},
  8 |             {'token': '.', 'sep': 'none', 'prob': 0.99999726, 'tag': 'interp', 'lemmas': ['.'], 'start': 12,
  9 |              'end': 13}], [
 10 |                {'token': 'Ala', 'sep': 'space', 'prob': 0.9995969, 'tag': 'subst:sg:nom:f', 'lemmas': ['Ala'],
 11 |                 'start': 14, 'end': 17},
 12 |                {'token': 'ma', 'sep': 'space', 'prob': 0.6605565, 'tag': 'subst:sg:nom:f', 'lemmas': ['ma'],
 13 |                 'start': 18, 'end': 20},
 14 |                {'token': 'kota', 'sep': 'space', 'prob': 0.93132496, 'tag': 'subst:sg:nom:f', 'lemmas': ['kota'],
 15 |                 'start': 21, 'end': 25},
 16 |                {'token': '.', 'sep': 'none', 'prob': 0.9999993, 'tag': 'interp', 'lemmas': ['.'], 'start': 25,
 17 |                 'end': 26}]], [[
 18 |                {'token': 'Raz', 'sep': 'space', 'prob': 0.23650545, 'tag': 'subst:sg:nom:f', 'lemmas': ['Raz'],
 19 |                 'start': 27, 'end': 30},
 20 |                {'token': 'dwa', 'sep': 'space', 'prob': 0.581044, 'tag': 'adj:pl:acc:f:pos', 'lemmas': ['dwa'],
 21 |                 'start': 31, 'end': 34},
 22 |                {'token': 'trzy', 'sep': 'space', 'prob': 0.71970826, 'tag': 'subst:pl:acc:f', 'lemmas': ['trzy'],
 23 |                 'start': 35, 'end': 39},
 24 |                {'token': '.', 'sep': 'none', 'prob': 0.99999905, 'tag': 'interp', 'lemmas': ['.'], 'start': 39,
 25 |                 'end': 40}]]]
 26 | 
 27 | 
 28 | def test_conll():
 29 |     reference=\
 30 | """Lubię	Lubię	1	adj:pl:nom:m1:pos	0	5
 31 | placki	placki	1	subst:pl:nom:m1	6	12
 32 | .	.	0	interp	12	13
 33 | 
 34 | Ala	Ala	1	subst:sg:nom:f	14	17
 35 | ma	ma	1	subst:sg:nom:f	18	20
 36 | kota	kota	1	subst:sg:nom:f	21	25
 37 | .	.	0	interp	25	26
 38 | 
 39 | 
 40 | Raz	Raz	1	subst:sg:nom:f	27	30
 41 | dwa	dwa	1	adj:pl:acc:f:pos	31	34
 42 | trzy	trzy	1	subst:pl:acc:f	35	39
 43 | .	.	0	interp	39	40
 44 | 
 45 | """
 46 |     output = results_to_conll_str(results)
 47 |     assert output == reference
 48 | 
 49 | def test_conllu():
 50 |     reference=\
 51 | """1	Lubię	Lubię	_	adj:pl:nom:m1:pos	_	_	_	_	_
 52 | 2	placki	placki	_	subst:pl:nom:m1	_	_	_	_	_
 53 | 3	.	.	_	interp	_	_	_	_	_
 54 | 
 55 | 1	Ala	Ala	_	subst:sg:nom:f	_	_	_	_	_
 56 | 2	ma	ma	_	subst:sg:nom:f	_	_	_	_	_
 57 | 3	kota	kota	_	subst:sg:nom:f	_	_	_	_	_
 58 | 4	.	.	_	interp	_	_	_	_	_
 59 | 
 60 | 
 61 | 1	Raz	Raz	_	subst:sg:nom:f	_	_	_	_	_
 62 | 2	dwa	dwa	_	adj:pl:acc:f:pos	_	_	_	_	_
 63 | 3	trzy	trzy	_	subst:pl:acc:f	_	_	_	_	_
 64 | 4	.	.	_	interp	_	_	_	_	_
 65 | 
 66 | """
 67 |     output = results_to_conllu_str(results)
 68 |     assert output == reference
 69 | 
 70 | def test_txt():
 71 |     reference=\
 72 | """Lubię placki.
 73 | Ala ma kota.
 74 | 
 75 | Raz dwa trzy.
 76 | 
 77 | """
 78 |     output = results_to_txt_str(results)
 79 | 
 80 |     assert output == reference
 81 | 
 82 | def test_plain():
 83 |     reference=\
 84 | """Lubię	newline
 85 | 	Lubię	adj:pl:nom:m1:pos	disamb
 86 | placki	space
 87 | 	placki	subst:pl:nom:m1	disamb
 88 | .	none
 89 | 	.	interp	disamb
 90 | 
 91 | Ala	space
 92 | 	Ala	subst:sg:nom:f	disamb
 93 | ma	space
 94 | 	ma	subst:sg:nom:f	disamb
 95 | kota	space
 96 | 	kota	subst:sg:nom:f	disamb
 97 | .	none
 98 | 	.	interp	disamb
 99 | 
100 | 
101 | Raz	space
102 | 	Raz	subst:sg:nom:f	disamb
103 | dwa	space
104 | 	dwa	adj:pl:acc:f:pos	disamb
105 | trzy	space
106 | 	trzy	subst:pl:acc:f	disamb
107 | .	none
108 | 	.	interp	disamb
109 | 
110 | """
111 |     output = results_to_plain_str(results)
112 |     assert output == reference
113 | 
114 | def test_xces():
115 |     reference=\
116 | """<?xml version="1.0" encoding="UTF-8"?>
117 | <!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">
118 | <cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb">
119 | <chunkList>
120 |  <chunk type="p">
121 |   <chunk type="s">
122 |    <tok>
123 |     <orth>Lubię</orth>
124 |     <lex disamb="1"><base>Lubię</base><ctag>adj:pl:nom:m1:pos</ctag></lex>
125 |    </tok>
126 |    <tok>
127 |     <orth>placki</orth>
128 |     <lex disamb="1"><base>placki</base><ctag>subst:pl:nom:m1</ctag></lex>
129 |    </tok>
130 |    <ns/>
131 |    <tok>
132 |     <orth>.</orth>
133 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
134 |    </tok>
135 |   </chunk>
136 |   <chunk type="s">
137 |    <tok>
138 |     <orth>Ala</orth>
139 |     <lex disamb="1"><base>Ala</base><ctag>subst:sg:nom:f</ctag></lex>
140 |    </tok>
141 |    <tok>
142 |     <orth>ma</orth>
143 |     <lex disamb="1"><base>ma</base><ctag>subst:sg:nom:f</ctag></lex>
144 |    </tok>
145 |    <tok>
146 |     <orth>kota</orth>
147 |     <lex disamb="1"><base>kota</base><ctag>subst:sg:nom:f</ctag></lex>
148 |    </tok>
149 |    <ns/>
150 |    <tok>
151 |     <orth>.</orth>
152 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
153 |    </tok>
154 |   </chunk>
155 |  </chunk>
156 |  <chunk type="p">
157 |   <chunk type="s">
158 |    <tok>
159 |     <orth>Raz</orth>
160 |     <lex disamb="1"><base>Raz</base><ctag>subst:sg:nom:f</ctag></lex>
161 |    </tok>
162 |    <tok>
163 |     <orth>dwa</orth>
164 |     <lex disamb="1"><base>dwa</base><ctag>adj:pl:acc:f:pos</ctag></lex>
165 |    </tok>
166 |    <tok>
167 |     <orth>trzy</orth>
168 |     <lex disamb="1"><base>trzy</base><ctag>subst:pl:acc:f</ctag></lex>
169 |    </tok>
170 |    <ns/>
171 |    <tok>
172 |     <orth>.</orth>
173 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
174 |    </tok>
175 |   </chunk>
176 |  </chunk>
177 | </chunkList>
178 | </cesAna>"""
179 | 
180 |     output = results_to_xces_str(results)
181 |     assert output == reference


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | import logging
  4 | from argparse import ArgumentParser
  5 | 
  6 | from keras.models import load_model
  7 | 
  8 | from krnnt.keras_models import BEST, ExperimentParameters
  9 | from krnnt.new import UnalignedSimpleEvaluator
 10 | from krnnt.tagger_exps import RunFolds2, KerasData, RunExperiment, KerasData2, RunExperiment2
 11 | 
 12 | logging.basicConfig(level=logging.DEBUG)
 13 | 
 14 | if __name__ == '__main__':
 15 |     parser = ArgumentParser()
 16 |     parser.add_argument('data_path', help='path to preprocessed data')
 17 |     parser.add_argument('features_dict', help='path to features dict')
 18 | 
 19 |     parser.add_argument('-p', '--preanalyzed', action='store_false',
 20 |                         default=True, dest='reanalyzed',
 21 |                         help='training data have not been reanalyzed')
 22 |     parser.add_argument('-c', '--cv', action='store_true',
 23 |                         default=False, dest='cv',
 24 |                         help='run 10-fold cross-validation')
 25 |     parser.add_argument('-t', '--train_ratio',
 26 |                         default=1.0, dest='train_ratio', type=float,
 27 |                         help='percentage of data for training')
 28 |     parser.add_argument('-d', '--dev_ratio',
 29 |                         default=0.0, dest='dev_ratio', type=float,
 30 |                         help='percentage of training data for development')
 31 |     parser.add_argument('--dev_data', default='0.0', help='dev data ratio or path to dev data')
 32 |     parser.add_argument('--test_data', default='0.0', help='test data ratio or path to test data')
 33 |     parser.add_argument('--load_model', default=None, help='path to pretrained model')
 34 |     parser.add_argument('-e', '--epochs',
 35 |                         default=100, dest='epochs', type=int,
 36 |                         help='number of epochs')
 37 |     parser.add_argument('--patience',
 38 |                         default=10, dest='patience', type=int,
 39 |                         help='patience')
 40 |     parser.add_argument('--maca_config',
 41 |                         default='morfeusz2-nkjp',
 42 |                         help='Maca config')
 43 |     parser.add_argument('--tensor_board',
 44 |                         action='store_true',
 45 |                         help='save data for TensorBoard')
 46 |     parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode')  # TODO
 47 |     parser.add_argument('--hash', action='store', default=None, dest='hash')
 48 |     parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
 49 |     parser.add_argument('-f', '--fold', default=None, dest='fold')
 50 |     args = parser.parse_args()
 51 | 
 52 |     if args.reproducible:
 53 |         from numpy.random import seed
 54 |         seed(1337)
 55 |         import random as rn
 56 |         rn.seed(1337)
 57 |         import tensorflow as tf
 58 |         session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
 59 |                                       inter_op_parallelism_threads=1)
 60 |         from keras import backend as K
 61 |         tf.set_random_seed(1337)
 62 |         sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
 63 |         K.set_session(sess)
 64 | 
 65 |     pref = {'nb_epoch': 100, 'batch_size': 256,
 66 |             'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
 67 |             'evaluator': UnalignedSimpleEvaluator, 'patience': 10,
 68 |             'weight_path': 'weights.hdf5', 'samples_per_epoch': 10000, 'keras_model_class': BEST,
 69 |             'corpus_path': 'data/train-reanalyzed.spickle', 'reanalyze': True, 'train_data_ratio': 0.9,
 70 |             'dev_data_ratio': 0.1}
 71 | 
 72 |     pref['reanalyze'] = args.reanalyzed
 73 |     pref['train_data_ratio'] = float(args.train_ratio)
 74 |     pref['dev_data_ratio'] = float(args.dev_ratio)
 75 | 
 76 |     pref['tensor_board']= args.tensor_board
 77 |     pref['nb_epoch'] = args.epochs
 78 | 
 79 |     pref['dev_data'] = args.dev_data
 80 |     if pref['dev_data']=='0.0':
 81 |         pref['patience'] = pref['nb_epoch']
 82 |     pref['test_data'] = args.test_data
 83 |     pref['load_model'] = args.load_model
 84 | 
 85 | 
 86 |     # pref['corpus_path'] = args.corpus_path
 87 |     pref['patience'] = args.patience
 88 |     pref['maca_config'] = args.maca_config
 89 |     if args.hash is not None:
 90 |         pref['h'] = args.hash
 91 |     if args.fold is not None:
 92 |         pref['fold'] = int(args.fold)
 93 | 
 94 |     keras_model_class = pref['keras_model_class']
 95 | 
 96 |     if args.cv:
 97 |         logging.error('CV is not supported')
 98 |         # rf = RunFolds2(keras_model_class, pref)
 99 |         # rf.run()
100 |     else:
101 |         parameters = ExperimentParameters(pref)
102 | 
103 |         km = keras_model_class(parameters)
104 | 
105 | 
106 | 
107 | 
108 |         print('Model will be saved under: %s.final' % parameters.pref['weight_path'])
109 | 
110 |         kd = KerasData2(args.data_path, args.features_dict, parameters)
111 |         re = RunExperiment2(kd, km)
112 |         re.run()
113 | 
114 |         print('Model is saved under: %s' % parameters.pref['weight_path'])
115 | 
116 | 


--------------------------------------------------------------------------------
/train_lemmatization.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | from argparse import ArgumentParser
 5 | 
 6 | from krnnt.keras_models import ExperimentParameters
 7 | from krnnt.tagger_exps import KerasData2, RunLemma
 8 | 
 9 | if __name__ == '__main__':
10 |     parser = ArgumentParser(description='Train lemmatization')
11 |     parser.add_argument('data_path', help='path to preprocessed data')
12 | 
13 | 
14 |     parser.add_argument('-t', '--train_ratio',
15 |                         default=1.0, dest='train_ratio', type=float,
16 |                         help='percentage of data for training')
17 |     parser.add_argument('-d', '--dev_ratio',
18 |                         default=0.0, dest='dev_ratio', type=float,
19 |                         help='percentage of training data for development')
20 |     parser.add_argument('--dev_data', default='0.1', help='dev data ratio or path to dev data')
21 |     parser.add_argument('--test_data', default='0.1', help='test data ratio or path to test data')
22 |     parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode')  # TODO
23 |     parser.add_argument('--hash', action='store', default=None, dest='hash')
24 |     parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
25 | 
26 |     args = parser.parse_args()
27 | 
28 |     if args.reproducible:
29 |         from numpy.random import seed
30 |         seed(1337)
31 |         import random as rn
32 |         rn.seed(1337)
33 |         import tensorflow as tf
34 |         session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
35 |                                       inter_op_parallelism_threads=1)
36 |         from keras import backend as K
37 |         tf.set_random_seed(1337)
38 |         sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
39 |         K.set_session(sess)
40 | 
41 |     pref = {
42 |         'train_data_ratio': float(args.train_ratio),
43 |         'dev_data_ratio': float(args.dev_ratio),
44 |         'dev_data': args.dev_data,
45 |         'test_data': args.test_data
46 |     }
47 | 
48 |     if args.hash is not None:
49 |         pref['h'] = args.hash
50 | 
51 | 
52 |     parameters = ExperimentParameters(pref)
53 | 
54 |     kd = KerasData2(args.data_path, None, parameters)
55 |     re = RunLemma(kd)
56 |     re.learn_lemma()
57 | 
58 |     print('Lemmatisation model is saved under: %s' % parameters.pref['lemmatisation_path'])
59 | 
60 |     #TODO CV, usunac zaleznosc od TF, KerasData2 bez słownika


--------------------------------------------------------------------------------
/voting.py:
--------------------------------------------------------------------------------
 1 | import collections
 2 | 
 3 | import sys
 4 | 
 5 | from krnnt.writers import results_to_xces, results_to_xces_str
 6 | from krnnt.readers import read_xces
 7 | 
 8 | # path='/home/djstrong/projects/repos/krnnt/models/voting/'
 9 | # path='/home/djstrong/projects/repos/krnnt/'
10 | # files=[path+'text-raw.'+str(i)+'.xml' for i in range(4)]
11 | # files=[path+str(i)+'b.xml' for i in range(10)]
12 | 
13 | path=sys.argv[1]
14 | files=[path+str(i)+'.xml' for i in range(10)]
15 | 
16 | def checkEqual2(iterator):
17 |    return len(set(iterator)) == 1
18 | 
19 | xcess = [read_xces(file) for file in files]
20 | 
21 | result = []
22 | 
23 | count_all=0
24 | count_mismatch=0
25 | 
26 | while True:
27 |     try:
28 |         paragraphs = [next(xces) for xces in xcess]
29 | 
30 |         for sentences in zip(*paragraphs):
31 |             sentence = []
32 |             result.append(sentence)
33 |             for tokens in zip(*sentences):
34 |                 count_all+=1
35 |                 # print(tokens)
36 |                 forms = [token.gold_form for token in tokens]
37 |                 tags = [form.tags for form in forms]
38 | 
39 |                 token_result = {'sep': 'space' if tokens[0].space_before else 'none','token':tokens[0].form}
40 |                 sentence.append(token_result)
41 |                 if not checkEqual2(tags):
42 |                     # print(tags)
43 |                     tags_count=collections.defaultdict(list)
44 |                     for form in forms:
45 |                         tags_count[form.tags].append(form)
46 |                     # print(tags_count)
47 | 
48 |                     sorted_forms = sorted(tags_count.items(), key=lambda x: len(x[1]), reverse=True)
49 |                     # print(tokens[0].form, '\t'*(3-int(len(tokens[0].form)/8)), [(form[0], len(form[1])) for form in sorted_forms])
50 |                     winner = sorted_forms[0][1][0]
51 | 
52 | 
53 | 
54 |                     token_result['tag']=winner.tags
55 |                     token_result['lemmas']=[winner.lemma]
56 |                     count_mismatch+=1
57 |                 else:
58 |                     # print(tokens[0].form, '\t'*(3-int(len(tokens[0].form)/8)), forms[0].tags)
59 |                     token_result['tag']=forms[0].tags
60 |                     token_result['lemmas']=[forms[0].lemma]
61 | 
62 |             # print()
63 |         # print()
64 | 
65 | 
66 | 
67 |     except StopIteration:
68 |         break
69 | 
70 | 
71 | print(results_to_xces_str(result))
72 | 
73 | print(count_all, count_mismatch, file=sys.stderr)


--------------------------------------------------------------------------------