28 | def nic(form, features=None) -> List[str]:
29 | return ['NIC']
30 |
31 | @staticmethod
32 | def interps(form, features) -> List[str]:
33 | if 'interp' in features['tags'] and len(form) == 1:
34 | return [form]
35 | else:
36 | return []
37 |
38 | @staticmethod
39 | def qubliki(form, features=None) -> List[str]:
40 | if form.lower() in FeaturePreprocessor.qubs:
41 | return [form.lower()] #TODO: form.lower()
42 | else:
43 | return []
44 |
45 | @staticmethod
46 | def shape(form, features=None) -> List[str]:
47 | # print(form, shape(form))
48 | return [shape(form)]
49 |
50 | @staticmethod
51 | def prefix(n, form, features=None) -> List[str]:
52 | try:
53 | char = form[n].lower()
54 | if char not in FeaturePreprocessor.safe_chars:
55 | char = '??'
56 | except IndexError:
57 | char = 'xx'
58 |
59 | return ['P' + str(n) + char]
60 |
61 | @staticmethod
62 | def prefix1(form, features=None) -> List[str]:
63 | return FeaturePreprocessor.prefix(0, form, features)
64 |
65 | @staticmethod
66 | def prefix2(form, features=None) -> List[str]:
67 | return FeaturePreprocessor.prefix(1, form, features)
68 |
69 | @staticmethod
70 | def prefix3(form, features=None) -> List[str]:
71 | return FeaturePreprocessor.prefix(2, form, features)
72 |
73 | @staticmethod
74 | def suffix(n, form, features=None) -> List[str]:
75 | try:
76 | char = form[-n].lower()
77 | if char not in FeaturePreprocessor.safe_chars:
78 | char = '??'
79 | except IndexError:
80 | char = 'xx'
81 |
82 | return ['S' + str(n) + char]
83 |
84 | @staticmethod
85 | def suffix1(form, features=None) -> List[str]:
86 | return FeaturePreprocessor.suffix(1, form, features)
87 |
88 | @staticmethod
89 | def suffix2(form, features=None) -> List[str]:
90 | return FeaturePreprocessor.suffix(2, form, features)
91 |
92 | @staticmethod
93 | def suffix3(form, features=None) -> List[str]:
94 | return FeaturePreprocessor.suffix(3, form, features)
95 |
96 |
97 | class TagsPreprocessorCython:
98 | @staticmethod
99 | def create_tags4_without_guesser(tags, features=None) -> List[str]:
100 | return krnnt_utils.create_tags4_without_guesser(tags)
101 |
102 | @staticmethod
103 | def create_tags5_without_guesser(tags, features=None) -> List[str]:
104 | return krnnt_utils.create_tags5_without_guesser(tags)
105 |
106 |
107 | class TagsPreprocessor:
108 | cas = ['nom', 'gen', 'dat', 'acc', 'inst', 'loc', 'voc']
109 | per = ['pri', 'sec', 'ter']
110 | nmb = ['sg', 'pl']
111 | gnd = ['m1', 'm2', 'm3', 'f', 'n']
112 |
113 | @staticmethod
114 | def create_tags4(tags, features=None, keep_guesser=True) -> List[str]: # concraft
115 | if not keep_guesser and 'ign' in tags:
116 | return ['ign']
117 | # return ['1ign','2ign','1subst:nom','2subst:sg:f','1adj:nom','1subst:gen','2subst:sg:n','2subst:sg:m1','2adj:sg:m3:pos','2subst:sg:m3','1num:acc','2num:pl:m3:rec','1brev','2adj:sg:n:pos','2num:pl:m3:congr','1num:nom','1adj:gen','1adj:loc']
118 | return uniq(flatten(map(lambda tag: TagsPreprocessor.create_tag4(tag), tags)))
119 |
120 | @staticmethod
121 | def create_tags4_without_guesser(tags, features=None) -> List[str]:
122 | return TagsPreprocessor.create_tags4(tags, features=features, keep_guesser=False)
123 |
124 | @staticmethod
125 | def create_tag4(otag, features=None) -> List[str]:
126 | tags = flatten(map(lambda x: x.split('.'), otag.split(':')))
127 | pos = tags[0]
128 | tags = tags[1:]
129 | tags2 = []
130 |
131 | first = None
132 | for tag in tags:
133 | if tag in TagsPreprocessor.cas or tag in TagsPreprocessor.per:
134 | first = tag
135 | break
136 |
137 | if first:
138 | tags.remove(first)
139 | tags2.append('1' + pos + ':' + first)
140 | else:
141 | tags2.append('1' + pos) # TODO sprawdzic
142 |
143 | tags2.append('2' + (':'.join([pos] + tags)))
144 |
145 | # print otag, tags2
146 | return uniq(tags2)
147 |
148 | @staticmethod
149 | def create_tags5(tags, features=None, keep_guesser=True) -> List[str]: # concraft
150 | if not keep_guesser and 'ign' in tags:
151 | return ['ign']
152 | # return ['ign','sg:loc:m3','sg:nom:n','pl:nom:m3','pl:acc:m3','loc','sg:gen:m3','pl:gen:m3','sg:nom:m1','sg:nom:m3','gen','nom','acc','sg:nom:f']
153 |
154 | return uniq(flatten(map(lambda tag: TagsPreprocessor.create_tag5(tag), tags)))
155 |
156 | @staticmethod
157 | def create_tags5_without_guesser(tags, features=None) -> List[str]:
158 | return TagsPreprocessor.create_tags5(tags, features=features, keep_guesser=False)
159 |
160 | @staticmethod
161 | def create_tag5(otag, features=None) -> List[str]:
162 |
163 | tags = flatten(map(lambda x: x.split('.'), otag.split(':')))
164 |
165 | tags_out = []
166 | tags2 = []
167 | tags3 = []
168 | for tag in tags:
169 | if tag in TagsPreprocessor.nmb:
170 | tags2.append(tag)
171 | elif tag in TagsPreprocessor.cas:
172 | tags2.append(tag)
173 | tags3.append(tag)
174 | elif tag in TagsPreprocessor.gnd:
175 | tags2.append(tag)
176 |
177 | for tagsX in [tags2, tags3]:
178 | if tagsX:
179 | tags_out.append(':'.join(tagsX))
180 |
181 | return uniq(tags_out)
182 |
183 | def create_token_features(token, tags, space_before) -> List[str]:  # TODO
184 | f = []
185 | f += FeaturePreprocessor.interps(token, {'tags': tags})
186 | f += FeaturePreprocessor.qubliki(token)
187 | f += FeaturePreprocessor.shape(token)  # 90%
188 | f += FeaturePreprocessor.prefix1(token)
189 | f += FeaturePreprocessor.prefix2(token)
190 | f += FeaturePreprocessor.prefix3(token)
191 | f += FeaturePreprocessor.suffix1(token)
192 | f += FeaturePreprocessor.suffix2(token)
193 | f += TagsPreprocessorCython.create_tags4_without_guesser(tags)  # 3%; maybe cache for all tags
194 | f += TagsPreprocessorCython.create_tags5_without_guesser(tags)  # 3%
195 | f += FeaturePreprocessor.suffix1(token)
198 |
199 | return f
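200 |
201 |
202 | # Worked example (a sketch, assuming the krnnt_utils Cython helpers mirror
203 | # TagsPreprocessor, 'ala' is not in FeaturePreprocessor.qubs, and 'a'/'l' are
204 | # in safe_chars):
205 | # create_token_features('Ala', ['subst:sg:nom:f'], ['space_before', 'space'])
206 | # -> ['ul',                          # shape: uppercase run then lowercase run
207 | #     'P0a', 'P1l', 'P2a',           # prefixes
208 | #     'S1a', 'S2l', 'S3a',           # suffixes
209 | #     '1subst:nom', '2subst:sg:f',   # tags4
210 | #     'sg:nom:f', 'nom',             # tags5
211 | #     'space_before', 'space']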
--------------------------------------------------------------------------------
/krnnt/keras_models.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import uuid
3 | from typing import Dict
4 |
5 | import keras
6 | from keras.layers import Dense, Dropout, Input, GRU, TimeDistributed, \
7 | Masking
8 | from keras.layers.wrappers import Bidirectional
9 | from keras.models import Model
10 |
11 |
12 | class ExperimentParameters:
13 | def __init__(self, pref: Dict, testing=False):
14 | self.pref = pref.copy()
15 | if testing:
16 | pass # TODO self.h
17 | else:
18 | if 'h' not in self.pref:
19 | self.pref['h'] = str(uuid.uuid1())
20 | self.h = self.pref['h']
21 | self.pref['weight_path'] = 'weight_' + self.h + '.hdf5'
22 | self.pref['lemmatisation_path'] = 'lemmatisation_' + self.h + '.pkl'
23 |
24 | def save_prefs(self):
25 | # TODO
26 | print(self.pref)
27 |
28 |
29 | class KerasModel:
30 | model: Model
31 |
32 | def __init__(self, parameters: ExperimentParameters):
33 | self.parameters = parameters
34 |
35 | def compile(self):
36 | logging.info('Model compiling')
37 | self.model.compile(loss=self.loss, optimizer=self.optimizer, metrics=['accuracy'])
38 | logging.info('Model compiled')
39 |
40 | def make_predict_func(self):
41 | self.model._make_predict_function()
42 |
43 | def load_weights(self, path):
44 | self.model.load_weights(path)
45 | logging.info('Weights loaded')
46 |
47 | def load_model(self, path):
48 | self.model = keras.models.load_model(path)
49 |
50 | def yaml_model(self):
51 | model_yaml = self.model.to_yaml()
52 | # TODO
53 | return model_yaml
54 |
55 | def create_model(self):
56 | raise NotImplementedError
57 |
58 |
59 | class BEST(KerasModel):
60 | def __init__(self, parameters):
61 | super().__init__(parameters)
62 |
63 | def create_model(self):
64 | features_length = self.parameters.pref['features_length']
65 |
66 | inputs = Input(shape=(None, features_length))
67 | x = inputs
68 | x = Masking(mask_value=0., input_shape=(None, features_length))(x)
69 | x = Bidirectional(
70 | GRU(self.parameters.pref['internal_neurons'], return_sequences=True, dropout=0.0, recurrent_dropout=0.5,
71 | implementation=1), input_shape=(None, features_length))(x)
72 | x = Bidirectional(
73 | GRU(self.parameters.pref['internal_neurons'], return_sequences=True, dropout=0.0, recurrent_dropout=0.5,
74 | implementation=1), input_shape=(None, features_length))(x)
75 | x = Dropout(0.5)(x)
76 | x = TimeDistributed(Dense(self.parameters.pref['output_length'], activation='softmax'))(x)
77 |
78 | self.model = Model(inputs=inputs, outputs=x)
79 |
80 | self.loss = 'categorical_crossentropy'
81 | self.optimizer = keras.optimizers.Nadam()
82 |
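83 |
84 | # Usage sketch (the sizes here are hypothetical; in the pipeline they are
85 | # taken from the unique-features dictionary):
86 | # params = ExperimentParameters({'features_length': 100, 'output_length': 50,
87 | #                                'internal_neurons': 256})
88 | # model = BEST(params)
89 | # model.create_model()   # sets loss (categorical crossentropy) and optimizer (Nadam)
90 | # model.compile()
91 | # model.model.summary()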
--------------------------------------------------------------------------------
/krnnt/pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import math
3 | import pickle
4 | import re
5 | import sys
6 | from typing import List, Iterable, Generator, Union
7 |
8 | from krnnt.analyzers import MacaAnalyzer
9 | from krnnt.structure import Paragraph
10 |
11 | from .keras_models import ExperimentParameters, KerasModel
12 | from krnnt.utils import uniq
13 | from .new import k_hot, UniqueFeaturesValues, Lemmatisation, Lemmatisation2
14 | from krnnt.features import create_token_features
15 |
16 | sys.setrecursionlimit(10000)
17 |
18 | from keras.preprocessing import sequence
19 | import numpy as np
20 | import krnnt_utils
21 |
22 |
23 | class KRNNTSingle:
24 | def __init__(self, pref):
25 | self.pref = pref
26 | self.unique_features_dict = pickle.load(open(pref['UniqueFeaturesValues'], 'rb'))
27 | self.km = KerasThread.create_model(pref, testing=True)
28 | self.lemmatisation = pref['lemmatisation_class']()
29 | self.lemmatisation.load(pref['lemmatisation_path'])
30 |
31 | self.configure()
32 |
33 | def tag_sentence(self, sentence: str, preana=False):
34 | return self.__tag([sentence], preana)
35 |
36 | def tag_sentences(self, sentences: List[str], preana=False):
37 | return self.__tag(sentences, preana)
38 |
39 | def tag_sentences_preana(self, sentences: List[Paragraph]):
40 | return self.__tag(sentences, preana=True)
41 |
42 | def tag_paragraphs(self, paragraphs: Iterable[str], preana=False):
43 | return self.__tag_paragraphs(paragraphs, preana)
44 |
45 | def __tag_paragraphs(self, paragraphs: Iterable[str], preana):
48 | if preana:
49 | sequences = Preprocess.process_batch_preana(enumerate(paragraphs))
50 | else:
51 | sequences = Preprocess.process_batch(paragraphs, self.pref['maca_config'], self.pref['toki_config_path'])
52 |
53 | # batch_size=math.ceil(len_sequences/max(math.floor(len_sequences/self.pref['keras_batch_size']), 1)) # dynamic batch
54 |
55 | result = []
56 | for batch in chunk(sequences, self.pref['keras_batch_size']):
57 | pad_batch = self.pad(batch, self.unique_features_dict, 'tags4e3')
58 | preds = self.km.model.predict_on_batch(pad_batch)
59 | for plain in KerasThread.return_results(batch, preds, self.km.classes, self.lemmatisation):
60 | result.append(plain)
61 |
62 | # split the results into paragraphs by document id
63 | result2 = []
64 | result_paragraph = []
65 | for sentence in result:
66 | if not result_paragraph or sentence[0]['document_id'] == result_paragraph[-1][0]['document_id']:
67 | result_paragraph.append(sentence)
68 | else:
69 | result2.append(result_paragraph)
70 | result_paragraph = [sentence]
71 |
72 | if result_paragraph:
73 | result2.append(result_paragraph)
75 |
76 | return result2
77 |
78 | def configure(self):
79 | if 'krnnt_utils' in sys.modules:
80 | self.pad = krnnt_utils.pad
81 | else:
82 | self.pad = Preprocess.pad
83 |
84 | def __tag(self, sentences: List[str], preana: bool):
85 | if preana:
86 | sequences = Preprocess.process_batch_preana(enumerate(sentences))
87 | else:
88 | sequences = Preprocess.process_batch(sentences, self.pref['maca_config'], self.pref['toki_config_path'])
89 |
90 | # batch_size=math.ceil(len_sequences/max(math.floor(len_sequences/self.pref['keras_batch_size']), 1)) # dynamic batch
91 |
92 | result = []
93 | for batch in chunk(sequences, self.pref['keras_batch_size']):
94 | pad_batch = self.pad(batch, self.unique_features_dict, 'tags4e3')
95 | preds = self.km.model.predict_on_batch(pad_batch)
96 | for plain in KerasThread.return_results(batch, preds, self.km.classes, self.lemmatisation):
97 | result.append(plain)
98 |
99 | return result
100 |
101 |
102 | class Sample:
103 | def __init__(self):
104 | self.features = {}
105 |
106 |
107 | class Preprocess:
108 | @staticmethod
109 | def create_features(sequence: List[Sample]):
110 | for sample in sequence:
111 | sample.features['tags4e3'] = create_token_features(sample.features['token'], sample.features['tags'],
112 | sample.features['space_before'])
113 |
114 | @staticmethod
115 | def process_batch(documents: Iterable[str], maca_config: str, toki_config_path: str) -> Generator[
116 | List[Sample], None, None]:
117 | maca_analyzer = MacaAnalyzer(maca_config, toki_config_path)
118 |
119 | for document_id, document in enumerate(documents):
120 | results = maca_analyzer._maca(document)
121 |
122 | for res in results:
123 | result = maca_analyzer._parse(res)
124 |
125 | sequence = []
126 | for form, space_before, interpretations, start, end in result:
127 | sample = Sample()
128 | sequence.append(sample)
129 | sample.features['token'] = form
130 | sample.features['tags'] = uniq([t for l, t in interpretations])
131 | interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t) for l, t in
132 | interpretations]
133 | sample.features['maca_lemmas'] = [(l.replace('_', ' '), t) for l, t in uniq(interpretations)]
134 |
135 | # TODO: cleanup space before
136 | sample.features['space_before'] = ['space_before'] if space_before != 'none' else ['no_space_before']
138 | sample.features['space_before'].append(space_before)
139 | sample.features['start'] = start
140 | sample.features['end'] = end
141 | sample.features['document_id'] = document_id
142 | Preprocess.create_features(sequence)
143 |
144 | if sequence:
145 | yield sequence
146 |
147 | @staticmethod
148 | def process_batch_preana(batch: Iterable[Paragraph]) -> Generator[List[Sample], None, None]:
149 | for document_id, paragraph in batch:
150 | for sentence in paragraph:
151 | sequence = []
152 | for token in sentence:
153 | sample = Sample()
154 | sequence.append(sample)
155 | sample.features['token'] = token.form
156 | sample.features['tags'] = uniq([form.tags for form in token.interpretations])
157 | sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations])
158 | sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before']
159 | sample.features['space_before'].append(token.space_before)
160 | sample.features['document_id'] = document_id
161 | Preprocess.create_features(sequence)
162 |
163 | if sequence:
164 | yield sequence
165 |
166 | @staticmethod
167 | def pad(batch: List[List[Sample]], unique_features_dict, feature_name: str):
168 | if not batch:
169 | return []
170 |
171 | result_batchX = []
172 | for sentence in batch:
173 | X_sentence = []
174 | for sample in sentence:
175 | X_sentence.append(np.array(k_hot(sample.features[feature_name], unique_features_dict[feature_name])))
176 |
177 | result_batchX.append(X_sentence)
178 |
179 | return sequence.pad_sequences(result_batchX)
180 |
181 |
182 | def chunk(l: Iterable, batch_size: int) -> List:
183 | batch = []
184 | for element in l:
185 | batch.append(element)
186 | if len(batch) == batch_size:
187 | yield batch
188 | batch = []
189 | if batch:
190 | yield batch
191 |
192 |
193 | class KerasThread():
194 |
195 | @staticmethod
196 | def create_model(pref, testing=False) -> KerasModel:
197 | keras_model_class = pref['keras_model_class']
198 |
199 | parameters = ExperimentParameters(pref, testing)
200 |
201 | km = keras_model_class(parameters)
202 |
203 | if 'UniqueFeaturesValues' in pref:
204 | km.unique_features_dict = pickle.load(open(pref['UniqueFeaturesValues'], 'rb'))
205 | else:
206 | # data_path = 'nkjp_paragraphs_shuffled_concraft.spickle_FormatData_PreprocessData'
207 | data_path = pref['data_path']
208 | km.unique_features_dict = UniqueFeaturesValues(data_path).get()
209 |
210 | unique_tags_dict = km.unique_features_dict[pref['label_name']]
211 | km.classes = list(map(lambda k: k[0], sorted(unique_tags_dict.items(), key=lambda k: k[1])))
212 | pref = km.parameters.pref
213 | pref['features_length'] = len(km.unique_features_dict[pref['feature_name']])
214 | pref['output_length'] = len(km.unique_features_dict[pref['label_name']])
215 |
216 | km.create_model()
217 | # self.km.load_weights('weight_7471898792961270266.hdf5')
218 | # km.load_weights('weight_7471898792961270266.hdf5')
219 | # km.load_weights('../artykul/compare/train_on_all.weights')
220 | km.load_weights(pref['weight_path'])
221 | km.compile()
222 |
223 | return km
224 |
225 | @staticmethod
226 | def return_results(sentences: List[List[Sample]], preds, classes: List[str],
227 | lemmatisation: Union[Lemmatisation, Lemmatisation2]):
228 | for sentence, preds2 in zip(sentences, preds): # TODO sentences
229 | # print(preds2.shape)
230 | # print(preds2)
231 |
232 | response = []
233 |
234 | preds3 = preds2.argmax(axis=-1)
235 | preds3max = preds2.max(axis=-1)
236 | # print(len(sentence), len(preds3))
237 | first = True
238 | for sample, max_index, prob in zip(sentence, list(preds3)[-len(sentence):],
239 | list(preds3max)[-len(sentence):]):
240 | # print(sample.features, max_index)
241 | # max_index, max_value = max(enumerate(d), key=lambda x: x[1])
242 |
243 | token_response = {}
244 | response.append(token_response)
245 | predicted_tag = classes[max_index]
246 |
247 | # TODO
248 | if sample.features['space_before'] == ['space_before']:
249 | sep = 'space'
250 | else:
251 | sep = 'none'
252 |
253 | if 'newline' in sample.features['space_before'] or 'newlines' in sample.features['space_before']:
254 | sep = 'newline'
255 | elif 'space' in sample.features['space_before'] or 'spaces' in sample.features['space_before']:
256 | sep = 'space'
257 | elif 'none' in sample.features['space_before']:
258 | sep = 'none'
259 |
260 | # print(sample.features['token']+'\t'+sep)
261 | # response.append(sample.features['token']+'\t'+sep)
262 | token_response['token'] = sample.features['token']
263 | token_response['sep'] = sep
264 | token_response['prob'] = float(prob)
265 | token_response['document_id'] = sample.features['document_id']
266 |
267 | lemmas = [x for x in sample.features['maca_lemmas']]
268 | token_response['tag'] = predicted_tag
269 | token_response['lemmas'] = []
270 | try:
271 | token_response['start'] = sample.features['start']
272 | token_response['end'] = sample.features['end']
273 | except KeyError:
274 | token_response['start'] = None
275 | token_response['end'] = None
276 |
277 | # if not lemmas:
278 | # lemmas.append((sample.features['token'], predicted_tag))
279 | lemma = lemmatisation.disambiguate(token_response['token'], lemmas, predicted_tag)
280 |
281 | token_response['lemmas'].append(lemma)
282 |
283 | # if lemmas:
284 | # for l, t in lemmas:
285 | # #print('\t'+l+'\t'+t+'\tdisamb')
286 | # #response.append('\t'+l+'\t'+t+'\tdisamb')
287 | # token_response['lemmas'].append(l)
288 | # else:
289 | # #print('\t'+sample.features['token']+'\t'+predicted_tag+'\tdisamb')
290 | # #response.append('\t'+sample.features['token']+'\t'+predicted_tag+'\tdisamb')
291 | # token_response['lemmas'].append(sample.features['token'])
292 |
293 | first = False
294 | # print()
295 | # response.append('')
296 |
297 | yield response
298 |
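299 |
300 | # Usage sketch (paths are placeholders; BEST comes from krnnt.keras_models and
301 | # Lemmatisation from krnnt.new):
302 | # pref = {'keras_batch_size': 32, 'internal_neurons': 256, 'feature_name': 'tags4e3',
303 | #         'label_name': 'label', 'keras_model_class': BEST,
304 | #         'lemmatisation_class': Lemmatisation, 'maca_config': 'morfeusz2-nkjp',
305 | #         'toki_config_path': '', 'weight_path': 'weights.hdf5',
306 | #         'lemmatisation_path': 'lemmatisation.pkl', 'UniqueFeaturesValues': 'dictionary.pkl'}
307 | # tagger = KRNNTSingle(pref)
308 | # tagger.tag_sentences(['Ala ma kota.'])
309 | #
310 | # chunk() batches any iterable: list(chunk(range(5), 2)) == [[0, 1], [2, 3], [4]].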
--------------------------------------------------------------------------------
/krnnt/readers.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | from typing import Generator
4 | from xml.etree import ElementTree as ET
5 |
6 | import jsonlines
7 |
8 | from krnnt.structure import Paragraph, Sentence, Token, Form
9 |
10 |
11 | def read_xces(file_path: str) -> Generator[Paragraph, None, None]:
12 | paragraphs_defined = True
13 | ns = False  # no separator (<ns/>) before the current token
14 | first_chunk = True
15 |
16 | for event, elem in ET.iterparse(file_path, events=("start","end",)):
17 | if first_chunk and event=="start" and elem.tag in ('chunk','sentence'):
18 | if elem.get('type') == 's' or elem.tag =='sentence':
19 | paragraphs_defined = False
20 | first_chunk=False
21 | elif event=="end" and elem.tag in ('chunk','sentence'):
22 | xml_sentences=[]
23 | paragraph=Paragraph()
24 | if paragraphs_defined and elem.tag == 'chunk' and elem.get('type')!='s':
25 | xml_sentences = elem.getchildren()
26 | elif (not paragraphs_defined) and ((elem.tag == 'chunk' and elem.get('type')=='s') or elem.tag == 'sentence'):
27 | xml_sentences = [elem]
28 | else:
29 | continue
30 |
31 | for sentence_index, xml_sentence in enumerate(xml_sentences):
32 | sentence=Sentence()
33 | paragraph.add_sentence(sentence)
34 | for token_index, xml_token in enumerate(xml_sentence.getchildren()):
35 | if xml_token.tag=='ns':
36 | if token_index>0 or sentence_index>0: #omit first ns in paragraph
37 | ns=True
38 | elif xml_token.tag=='tok':
39 | token=Token()
40 | token.space_before=not ns
41 |
42 | for xml_node in xml_token.getchildren():
43 | if xml_node.tag=='orth':
44 | orth=xml_node.text
45 | token.form=orth
46 | elif xml_node.tag=='lex':
47 | if xml_node.get('disamb')=='1':
48 | disamb=True
49 | else:
50 | disamb=False
51 |
52 | base=xml_node.find('base').text
53 | ctag=xml_node.find('ctag').text
54 |
55 | form = Form(base, ctag)
56 | if disamb:
57 | if token.gold_form is not None:
58 | logging.warning(f'More than 1 disamb {file_path} {orth}')
59 | token.gold_form=form
60 | else:
61 | token.interpretations.append(form)
62 | elif xml_node.tag=='ann':
63 | continue
64 | else:
65 | logging.error(f'Error 1 {xml_token}')
66 | if token.form:
67 | sentence.add_token(token)
68 | ns=False
69 | else:
70 | logging.error(f'Error 2 {xml_token}')
71 | yield paragraph
72 | elem.clear()
73 |
74 |
75 | def read_jsonl(file_path: str) -> Generator[Paragraph,None,None]:
76 | with jsonlines.Reader(file_path) as reader:
77 | for obj in reader:
78 | a = _list_to_paragraph(obj)
79 | yield a
80 |
81 |
82 | def _list_to_paragraph(l) -> Paragraph:
83 | paragraph = Paragraph()
84 | for s in l:
85 | sentence = Sentence()
86 | paragraph.add_sentence(sentence)
87 | for t in s:
88 | token = Token()
89 | form=t[0]
90 | token.form = form
91 |
92 | # print(t)
93 | try:
94 | space=t[1]
95 | token.space_before = (space == 1)
96 | except IndexError:
97 | token.space_before = True # ?
98 |
99 | interpretations = t[2:]
100 | token.interpretations.extend([Form(base, ctag) for (base, ctag) in interpretations])
101 |
102 | sentence.add_token(token)
103 | return paragraph
104 |
105 |
106 | def json_to_objects(data):
107 | paragraphs = []
108 | for input_paragraph in data['documents']:
109 | paragraph = Paragraph()
110 | paragraphs.append(paragraph)
111 | for input_sentence in input_paragraph['sentences']:
112 | sentence = Sentence()
113 | paragraph.add_sentence(sentence)
114 | for input_token in input_sentence['tokens']:
115 | token = Token()
116 | token.form = input_token['form']
117 | if len(input_token)>=2:
118 | separator=input_token['separator']
119 | if separator is not None:
120 | token.space_before=separator
121 | elif len(input_token)>=4:
122 | token.start=input_token['start']
123 | token.end = input_token['end']
124 | #infer separator before from positions
125 | if len(sentence.tokens)==0:
126 | token.space_before='space'
127 | else:
128 | if sentence.tokens[-1].end==token.start:
129 | token.space_before = 'none'
130 | else:
131 | token.space_before = 'space'
132 | else:
133 | token.space_before = 'space' # TODO ?
134 | sentence.add_token(token)
135 | return paragraphs
136 |
137 |
138 | def json_compact_to_objects(data):
139 | paragraphs = []
140 | for input_paragraph in data:
141 | paragraph = Paragraph()
142 | paragraphs.append(paragraph)
143 | for input_sentence in input_paragraph:
144 | sentence = Sentence()
145 | paragraph.add_sentence(sentence)
146 | for input_token in input_sentence:
147 | token = Token()
148 | token.form = input_token[0]
149 | if len(input_token) >= 2:
150 | separator = input_token[1]
151 | if separator is not None:
152 | token.space_before = separator
153 | elif len(input_token) >= 4:
154 | token.start = input_token[2]
155 | token.end = input_token[3]
156 | # infer separator before from positions
157 | if len(sentence.tokens) == 0:
158 | token.space_before = 'space'
159 | else:
160 | if sentence.tokens[-1].end == token.start:
161 | token.space_before = 'none'
162 | else:
163 | token.space_before = 'space'
164 | else:
165 | token.space_before = 'space' # TODO ?
166 | sentence.add_token(token)
167 | return paragraphs
168 |
169 |
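170 |
171 | # Input shape sketches (illustrative values):
172 | # read_jsonl/_list_to_paragraph expect each sentence as a list of tokens of the
173 | # form [form, space(0/1), [base, ctag], ...]:
174 | #   [[["Ala", 1, ["Ala", "subst:sg:nom:f"]], ["ma", 1, ["mieć", "fin:sg:ter:imperf"]]]]
175 | # json_compact_to_objects expects tokens of the form [form, separator, start, end]:
176 | #   [[[["Ala", "space"], ["ma", "space"], ["kota", "space"], [".", "none"]]]]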
--------------------------------------------------------------------------------
/krnnt/serial_pickle.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | from typing import BinaryIO, Iterable
3 |
4 |
5 | class SerialPickler:
6 | def __init__(self, file: BinaryIO, mode=3):  # doesn't work with protocol 4
7 | self.file = file
8 | self.p = pickle.Pickler(file, mode)
9 |
10 | def add(self, obj):
11 | self.p.dump(obj)
12 | self.p.memo.clear()
13 |
14 | def extend(self, objs: Iterable):
15 | for obj in objs:
16 | self.p.dump(obj)
17 | self.p.memo.clear()
18 |
19 | def close(self):
20 | self.file.close()
21 |
22 |
23 | class SerialUnpickler:
24 | def __init__(self, file: BinaryIO, stop: int = -1, start: int = 0, ids: Iterable = None):
25 | """
26 | :param file: file object opened for reading
27 | :param start: unpickle objects starting from index start
28 | :param stop: unpickle objects ending with index stop (exclusive)
29 | :param ids: unpickle only objects with indexes in ids
30 | """
32 | if ids is None:
33 | ids = []
34 | self.file = file
35 | self.p = pickle.Unpickler(file)
36 | self.c = 0
37 | self.stop = stop
38 | self.start = start
39 | self.ids = set(ids)
40 |
41 | def __iter__(self):
42 | if self.ids:
43 | return self.__iter2()
44 | else:
45 | return self.__iter1()
46 |
47 | def __iter1(self):
48 | while True:
49 | try:
50 | if self.c == self.stop:
51 | break
52 | self.c += 1
53 | x = self.p.load()
54 | if self.c - 1 < self.start:
55 | continue
56 |
57 | # print self.c
58 | yield x
59 | except EOFError:
60 | break
61 |
62 | def __iter2(self):
63 | while True:
64 | try:
65 | x = self.p.load()
66 | if self.c in self.ids:
67 | yield x
68 | self.c += 1
69 | except EOFError:
70 | break
71 |
72 |
73 | def count_samples(path: str) -> int:
74 | """
75 | Return number of items in serial pickle file.
76 | """
77 | with open(path, 'rb') as file:
78 | su = SerialUnpickler(file)
79 |
80 | count = 0
81 | for paragraph in su:
82 | count += 1
83 |
84 | return count
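85 |
86 |
87 | # Usage sketch:
88 | # with open('corpus.spickle', 'wb') as f:
89 | #     sp = SerialPickler(f)
90 | #     sp.extend(paragraphs)  # any iterable of picklable objects
91 | #
92 | # with open('corpus.spickle', 'rb') as f:
93 | #     for obj in SerialUnpickler(f, stop=10):  # first 10 objects
94 | #         print(obj)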
--------------------------------------------------------------------------------
/krnnt/structure.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from typing import List
5 |
6 |
7 | class Paragraph:
8 | sentences: List['Sentence']
9 |
10 | __slots__ = ['sentences', 'concraft']
11 |
12 | def __init__(self):
13 | self.sentences = []
14 |
15 | def add_sentence(self, sentence: 'Sentence'):
16 | self.sentences.append(sentence)
17 |
18 | def __iter__(self):
19 | return self.sentences.__iter__()
20 |
21 | def text(self) -> str:
22 | raw = ''.join([sentence.text() for sentence in self.sentences])
23 | try:
24 | if self.sentences[0].tokens[0].space_before:
25 | return raw[1:]
26 | else:
27 | return raw
28 | except IndexError:
29 | return raw
30 |
31 | def __str__(self):
32 | return 'Paragraph([%s])' % ','.join([str(x) for x in self.sentences])
33 |
34 |
35 | class Sentence:
36 | tokens: List['Token']
37 |
38 | __slots__ = ['tokens']
39 |
40 | def __init__(self):
41 | self.tokens = []
42 |
43 | def add_token(self, token: 'Token'):
44 | self.tokens.append(token)
45 |
46 | def text(self) -> str:
47 | return ''.join(map(lambda token: ' ' + token.form if token.space_before else token.form, self.tokens))
48 |
49 | def __iter__(self):
50 | return self.tokens.__iter__()
51 |
52 | def __str__(self):
53 | return 'Sentence([%s])' % ','.join([str(x) for x in self.tokens])
54 |
55 | class Token:
56 | form: str
57 | interpretations: List['Form']
58 | gold_form: 'Form'
59 |
60 | __slots__ = ['form', 'space_before', 'interpretations', 'gold_form', 'start', 'end']
61 |
62 | def __init__(self):
63 | self.form = None
64 | self.space_before = None
65 | self.interpretations = []
66 | self.gold_form = None
67 |
68 | def add_interpretation(self, interpretation: 'Form'):
69 | self.interpretations.append(interpretation)
70 |
71 | def __str__(self):
72 | return 'Token(%s, %s, %s, %s)' % (self.form, ','.join([str(x) for x in self.interpretations]), self.space_before, str(self.gold_form))
73 |
74 |
75 | class Form:
76 | def __init__(self, lemma: str, tags: str):
77 | self.lemma = lemma
78 | self.tags = tags
79 |
80 | def __str__(self):
81 | return 'Form(%s, %s)' % (self.lemma, self.tags)
82 |
83 | def __eq__(self, y):
84 | return self.lemma == y.lemma and self.tags == y.tags
85 |
86 | def __hash__(self):
87 | return hash((self.lemma, self.tags))
88 |
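89 |
90 | # Example (sketch): building a sentence and reconstructing its text.
91 | # s = Sentence()
92 | # t1 = Token(); t1.form = 'Ala'; t1.space_before = False
93 | # t2 = Token(); t2.form = 'ma'; t2.space_before = True
94 | # s.add_token(t1); s.add_token(t2)
95 | # s.text()  # -> 'Ala ma'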
--------------------------------------------------------------------------------
/krnnt/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable, List
2 |
3 | import regex
4 |
5 |
6 | def unix_uniq(l: str) -> str:
7 | packed = []
8 |
9 | for el in l:
10 | if not packed or packed[-1] != el:
11 | packed.append(el)
12 | return ''.join(packed)
13 |
14 |
15 | def uniq(seq: Iterable) -> List:
16 | seen = set()
17 | return [x for x in seq if not (x in seen or seen.add(x))]
18 |
19 |
20 | def flatten(l: Iterable) -> List:
21 | return [item for sublist in l for item in sublist]
22 |
23 |
24 | def shape(word: str) -> str:  # TODO: reduce runtime
25 | word = regex.sub(r'(?V1)\p{Lowercase}', 'l', word, flags=regex.U) # 80%
26 | word = regex.sub(r'(?V1)\p{Uppercase}', 'u', word, flags=regex.U)
27 | word = regex.sub(r'\p{gc=Decimal_Number}', 'd', word, flags=regex.U)
28 | word = regex.sub(r'[^A-Za-z0-9]', 'x', word, flags=regex.LOCALE)
29 | return unix_uniq(word)
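30 |
31 |
32 | # Examples:
33 | # unix_uniq('aabba')     -> 'aba'
34 | # uniq([3, 1, 3, 2, 1])  -> [3, 1, 2]
35 | # flatten([[1, 2], [3]]) -> [1, 2, 3]
36 | # shape('Wrocław123')    -> 'uld'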
--------------------------------------------------------------------------------
/krnnt/writers.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | import logging
4 | import sys
5 | from typing import Callable
6 |
7 | import jsonlines
8 |
9 |
10 | def results_to_txt_str(result_paragraphs):
11 | result_str = []
12 | for paragraph in result_paragraphs:
13 | for sentence in paragraph:
14 | for i, token in enumerate(sentence):
15 | # print(token['sep'])
16 | if i > 0 and token['sep'] != 'none':
17 | result_str += (' ',)
18 | result_str += (token['token'],)
19 | result_str += ("\n",)
20 | result_str += ("\n",)
21 | return ''.join(result_str)
22 |
23 |
24 | def results_to_conll_str(result_paragraphs):
25 | result_str = []
26 | for paragraph in result_paragraphs:
27 | for sentence in paragraph:
28 | for token in sentence:
29 | try:
30 | start = token['start']
31 | except KeyError:
32 | start = ''
33 |
34 | try:
35 | end = token['end']
36 | except KeyError:
37 | end = ''
38 |
39 | result_str += ('%s\t%s\t%s\t%s\t%s\t%s' % (
40 | token['token'], token['lemmas'][0], 0 if token['sep'] == 'none' else 1, token['tag'], start, end),)
41 | result_str += ("",)
42 | result_str += ("",)
43 | return '\n'.join(result_str)
44 |
45 |
46 | def results_to_jsonl_str(result_paragraphs):
47 | fp = io.StringIO()
48 | with jsonlines.Writer(fp) as writer:
49 | for paragraph in result_paragraphs:
50 | output_paragraph=[]
51 | for sentence in paragraph:
52 | ss = [(token['token'], token['lemmas'][0], token['tag']) for token in sentence]
53 | output_paragraph+=(ss,)
54 | writer.write(output_paragraph)
55 | return fp.getvalue()
56 |
57 | def results_to_json_str(result_paragraphs):
58 | return json.dumps(result_paragraphs)
59 |
60 |
61 | def results_to_conllu_str(result_paragraphs):
62 | result_str = []
63 | for paragraph in result_paragraphs:
64 | for sentence in paragraph:
65 | for i, token in enumerate(sentence):
66 | result_str += ('%s\t%s\t%s\t_\t%s\t_\t_\t_\t_\t_' % (
67 | i + 1, token['token'], token['lemmas'][0], token['tag']),)
68 | result_str += ("",)
69 | result_str += ("",)
70 | return '\n'.join(result_str)
71 |
72 |
73 | def results_to_plain_str(result_paragraphs):
74 | result_str = []
75 | for paragraph in result_paragraphs:
76 | for sentence in paragraph:
77 | for token in sentence:
78 | result_str += ('%s\t%s' % (token['token'], token['sep']),)
79 | for lemma in token['lemmas']:
80 | result_str += ('\t%s\t%s\tdisamb' % (lemma, token['tag']),)
81 | result_str += ("",)
82 | result_str += ("",)
83 | return '\n'.join(result_str)
84 |
85 |
86 | def results_to_xces_str(result_paragraphs):
87 | result_str = []
88 | result_str += ('<?xml version="1.0" encoding="UTF-8"?>',
89 | '<!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">',
90 | '<cesAna version="1.0" type="lex disamb" xmlns:xlink="http://www.w3.org/1999/xlink">',
91 | '<chunkList>')
92 | for paragraph in result_paragraphs:
93 | result_str += (' <chunk type="p">', )
94 | for sentence in paragraph:
95 | result_str += ('  <chunk type="s">',)
96 | for token in sentence:
97 | if token['sep'] == 'none':
98 | result_str += ('   <ns/>',)
99 | result_str += ('   <tok>',)
100 | result_str += ('    <orth>%s</orth>' % escape_xml(token['token']),)
101 | for lemma in token['lemmas']:
102 | result_str += ('    <lex disamb="1"><base>%s</base><ctag>%s</ctag></lex>' % (escape_xml(lemma),
103 | token['tag']),)
104 | result_str += ('   </tok>',)
105 | result_str += ('  </chunk>',)
106 | result_str += (' </chunk>',)
107 |
108 | result_str += ('</chunkList>',
109 | '</cesAna>')
110 | return '\n'.join(result_str)
111 |
112 |
113 | def escape_xml(s):
114 | return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace('\'',
115 | '&apos;')
116 |
117 |
118 | def get_output_converter(output_format: str) -> Callable:
119 | output_format = output_format.lower()
120 | if output_format == 'xces':
121 | conversion = results_to_xces_str
122 | elif output_format == 'plain':
123 | conversion = results_to_plain_str
124 | elif output_format in ('conll','tsv'):
125 | conversion = results_to_conll_str
126 | elif output_format == 'conllu':
127 | conversion = results_to_conllu_str
128 | elif output_format == 'jsonl':
129 | conversion = results_to_jsonl_str
130 | elif output_format == 'json':
131 | conversion = results_to_json_str
132 | elif output_format in ('txt','text'):
133 | conversion = results_to_txt_str
134 | else:
135 | logging.error('Wrong output format.')
136 | sys.exit(1)
137 |
138 | return conversion
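139 |
140 |
141 | # Usage sketch (a minimal token dict shaped like KerasThread.return_results output):
142 | # paragraphs = [[[{'token': 'Ala', 'sep': 'newline', 'tag': 'subst:sg:nom:f',
143 | #                  'lemmas': ['Ala']}]]]
144 | # get_output_converter('plain')(paragraphs)
145 | # # -> 'Ala\tnewline\n\tAla\tsubst:sg:nom:f\tdisamb\n\n'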
--------------------------------------------------------------------------------
/krnnt_run.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import logging
4 | import sys
5 |
6 | from argparse import ArgumentParser
7 |
8 | from krnnt.aglt import remove_aglt_from_results_rule1_3
9 | from krnnt.blanks import remove_blanks_from_results
10 | from krnnt.keras_models import BEST
11 | from krnnt.new import Lemmatisation, Lemmatisation2, get_morfeusz, analyze_tokenized
12 | from krnnt.pipeline import KRNNTSingle, chunk
13 | from krnnt.readers import read_xces, read_jsonl
14 | from krnnt.writers import results_to_jsonl_str, results_to_conll_str, results_to_conllu_str, \
15 | results_to_xces_str, results_to_plain_str
16 |
17 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
18 |
19 |
20 | if __name__ == '__main__':
21 | parser = ArgumentParser(description='Run tagger')
22 | parser.add_argument('weight_path', help='path to weights, lemmatisation data and dictionary')
23 | parser.add_argument('lemmatisation_data', help='path to lemmatisation data')
24 | parser.add_argument('dictionary', help='path to dictionary')
25 | parser.add_argument('-p', '--preanalyzed', action='store_false',
26 | default=True, dest='reanalyzed',
27 | help='training data have not been reanalyzed')
28 | parser.add_argument('-i', '--input-format', default='xces', dest='input_format',
29 | help='input format of preanalyzed data: xces, jsonl')
30 | parser.add_argument('-o', '--output-format',
31 | default='xces', dest='output_format',
32 | help='output format: xces, plain, conll, conllu, jsonl')
33 | parser.add_argument('--maca_config',
34 | default='morfeusz2-nkjp',
35 | help='Maca config')
36 | parser.add_argument('--toki_config_path',
37 | default='',
38 | help='Toki config path (directory)')
39 | parser.add_argument('--lemmatisation',
40 | default='sgjp',
41 | help='lemmatization mode (sgjp, simple)')
42 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO
43 | parser.add_argument('--tokenized', action='store_true',
44 | help='input data are tokenized, but not analyzed')
45 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
46 | parser.add_argument('--chunk_size',
47 | default=100000, type=int,
48 | help='chunk size')
49 | parser.add_argument('--remove_aglt', action='store_true')
50 | parser.add_argument('--dont_remove_blank', action='store_false')
51 | args = parser.parse_args()
52 |
53 | if args.reproducible:
54 | from numpy.random import seed
55 | seed(1337)
56 | import random as rn
57 | rn.seed(1337)
58 | import tensorflow as tf
59 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
60 | inter_op_parallelism_threads=1)
61 | from keras import backend as K
62 | tf.set_random_seed(1337)
63 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
64 | K.set_session(sess)
65 |
66 | pref = {'keras_batch_size': 32, 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
67 | 'keras_model_class': BEST, 'maca_config':args.maca_config, 'toki_config_path':args.toki_config_path}
68 |
69 | if args.lemmatisation == 'simple':
70 | pref['lemmatisation_class'] = Lemmatisation2
71 | else:
72 | pref['lemmatisation_class'] = Lemmatisation
73 |
74 | pref['reanalyze'] = args.reanalyzed
75 | # pref['input_format'] = options.input_format
76 | pref['output_format'] = args.output_format
77 |
78 | pref['weight_path'] = args.weight_path
79 | pref['lemmatisation_path'] = args.lemmatisation_data
80 | pref['UniqueFeaturesValues'] = args.dictionary
81 |
82 | krnnt = KRNNTSingle(pref)
83 | #time python3 -m cProfile -o gpu_run_train2.profil krnnt_run.py ../krnnt/data/weights.hdf5 ../krnnt/data/lemmatisation.pkl ../krnnt/data/dictionary.pkl -o xces > /tmp/out.xces < ../krnnt-refactor/tests/data/full/train-raw.txt
84 |
85 | if args.tokenized:
86 | if args.input_format == 'jsonl':
87 | corpus = read_jsonl(sys.stdin)
88 | else:
89 | print('Wrong input format.')
90 | sys.exit(1)
91 |
92 | morf=get_morfeusz()
93 | corpus = analyze_tokenized(morf, corpus)
94 | results = krnnt.tag_paragraphs(corpus, preana=True)
95 | elif args.reanalyzed:
96 | data = sys.stdin.read().split('\n\n')
97 | results = []
98 | for batch in chunk(data, args.chunk_size):
99 | results += krnnt.tag_paragraphs(batch)  # e.g. ['Ala ma kota.', 'Ale nie ma psa.']
100 | # TODO: print here
101 | else:
102 | # f = io.StringIO(sys.stdin.read())
103 | if args.input_format == 'xces':
104 | corpus = read_xces(sys.stdin)
105 | elif args.input_format == 'jsonl':
106 | corpus = read_jsonl(sys.stdin)
107 | else:
108 | print('Wrong input format.')
109 | sys.exit(1)
110 |
111 | results = krnnt.tag_paragraphs(corpus, preana=True)
112 |
113 | # print(results)
114 |
115 | if args.output_format == 'xces':
116 | conversion = results_to_xces_str
117 | elif args.output_format == 'plain':
118 | conversion = results_to_plain_str
119 | elif args.output_format == 'conll':
120 | conversion = results_to_conll_str
121 | elif args.output_format == 'conllu':
122 | conversion = results_to_conllu_str
123 | elif args.output_format == 'jsonl':
124 | conversion = results_to_jsonl_str
125 | else:
126 | print('Wrong output format.')
127 | sys.exit(1)
128 |
129 |
130 | if args.remove_aglt:
131 | remove_aglt_from_results_rule1_3(results)
132 |
133 | if args.dont_remove_blank:
134 | remove_blanks_from_results(results)
135 |
136 | print(conversion(results), end='')
137 |
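138 | # Example invocation (a sketch; file names are placeholders):
139 | # python3 krnnt_run.py weights.hdf5 lemmatisation.pkl dictionary.pkl \
140 | #     -o plain < input.txt > output.plain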
--------------------------------------------------------------------------------
/krnnt_serve.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import sys
5 | from argparse import ArgumentParser
6 |
7 | from flask import Flask
8 | from flask import request
9 | from krnnt.additional_format import additional_format
10 | from krnnt.aglt import remove_aglt_from_results_rule1_3
11 | from krnnt.analyzers import MacaAnalyzer
12 | from krnnt.blanks import remove_blanks_from_results
13 | from krnnt.keras_models import BEST
14 | from krnnt.new import Lemmatisation, Lemmatisation2, get_morfeusz, analyze_tokenized
15 | from krnnt.writers import get_output_converter
16 | from krnnt.readers import json_to_objects, json_compact_to_objects
17 | from krnnt.pipeline import KRNNTSingle
18 |
19 | app = Flask(__name__)
20 | app.config['JSON_AS_ASCII'] = False
21 | application = app
22 |
23 | global krnntx, conversion, maca_analyzer, morfeusz
24 |
25 |
26 | def render(text='', str_results=''):
27 | return """
28 | <html>
29 | <head>
30 | <meta charset="utf-8"/>
31 | <title>KRNNT</title>
32 | </head>
33 | <body>
34 | <h1>KRNNT: Polish Recurrent Neural Network Tagger</h1>
35 | <form action="/tag/" method="POST">
36 | <textarea name="text" rows="10" cols="100">%s</textarea><br/>
37 | <input type="submit" value="Tag"/>
38 | </form>
39 | <pre>%s</pre>
40 | <p>The tagset is described here: <a href="http://nkjp.pl/poliqarp/help/ense2.html">http://nkjp.pl/poliqarp/help/ense2.html</a></p>
41 | <p>Wróbel Krzysztof, <i>KRNNT: Polish Recurrent Neural Network Tagger</i></p>
42 | <p>Source code: <a href="https://github.com/kwrobel-nlp/krnnt">https://github.com/kwrobel-nlp/krnnt</a></p>
43 | </body>
44 | </html>
45 | """ % (text, str_results)
45 |
46 |
47 | @app.route('/', methods=['GET'])
48 | def gui():
49 | return render()
50 |
51 |
52 | @app.route('/', methods=['POST'])
53 | def tag_raw():
54 | request.get_data()
55 |
56 | input_format = request.args.get('input_format', default=None, type=str)
57 | output_format = request.args.get('output_format', default='plain', type=str)
58 | remove_aglt = request.args.get('remove_aglt', default='0', type=str)
59 | remove_blank = request.args.get('remove_blank', default='1', type=str)
60 |
61 | conversion2 = get_output_converter(output_format)
62 |
63 | if remove_aglt != '0':
64 | conversionx = conversion2
65 | conversion2 = lambda x: conversionx(remove_aglt_from_results_rule1_3(x))
66 |
67 | if remove_blank != '0':
68 | conversionx2 = conversion2
69 | conversion2 = lambda x: conversionx2(remove_blanks_from_results(x))
70 |
71 | if request.is_json:
72 | data = request.get_json()
73 |
74 | if 'docs' in data:
75 | return additional_format(data, krnntx, morfeusz)
76 | else:
77 | if 'documents' in data:
78 | paragraphs = json_to_objects(data)
79 | else:
80 | paragraphs = json_compact_to_objects(data)
81 |
82 | corpus = analyze_tokenized(morfeusz, paragraphs)
83 | results = krnntx.tag_paragraphs(corpus, preana=True)
84 |
85 | return conversion2(results)
86 | elif 'text' in request.form:
87 | text = request.form['text']
88 |
89 | results = krnntx.tag_paragraphs([text])  # e.g. ['Ala ma kota.', 'Ale nie ma psa.']
92 | return render(text, conversion(results))
93 | else:
94 | text = request.get_data()
95 |
96 | if input_format == 'lines':
97 | data = text.decode('utf-8').split('\n\n') #TODO
98 | else:
99 | data = [text.decode('utf-8')]
100 |
101 | results = krnntx.tag_paragraphs(data)
102 |
103 | return conversion2(results)
104 |
105 |
106 | @app.route('/tag/', methods=['POST'])
107 | def tag():
108 | text = request.form['text']
109 | results = krnntx.tag_sentences(text.split('\n\n')) # ['Ala ma kota.', 'Ale nie ma psa.']
110 | return render(text, conversion(results))
111 |
112 | @app.route('/maca/', methods=['POST'])
113 | def maca():
114 | text = request.get_data()
115 | # print(text.decode('utf-8').split('\n\n'))
116 |
117 | results = maca_analyzer._maca(text.decode('utf-8').split('\n\n'))
118 | results = list(results)
119 | return str(results)
120 |
121 |
122 | def main(argv=sys.argv[1:]):
123 | print(argv)
124 | global conversion,krnntx,maca_analyzer, morfeusz
125 |
126 | parser = ArgumentParser(usage='HTTP Tagger server')
127 | parser.add_argument('model_path', help='path to directory with weights, lemmatisation data and dictionary')
128 | parser.add_argument('-p', '--port',
129 | default=9003,
130 | help='server port (defaults to 9003)')
131 | parser.add_argument('-t', '--host',
132 | default='0.0.0.0',
133 | help='server host (defaults to 0.0.0.0)')
134 | parser.add_argument('--maca_config',
135 | default='morfeusz-nkjp-official',
136 | help='Maca config')
137 | parser.add_argument('--toki_config_path',
138 | default='',
139 | help='Toki config path (directory)')
140 | parser.add_argument('--lemmatisation',
141 | default='sgjp',
142 | help='lemmatization mode (sgjp, simple)')
143 | parser.add_argument('-o', '--output-format',
144 | default='plain', dest='output_format',
145 | help='output format: xces, plain, conll, conllu, jsonl')
146 | parser.add_argument('-b', '--batch_size',
147 | default=32, type=int,
148 | help='batch size')
149 | parser.add_argument('--remove_aglt', action='store_true')
150 | parser.add_argument('--dont_remove_blank', action='store_false')
151 | args = parser.parse_args(argv)
152 |
153 | pref = {'keras_batch_size': args.batch_size, 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
154 | 'keras_model_class': BEST, 'maca_config': args.maca_config, 'toki_config_path': args.toki_config_path}
155 |
156 | if args.lemmatisation == 'simple':
157 | pref['lemmatisation_class'] = Lemmatisation2
158 | else:
159 | pref['lemmatisation_class'] = Lemmatisation
160 |
161 | pref['reanalyze'] = True
162 |
163 | pref['weight_path'] = args.model_path + "/weights.hdf5"
164 | pref['lemmatisation_path'] = args.model_path + "/lemmatisation.pkl"
165 | pref['UniqueFeaturesValues'] = args.model_path + "/dictionary.pkl"
166 |
167 | morfeusz = get_morfeusz()
168 | maca_analyzer = MacaAnalyzer(args.maca_config)
169 | krnntx = KRNNTSingle(pref)
170 |
171 | krnntx.tag_sentences(['Ala'])
172 |
173 | conversion = get_output_converter(args.output_format)
174 |
175 | if args.remove_aglt:
176 | conversionx = conversion
177 | conversion = lambda x: conversionx(remove_aglt_from_results_rule1_3(x))
178 |
179 | if args.dont_remove_blank:
180 | conversionx2 = conversion
181 | conversion = lambda x: conversionx2(remove_blanks_from_results(x))
182 |
183 |
184 | return app, args.host, args.port
185 |
186 |
187 |
188 | if __name__ == '__main__':
189 | app, host, port = main()
190 | # from werkzeug.middleware.profiler import ProfilerMiddleware
191 | # app.config['PROFILE'] = True
192 | # app = ProfilerMiddleware(app)
193 | # app.wsgi_app = ProfilerMiddleware(
194 | # app.wsgi_app, profile_dir="."
195 | # )
196 | app.run(host=host, port=port, debug=False) # threaded=False on GPU
197 |
198 | def start(*args, **kwargs):
199 | app, host, port = main(args)
200 | return app
201 |
202 | #gunicorn -b 127.0.0.1:9003 -w 4 -k gevent -t 3600 --threads 4 'krnnt_serve:start("model_data","--maca_config","morfeusz2-nkjp","--toki_config_path","/home/krnnt/")'
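203 |
204 | # Example requests (sketch):
205 | # curl -X POST 'http://localhost:9003/?output_format=plain' --data 'Ala ma kota.'
206 | # curl -X POST 'http://localhost:9003/?output_format=jsonl&remove_aglt=1' --data 'Ala ma kota.'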
--------------------------------------------------------------------------------
/krnnt_train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from argparse import ArgumentParser
5 |
6 | from krnnt.keras_models import BEST, ExperimentParameters
7 | from krnnt.new import UnalignedSimpleEvaluator
8 | from krnnt.tagger_exps import RunFolds2, KerasData, RunExperiment
9 |
10 |
11 | if __name__ == '__main__':
12 | parser = ArgumentParser()
13 | parser.add_argument('corpus_path', help='path to corpus')
14 | parser.add_argument('-p', '--preanalyzed', action='store_false',
15 | default=True, dest='reanalyzed',
16 | help='training data have not been reanalyzed')
17 | parser.add_argument('-c', '--cv', action='store_true',
18 | default=False, dest='cv',
19 | help='run 10-fold cross-validation')
20 | parser.add_argument('-t', '--train_ratio',
21 | default=1.0, dest='train_ratio', type=float,
22 | help='percentage of data for training')
23 | parser.add_argument('-d', '--dev_ratio',
24 | default=0.0, dest='dev_ratio', type=float,
25 | help='percentage of training data for development')
26 | parser.add_argument('-e', '--epochs',
27 | default=100, dest='epochs', type=int,
28 | help='number of epochs')
29 | parser.add_argument('--patience',
30 | default=10, dest='patience', type=int,
31 | help='patience')
32 | parser.add_argument('--maca_config',
33 | default='morfeusz-nkjp-official',
34 | help='Maca config')
35 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO
36 | parser.add_argument('--hash', action='store', default=None, dest='hash')
37 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
38 | parser.add_argument('-f', '--fold', default=None, dest='fold')
39 | args = parser.parse_args()
40 |
41 | if args.reproducible:
42 | from numpy.random import seed
43 | seed(1337)
44 | import random as rn
45 | rn.seed(1337)
46 | import tensorflow as tf
47 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
48 | inter_op_parallelism_threads=1)
49 | from keras import backend as K
50 | tf.set_random_seed(1337)
51 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
52 | K.set_session(sess)
53 |
54 | pref = {'nb_epoch': 100, 'batch_size': 256,
55 | 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
56 | 'evaluator': UnalignedSimpleEvaluator, 'patience': 10,
57 | 'weight_path': 'weights.hdf5', 'samples_per_epoch': 10000, 'keras_model_class': BEST,
58 | 'corpus_path': 'data/train-reanalyzed.spickle', 'reanalyze': True, 'train_data_ratio': 0.9,
59 | 'dev_data_ratio': 0.1}
60 |
61 | pref['reanalyze'] = args.reanalyzed
62 | pref['train_data_ratio'] = float(args.train_ratio)
63 | pref['dev_data_ratio'] = float(args.dev_ratio)
64 | pref['nb_epoch'] = int(args.epochs)
65 | pref['corpus_path'] = args.corpus_path
66 | pref['patience'] = args.patience
67 | pref['maca_config'] = args.maca_config
68 | if args.hash is not None:
69 | pref['h'] = args.hash
70 | if args.fold is not None:
71 | pref['fold'] = int(args.fold)
72 |
73 | keras_model_class = pref['keras_model_class']
74 |
75 | if args.cv:
76 | rf = RunFolds2(keras_model_class, pref)
77 | rf.run()
78 | else:
79 | parameters = ExperimentParameters(pref)
80 | km = keras_model_class(parameters)
81 |
82 | print('Model will be saved under: %s.final' % parameters.pref['weight_path'])
83 | print('Lemmatisation model will be saved under: %s' % parameters.pref['lemmatisation_path'])
84 |
85 | kd = KerasData(pref['corpus_path'], pref['reanalyze'])
86 | re = RunExperiment(kd, km)
87 | re.run()
88 |
89 | print('Model is saved under: %s' % parameters.pref['weight_path'])
90 | print('Lemmatisation model is saved under: %s' % parameters.pref['lemmatisation_path'])
91 | if pref['reanalyze']:
92 | print('Dictionary is saved under: %s' % parameters.pref[
93 | 'corpus_path'] + '_FormatData2_PreprocessData_UniqueFeaturesValues')
94 | else:
95 | print('Dictionary is saved under: %s' % parameters.pref[
96 | 'corpus_path'] + '_FormatDataPreAnalyzed_PreprocessData_UniqueFeaturesValues')
97 |
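98 | # Example invocations (a sketch; corpus paths are placeholders):
99 | # python3 krnnt_train.py train-reanalyzed.shuf.spickle -e 100 --patience 10 -d 0.1
100 | # python3 krnnt_train.py train-preanalyzed.spickle -p   # preanalyzed data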
--------------------------------------------------------------------------------
/merge_analyzed_gold.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from argparse import ArgumentParser
4 |
5 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
6 |
7 | if __name__ == '__main__':
8 | parser = ArgumentParser(
9 | description='Combines analyzed corpus with gold. Analyzed corpus must be with gold segmentation.')
10 | parser.add_argument('gold_path', help='')
11 | parser.add_argument('analyzed_path', help='')
12 | parser.add_argument('output_path', help='')
13 | args = parser.parse_args()
14 |
15 | file_path1 = args.gold_path
16 | file_path2 = args.analyzed_path
17 | output_path = args.output_path
18 |
19 | file1 = open(file_path1, 'rb')
20 | su_gold = SerialUnpickler(file1)
21 |
22 | file2 = open(file_path2, 'rb')
23 | su_analyzed = SerialUnpickler(file2)
24 |
25 | file3 = open(output_path, 'wb')
26 | sp = SerialPickler(file3)
27 |
28 | for paragraph_gold in su_gold:
29 | for sentence_gold in paragraph_gold:
30 | paragraph_analyzed = next(su_analyzed.__iter__())
31 | assert len(paragraph_analyzed.sentences) == 1
32 | sentence_analyzed = paragraph_analyzed.sentences[0]
33 | assert len(sentence_analyzed.tokens) == len(sentence_gold.tokens)
34 | for token_gold, token_analyzed in zip(sentence_gold, sentence_analyzed):
35 | token_gold.interpretations = token_analyzed.interpretations
36 | sp.add(paragraph_gold)
37 |
38 | file3.close()
39 |
--------------------------------------------------------------------------------
/preprocess_data.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 |
3 | from tqdm import tqdm
4 |
5 | from krnnt.new import preprocess_paragraph_preanalyzed, \
6 | preprocess_paragraph_reanalyzed
7 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
8 | from krnnt.structure import Paragraph
9 |
10 | if __name__ == '__main__':
11 | parser = ArgumentParser(description='Create features for neural network.')
12 | parser.add_argument('input_path', type=str, help='path to re/preanalyzed data')
13 | parser.add_argument('output_path', type=str, help='save path')
14 | parser.add_argument('-p', '--preanalyzed', action='store_false',
15 | default=True, dest='reanalyzed',
16 | help='training data have not been reanalyzed')
17 | args = parser.parse_args()
18 |
19 | file = open(args.input_path, 'rb')
20 | su = SerialUnpickler(file)
21 |
22 | file2 = open(args.output_path, 'wb')
23 | sp = SerialPickler(file2)
24 |
25 | if args.reanalyzed:
26 | preprocess_method = preprocess_paragraph_reanalyzed
27 | else:
28 | preprocess_method = preprocess_paragraph_preanalyzed
29 |
30 | paragraph: Paragraph
31 | for paragraph in tqdm(su, total=18484):
32 | paragraph_sequence = preprocess_method(paragraph)
33 |
34 | sp.add(paragraph_sequence)
35 |
36 | file.close()
37 | file2.close()
38 |
39 |
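40 | # Example invocation (a sketch; file names are placeholders):
41 | # python3 preprocess_data.py train-reanalyzed.spickle train-preprocessed.spickle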
--------------------------------------------------------------------------------
/process_xces.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import glob
4 |
5 | from krnnt.serial_pickle import SerialPickler
6 | from argparse import ArgumentParser
7 |
8 | from krnnt.readers import read_xces
9 |
10 | usage = """%(prog)s CORPUS SAVE_PATH
11 |
12 | Converts an XCES corpus to the internal KRNNT representation and saves it to a file.
13 |
14 | E.g. %(prog)s train-analyzed.xml train-analyzed.spickle
15 | """
16 |
17 | if __name__ == '__main__':
18 | parser = ArgumentParser(usage=usage)
19 | parser.add_argument('file_path', type=str, help='path to XCES corpus (or path with wildcard)')
20 | parser.add_argument('output_path', type=str, help='save path')
21 | args = parser.parse_args()
22 |
23 | with open(args.output_path, 'wb') as file:
24 | sp = SerialPickler(file)
25 |
26 | for path in glob.iglob(args.file_path):
27 | print(path)
28 | for paragraph in read_xces(path):
29 | sp.add(paragraph)
30 |
--------------------------------------------------------------------------------
/reanalyze.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from argparse import ArgumentParser
4 |
5 | from tqdm import tqdm
6 |
7 | from krnnt.aligner import align_paragraphs
8 | from krnnt.analyzers import MacaAnalyzer
9 | from krnnt.structure import Paragraph
10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
11 |
12 | usage = """%(prog)s CORPUS_GOLD CORPUS_SAVE
13 |
14 | Reanalyze the corpus with Maca.
15 |
16 | E.g. %(prog)s train-gold.spickle train-reanalyzed.spickle
17 | """
18 |
19 | if __name__ == '__main__':
20 | parser = ArgumentParser(usage=usage)
21 | parser.add_argument('file_path', type=str, help='paths to corpus')
22 | parser.add_argument('output_path', type=str, help='save path')
23 | parser.add_argument('--maca_config', default='morfeusz2-nkjp', help='Maca config')
24 | parser.add_argument('--toki_config_path', default='', help='Toki config path (directory)')
25 | args = parser.parse_args()
26 |
27 | file1 = open(args.file_path, 'rb')
28 | su_gold = SerialUnpickler(file1)
29 |
30 | file2 = open(args.output_path, 'wb')
31 | sp = SerialPickler(file2)
32 |
33 | maca_analyzer = MacaAnalyzer(args.maca_config)
34 |
35 | paragraph_gold: Paragraph
36 | for j, paragraph_gold in tqdm(enumerate(su_gold), total=18484, desc='Morphological analysis'):
37 | paragraph_raw = paragraph_gold.text()
38 |
39 | paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw)
40 |
41 | print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph_gold.sentences))
42 |
43 | paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph_gold)
44 |
45 | sp.add(paragraph_reanalyzed)
46 |
47 | file2.close()
48 |
49 | # TODO: count mismatched sentences
50 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython
2 | scikit-learn
3 | flask
4 | tqdm
5 | h5py==2.9.0
6 | Keras==2.2.4
7 | numpy==1.16.4
8 | regex==2019.6.8
9 | requests==2.22.0
10 | jsonlines==1.2.0
11 | tensorflow-gpu==1.12.0
12 | pytest
13 | gunicorn
14 | git+https://github.com/djstrong/pytest-shell.git#egg=pytest-shell
15 | git+https://github.com/djstrong/krnnt_text_utils.git@cython
16 | pytest-benchmark
17 |
--------------------------------------------------------------------------------
/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Check whether krnnt_serve is already running (grep counts its own ps entry, so 1 means no server is up)
4 | SERVER_STARTED=0
5 | if [ "$(ps aux | grep -c krnnt_serve)" -eq 1 ]; then
6 | echo 'Starting server'
7 | ./start_flask_server.sh > /dev/null 2>&1 &
8 | PID=$!
9 | echo "PID: $PID"
10 | SERVER_STARTED=1
11 | sleep 5
12 | fi
13 |
14 |
15 | cd tests
16 | python3 -m pytest
17 |
18 |
19 | if [ $SERVER_STARTED -eq 1 ]; then
20 | echo 'Killing server'
21 | pkill -P "$PID"
22 | fi
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(name='krnnt',
4 | version='1.0.0',
5 | description='Part of speech tagger for Polish',
6 | url='https://github.com/kwrobel-nlp/krnnt',
7 | author='Krzysztof Wróbel',
8 | author_email='Wrobel.Krzysztof@gmail.com',
9 | packages=['krnnt'],
10 | license='LGPL',
11 | python_requires='>=3, <4',
12 | install_requires=[
13 | 'Cython', 'h5py', 'Keras==2.2.5', 'numpy', 'regex', 'requests', 'jsonlines', 'tqdm', 'flask', 'gunicorn',
14 | 'krnnt_utils @ git+https://github.com/Zhylkaaa/krnnt_text_utils@cython'
15 | ],
16 | extras_require={
17 | 'train': ['scikit-learn'],
18 | 'pytest': ['pytest', 'pytest-benchmark',
19 | 'pytest-shell @ https://api.github.com/repos/djstrong/pytest-shell/tarball/'],
20 | 'tfcpu': ['tensorflow==1.14.0'],
21 | 'tfgpu': ['tensorflow-gpu==1.12.0']
22 | },
23 | zip_safe=False)
--------------------------------------------------------------------------------
/shuffle.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import random
5 | from argparse import ArgumentParser
6 |
7 | from tqdm import tqdm
8 |
9 | from krnnt.structure import Paragraph
10 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler
11 |
12 | usage = """%(prog)s CORPUS SAVE_PATH
13 | 
14 | Shuffles training data.
15 | 
16 | E.g. %(prog)s train-merged.spickle train-merged.shuf.spickle
17 | """
18 |
19 | if __name__ == '__main__':
20 | parser = ArgumentParser(usage=usage)
21 |     parser.add_argument('file_path', type=str, help='path to corpus')
22 | parser.add_argument('output_path', type=str, help='save path')
23 | parser.add_argument('--seed', '-s', type=int, default=1337, help='seed')
24 | args = parser.parse_args()
25 |
26 | file_path1 = args.file_path
27 | file_path2 = args.output_path
28 |
29 | file = open(file_path1, 'rb')
30 | su = SerialUnpickler(file)
31 |
32 | paragraphs = []
33 | paragraph: Paragraph
34 | for paragraph in tqdm(su, desc='Loading', total=18484):
35 | paragraphs.append(paragraph)
36 | file.close()
37 |
38 | random.seed(args.seed)
39 | random.shuffle(paragraphs)
40 |
41 | file2 = open(file_path2, 'wb')
42 | sp = SerialPickler(file2)
43 |
44 | for paragraph in tqdm(paragraphs, desc='Saving'):
45 | sp.add(paragraph)
46 |
47 | file2.close()
48 |
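49 | # Note: the whole corpus is loaded into memory because a serial pickle stream
50 | # cannot be shuffled in place; for NKJP1M (~18.5k paragraphs) this is manageable.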
--------------------------------------------------------------------------------
/split_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import math
4 | from argparse import ArgumentParser
5 |
6 | from krnnt.serial_pickle import SerialPickler, SerialUnpickler, count_samples
7 |
8 | if __name__ == '__main__':
9 | parser = ArgumentParser(description='Split data')
10 | parser.add_argument('input_path', help='input path to data')
11 | parser.add_argument('output_path1', help='output path to data')
12 | parser.add_argument('output_path2', help='output path to data')
13 | parser.add_argument('ratio', type=float, help='ratio of data to write to the first output')
14 |
15 | args = parser.parse_args()
16 |
17 | num_data = count_samples(args.input_path)
18 | first_part = math.ceil(num_data * args.ratio)
19 |
20 | sp1 = SerialPickler(open(args.output_path1, 'wb'))
21 | sp2 = SerialPickler(open(args.output_path2, 'wb'))
22 |
23 | su = SerialUnpickler(open(args.input_path, 'rb'))
24 | for i, paragraph in enumerate(su):
25 | if i < first_part:
26 | sp1.add(paragraph)
27 | else:
28 | sp2.add(paragraph)
29 | sp1.close()
30 | sp2.close()
31 |
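32 | # Worked example: with 100 paragraphs and ratio 0.9, first_part = ceil(90) = 90,
33 | # so paragraphs 0-89 go to output_path1 and the remaining 10 to output_path2.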
--------------------------------------------------------------------------------
/start_flask_server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | PORT=${PORT:-9003}
4 |
5 | export CUDA_VISIBLE_DEVICES=""
6 |
7 | python3 krnnt_serve.py model_data --maca_config morfeusz2-nkjp -p $PORT
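8 | # CUDA_VISIBLE_DEVICES="" above forces CPU inference; remove it to allow GPU use.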
--------------------------------------------------------------------------------
/start_gunicorn_server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | PORT=${PORT:-9003}
4 | WORKERS=${WORKERS:-1}
5 |
6 | echo "Starting server with $WORKERS workers."
7 |
8 | export CUDA_VISIBLE_DEVICES=""
9 |
10 | gunicorn -b 0.0.0.0:$PORT -w $WORKERS -k sync -t 3600 --threads 1 'krnnt_serve:start("model_data","--maca_config","morfeusz2-nkjp")'
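11 | # -t 3600 raises the worker timeout for long tagging requests; with -k sync and
12 | # --threads 1 each worker handles one request at a time, so throughput scales
13 | # with WORKERS.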
--------------------------------------------------------------------------------
/tests/benchmark/test_maca.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from krnnt.analyzers import MacaAnalyzer
4 |
5 | paragraph_raw = 'Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.'
6 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
7 | MACA_CONFIG2 = 'morfeusz2-nkjp'
8 |
9 |
10 | @pytest.fixture
11 | def get_maca_wrapper():
12 | try:
13 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
14 | list(maca_analyzer._maca_wrapper(paragraph_raw))
15 |     except Exception:
16 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
17 | list(maca_analyzer._maca_wrapper(paragraph_raw))
18 |
19 | return maca_analyzer
20 |
21 |
22 | @pytest.fixture
23 | def get_maca_process():
24 | try:
25 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
26 | list(maca_analyzer._maca_process(paragraph_raw))
27 |     except Exception:
28 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
29 | list(maca_analyzer._maca_process(paragraph_raw))
30 |
31 | return maca_analyzer
32 |
33 |
34 | def analyze_process(maca_analyzer, data):
35 | results = maca_analyzer._maca_process(data)
36 | return list(results)
37 |
38 |
39 | def analyze_wrapper(maca_analyzer, data):
40 | results = maca_analyzer._maca_wrapper(data)
41 | return list(results)
42 |
43 |
44 | @pytest.mark.slow
45 | def test_maca_process_speed(benchmark, get_maca_process):
46 | maca_analyzer = get_maca_process
47 | benchmark(analyze_process, maca_analyzer, paragraph_raw)
48 |
49 |
50 | @pytest.mark.slow
51 | def test_maca_wrapper_speed(benchmark, get_maca_wrapper):
52 | maca_analyzer = get_maca_wrapper
53 | benchmark(analyze_wrapper, maca_analyzer, paragraph_raw)
54 |
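55 | # The fixtures try the Morfeusz 1 config first and fall back to the Morfeusz 2
56 | # one, so the benchmarks run with whichever analyzer backend is installed.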
--------------------------------------------------------------------------------
/tests/benchmark/test_maca_analyze.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from krnnt.analyzers import MacaAnalyzer
4 |
5 | paragraph_raw = 'Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.'
6 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
7 | MACA_CONFIG2 = 'morfeusz2-nkjp'
8 |
9 |
10 | @pytest.fixture
11 | def get_maca_wrapper():
12 | try:
13 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
14 | list(maca_analyzer._maca_wrapper(paragraph_raw))
15 | maca_analyzer._maca = maca_analyzer._maca_wrapper
16 |     except Exception:
17 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
18 | list(maca_analyzer._maca_wrapper(paragraph_raw))
19 | maca_analyzer._maca = maca_analyzer._maca_wrapper
20 |
21 | return maca_analyzer
22 |
23 |
24 | @pytest.fixture
25 | def get_maca_process():
26 | try:
27 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
28 | list(maca_analyzer._maca_process(paragraph_raw))
29 | maca_analyzer._maca = maca_analyzer._maca_process
30 |     except Exception:
31 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
32 | list(maca_analyzer._maca_process(paragraph_raw))
33 | maca_analyzer._maca = maca_analyzer._maca_process
34 |
35 | return maca_analyzer
36 |
37 |
38 | def analyze_process(maca_analyzer, data):
39 | results = maca_analyzer.analyze(data)
40 | return list(results)
41 |
42 |
43 | def analyze_wrapper(maca_analyzer, data):
44 | results = maca_analyzer.analyze(data)
45 | return list(results)
46 |
47 |
48 | @pytest.mark.slow
49 | def test_maca_process_speed(benchmark, get_maca_process):
50 | maca_analyzer = get_maca_process
51 | benchmark(analyze_process, maca_analyzer, paragraph_raw)
52 |
53 |
54 | @pytest.mark.slow
55 | def test_maca_wrapper_speed(benchmark, get_maca_wrapper):
56 | maca_analyzer = get_maca_wrapper
57 | benchmark(analyze_wrapper, maca_analyzer, paragraph_raw)
58 |
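59 | # Unlike tests/benchmark/test_maca.py, these fixtures pin maca_analyzer._maca to
60 | # one backend, so the benchmarks time the full analyze() call: the Maca run plus
61 | # parsing into the internal structure.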
--------------------------------------------------------------------------------
/tests/benchmark/test_shape.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from krnnt.utils import shape
4 | import krnnt_utils
5 |
6 | @pytest.fixture
7 | def word():
8 | return "ljhbasjk8f5IYTVIGHVaisftityvfiouyfO*86f97f697"
9 |
10 | @pytest.mark.slow
11 | def test_shape_regex(word, benchmark):
12 |     benchmark(shape, word)
13 |
14 | @pytest.mark.slow
15 | def test_shape_cython(word, benchmark):
16 |     benchmark(krnnt_utils.shape, word)
--------------------------------------------------------------------------------
/tests/benchmark/test_tags.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from krnnt.features import TagsPreprocessor, TagsPreprocessorCython
4 |
5 |
6 | @pytest.fixture
7 | def tags():
8 | return ['fin:sg:ter:imperf', 'subst:sg:nom:f']
9 |
10 |
11 | @pytest.mark.slow
12 | def test_tags4(tags, benchmark):
13 | benchmark(TagsPreprocessor.create_tags4_without_guesser, tags)
14 |
15 |
16 | @pytest.mark.slow
17 | def test_tags4_cython(tags, benchmark):
18 | benchmark(TagsPreprocessorCython.create_tags4_without_guesser, tags)
19 |
20 |
21 | @pytest.mark.slow
22 | def test_tags5(tags, benchmark):
23 | benchmark(TagsPreprocessor.create_tags5_without_guesser, tags)
24 |
25 |
26 | @pytest.mark.slow
27 | def test_tags5_cython(tags, benchmark):
28 | benchmark(TagsPreprocessorCython.create_tags5_without_guesser, tags)
29 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 |
4 | @pytest.fixture
5 | def rootdir():
6 | return os.path.dirname(os.path.abspath(__file__))
--------------------------------------------------------------------------------
/tests/data/reference/gold-task-c_evaluation.txt:
--------------------------------------------------------------------------------
1 | ### FOLD 1: tests/data/small/gold-task-c.xml (tag) v. /tmp/out.xces (ref)
2 | PolEval 2017 competition scores
3 | -------------------------------
4 | POS accuracy (Subtask A score): 33.1343%
5 | POS accuracy (known words): 33.1343%
6 | POS accuracy (unknown words): 0.0000%
7 | Lemmatization accuracy (Subtask B score): 51.3433%
8 | Lemmatization accuracy (known words): 51.3433%
9 | Lemmatization accuracy (unknown words): 0.0000%
10 | Overall accuracy (Subtask C score): 42.2388%
11 | ----
12 | REF-toks 335
13 | KN 100.0000%
14 | KN_POS_SC_LOWER 53.1343%
15 | KN_SC_LOWER 33.1343%
16 | KN_SEG_CHANGE 0.8955%
17 | KN_SL_LOWER 51.3433%
18 | KN_WC_LOWER 34.0299%
19 | POS_SC_LOWER 53.1343%
20 | POS_WC_LOWER 53.1343%
21 | SC_LOWER 33.1343%
22 | SEG_CHANGE 0.8955%
23 | SEG_NOCHANGE 99.1045%
24 | SL_CASE_CAT_HEUR 51.3433%
25 | SL_LOWER 51.3433%
26 | SL_NOCASE_CAT_HEUR 54.3284%
27 | SL_NOCASE_LOWER 54.3284%
28 | UNK 0.0000%
29 | UNK_POS_SC_LOWER 0.0000%
30 | UNK_SC_LOWER 0.0000%
31 | UNK_SEG_CHANGE 0.0000%
32 | UNK_SL_LOWER 0.0000%
33 | UNK_WC_LOWER 0.0000%
34 | WC_LOWER 34.0299%
35 | WL_LOWER 51.3433%
36 | WC_UPPER 34.9254%
37 | AVG weak lemma lower bound 51.3433%
38 | AVG KN strong lemma lower bound 51.3433%
39 | AVG UNK strong lemma lower bound 0.0000%
40 | AVG strong lemma lower bound 51.3433%
41 | AVG strong lemma nocase lower bound 54.3284%
42 | AVG strong lemma case concat heur 51.3433%
43 | AVG strong lemma nocase concat heur 54.3284%
44 | AVG weak corr lower bound 34.0299%
45 | AVG weak corr upper bound 34.9254%
46 | AVG UNK weak corr lower bound 0.0000%
47 | AVG UNK weak corr upper bound 0.0000%
48 | AVG KN weak corr lower bound 34.0299%
49 | AVG KN weak corr upper bound 34.9254%
50 | AVG POS strong corr lower bound 53.1343%
51 | AVG percentage UNK 0.0000%
52 | AVG percentage seg change 0.8955%
53 |
--------------------------------------------------------------------------------
/tests/data/reference/in_raw.txt:
--------------------------------------------------------------------------------
1 | Lubię placki. Ala ma kota.
2 |
3 | Raz dwa trzy.
--------------------------------------------------------------------------------
/tests/data/reference/lemmatisation_test.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/lemmatisation_test.pkl
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2-reanalyzed.spickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2-reanalyzed.spickle
--------------------------------------------------------------------------------
/tests/data/reference/nkjp1m-1.2.spickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/nkjp1m-1.2.spickle
--------------------------------------------------------------------------------
/tests/data/reference/out.conll:
--------------------------------------------------------------------------------
1 | Lubię Lubię 1 adj:pl:nom:m1:pos 0 5
2 | placki placka 1 subst:pl:acc:f 6 12
3 | . . 0 interp 12 13
4 |
5 | Ala Ala 1 subst:sg:nom:f 14 17
6 | ma ma 1 subst:sg:nom:f 18 20
7 | kota kota 1 subst:sg:nom:f 21 25
8 | . . 0 interp 25 26
9 |
10 |
11 | Raz Raz 1 subst:sg:nom:f 0 3
12 | dwa dwa 1 adj:pl:acc:f:pos 4 7
13 | trzy trzy 1 subst:pl:acc:f 8 12
14 | . . 0 interp 12 13
15 |
16 |
17 |
--------------------------------------------------------------------------------
/tests/data/reference/out.conllu:
--------------------------------------------------------------------------------
1 | 1 Lubię Lubię _ adj:pl:nom:m1:pos _ _ _ _ _
2 | 2 placki placka _ subst:pl:acc:f _ _ _ _ _
3 | 3 . . _ interp _ _ _ _ _
4 |
5 | 1 Ala Ala _ subst:sg:nom:f _ _ _ _ _
6 | 2 ma ma _ subst:sg:nom:f _ _ _ _ _
7 | 3 kota kota _ subst:sg:nom:f _ _ _ _ _
8 | 4 . . _ interp _ _ _ _ _
9 |
10 |
11 | 1 Raz Raz _ subst:sg:nom:f _ _ _ _ _
12 | 2 dwa dwa _ adj:pl:acc:f:pos _ _ _ _ _
13 | 3 trzy trzy _ subst:pl:acc:f _ _ _ _ _
14 | 4 . . _ interp _ _ _ _ _
15 |
16 |
17 |
--------------------------------------------------------------------------------
/tests/data/reference/out.jsonl:
--------------------------------------------------------------------------------
1 | [[["Lubię", "Lubię", "adj:pl:nom:m1:pos"], ["placki", "placka", "subst:pl:acc:f"], [".", ".", "interp"]], [["Ala", "Ala", "subst:sg:nom:f"], ["ma", "ma", "subst:sg:nom:f"], ["kota", "kota", "subst:sg:nom:f"], [".", ".", "interp"]]]
2 | [[["Raz", "Raz", "subst:sg:nom:f"], ["dwa", "dwa", "adj:pl:acc:f:pos"], ["trzy", "trzy", "subst:pl:acc:f"], [".", ".", "interp"]]]
3 |
4 |
--------------------------------------------------------------------------------
/tests/data/reference/out.plain:
--------------------------------------------------------------------------------
1 | Lubię newline
2 | Lubię adj:pl:nom:m1:pos disamb
3 | placki space
4 | placka subst:pl:acc:f disamb
5 | . none
6 | . interp disamb
7 |
8 | Ala space
9 | Ala subst:sg:nom:f disamb
10 | ma space
11 | ma subst:sg:nom:f disamb
12 | kota space
13 | kota subst:sg:nom:f disamb
14 | . none
15 | . interp disamb
16 |
17 |
18 | Raz newline
19 | Raz subst:sg:nom:f disamb
20 | dwa space
21 | dwa adj:pl:acc:f:pos disamb
22 | trzy space
23 | trzy subst:pl:acc:f disamb
24 | . none
25 | . interp disamb
26 |
27 |
28 |
--------------------------------------------------------------------------------
/tests/data/reference/out.xces:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | Lubię
9 | Lubięadj:pl:nom:m1:pos
10 |
11 |
12 | placki
13 | plackasubst:pl:acc:f
14 |
15 |
16 |
17 | .
18 | .interp
19 |
20 |
21 |
22 |
23 | Ala
24 | Alasubst:sg:nom:f
25 |
26 |
27 | ma
28 | masubst:sg:nom:f
29 |
30 |
31 | kota
32 | kotasubst:sg:nom:f
33 |
34 |
35 |
36 | .
37 | .interp
38 |
39 |
40 |
41 |
42 |
43 |
44 | Raz
45 | Razsubst:sg:nom:f
46 |
47 |
48 | dwa
49 | dwaadj:pl:acc:f:pos
50 |
51 |
52 | trzy
53 | trzysubst:pl:acc:f
54 |
55 |
56 |
57 | .
58 | .interp
59 |
60 |
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/tests/data/reference/weight_test.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5
--------------------------------------------------------------------------------
/tests/data/reference/weight_test.hdf5.final:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5.final
--------------------------------------------------------------------------------
/tests/data/reference/weight_test.hdf5.new:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kwrobel-nlp/krnnt/ea7fb03340b4722f6f6c83a642d23138bb2f41c0/tests/data/reference/weight_test.hdf5.new
--------------------------------------------------------------------------------
/tests/data/server/in_raw.txt:
--------------------------------------------------------------------------------
1 | Lubię placki. Ala ma kota.
2 |
3 | Raz dwa trzy.
--------------------------------------------------------------------------------
/tests/data/server/in_tokenized.json:
--------------------------------------------------------------------------------
1 | {
2 | "documents": [
3 | {
4 | "text": "Lubię placki. Ala ma kota.",
5 | "sentences": [
6 | {
7 | "tokens": [
8 | {
9 | "form": "Lubię",
10 | "separator": "newline",
11 | "start": 0,
12 | "end": 0
13 | },
14 | {
15 | "form": "placki",
16 | "separator": "space",
17 | "start": 0,
18 | "end": 0
19 | },
20 | {
21 | "form": ".",
22 | "separator": "none",
23 | "start": 0,
24 | "end": 0
25 | }
26 | ]
27 | },
28 | {
29 | "tokens": [
30 | {
31 | "form": "Ala",
32 | "separator": "space",
33 | "start": 0,
34 | "end": 0
35 | },
36 | {
37 | "form": "ma",
38 | "separator": "space",
39 | "start": 0,
40 | "end": 0
41 | },
42 | {
43 | "form": "kota",
44 | "separator": "space",
45 | "start": 0,
46 | "end": 0
47 | },
48 | {
49 | "form": ".",
50 | "separator": "none",
51 | "start": 0,
52 | "end": 0
53 | }
54 | ]
55 | }
56 | ]
57 | },
58 | {
59 | "text": "Raz dwa trzy.",
60 | "sentences": [
61 | {
62 | "tokens": [
63 | {
64 | "form": "Raz",
65 | "separator": "newline",
66 | "start": 0,
67 | "end": 0
68 | },
69 | {
70 | "form": "dwa",
71 | "separator": "space",
72 | "start": 0,
73 | "end": 0
74 | },
75 | {
76 | "form": "trzy",
77 | "separator": "space",
78 | "start": 0,
79 | "end": 0
80 | },
81 | {
82 | "form": ".",
83 | "separator": "none",
84 | "start": 0,
85 | "end": 0
86 | }
87 | ]
88 | }
89 | ]
90 | }
91 | ]
92 | }
--------------------------------------------------------------------------------
/tests/data/server/in_tokenized_compact.json:
--------------------------------------------------------------------------------
1 | [
2 | [
3 | [["Lubię","newline"],["placki","space"],[".","none"]],
4 | [["Ala","space"],["ma","space"],["kota","space"],[".","none"]]
5 | ],
6 | [
7 | [["Raz","newline"],["dwa","space"],["trzy","space"],[".","none"]]
8 | ]
9 | ]
--------------------------------------------------------------------------------
/tests/data/server/out_raw.conll:
--------------------------------------------------------------------------------
1 | Lubię lubić 1 fin:sg:pri:imperf 0 5
2 | placki placek 1 subst:pl:acc:m3 6 12
3 | . . 0 interp 12 13
4 |
5 | Ala Ala 1 subst:sg:nom:f 14 17
6 | ma mieć 1 fin:sg:ter:imperf 18 20
7 | kota kot 1 subst:sg:acc:m2 21 25
8 | . . 0 interp 25 26
9 |
10 |
11 | Raz raz 1 subst:sg:nom:m3 0 3
12 | dwa dwa 1 num:pl:nom:m3:congr 4 7
13 | trzy trzy 1 num:pl:nom:m3:congr 8 12
14 | . . 0 interp 12 13
15 |
16 |
--------------------------------------------------------------------------------
/tests/data/server/out_raw.conllu:
--------------------------------------------------------------------------------
1 | 1 Lubię lubić _ fin:sg:pri:imperf _ _ _ _ _
2 | 2 placki placek _ subst:pl:acc:m3 _ _ _ _ _
3 | 3 . . _ interp _ _ _ _ _
4 |
5 | 1 Ala Ala _ subst:sg:nom:f _ _ _ _ _
6 | 2 ma mieć _ fin:sg:ter:imperf _ _ _ _ _
7 | 3 kota kot _ subst:sg:acc:m2 _ _ _ _ _
8 | 4 . . _ interp _ _ _ _ _
9 |
10 |
11 | 1 Raz raz _ subst:sg:nom:m3 _ _ _ _ _
12 | 2 dwa dwa _ num:pl:nom:m3:congr _ _ _ _ _
13 | 3 trzy trzy _ num:pl:nom:m3:congr _ _ _ _ _
14 | 4 . . _ interp _ _ _ _ _
15 |
16 |
--------------------------------------------------------------------------------
/tests/data/server/out_raw.jsonl:
--------------------------------------------------------------------------------
1 | [[["Lubię", "lubić", "fin:sg:pri:imperf"], ["placki", "placek", "subst:pl:acc:m3"], [".", ".", "interp"]], [["Ala", "Ala", "subst:sg:nom:f"], ["ma", "mieć", "fin:sg:ter:imperf"], ["kota", "kot", "subst:sg:acc:m2"], [".", ".", "interp"]]]
2 | [[["Raz", "raz", "subst:sg:nom:m3"], ["dwa", "dwa", "num:pl:nom:m3:congr"], ["trzy", "trzy", "num:pl:nom:m3:congr"], [".", ".", "interp"]]]
3 |
--------------------------------------------------------------------------------
/tests/data/server/out_raw.plain:
--------------------------------------------------------------------------------
1 | Lubię newline
2 | lubić fin:sg:pri:imperf disamb
3 | placki space
4 | placek subst:pl:acc:m3 disamb
5 | . none
6 | . interp disamb
7 |
8 | Ala space
9 | Ala subst:sg:nom:f disamb
10 | ma space
11 | mieć fin:sg:ter:imperf disamb
12 | kota space
13 | kot subst:sg:acc:m2 disamb
14 | . none
15 | . interp disamb
16 |
17 | Raz newline
18 | raz subst:sg:nom:m3 disamb
19 | dwa space
20 | dwa num:pl:nom:m3:congr disamb
21 | trzy space
22 | trzy num:pl:nom:m3:congr disamb
23 | . none
24 | . interp disamb
25 |
26 |
--------------------------------------------------------------------------------
/tests/data/server/out_raw.xces:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | Lubię
9 | lubićfin:sg:pri:imperf
10 |
11 |
12 | placki
13 | placeksubst:pl:acc:m3
14 |
15 |
16 |
17 | .
18 | .interp
19 |
20 |
21 |
22 |
23 | Ala
24 | Alasubst:sg:nom:f
25 |
26 |
27 | ma
28 | miećfin:sg:ter:imperf
29 |
30 |
31 | kota
32 | kotsubst:sg:acc:m2
33 |
34 |
35 |
36 | .
37 | .interp
38 |
39 |
40 |
41 |
42 |
43 |
44 | Raz
45 | razsubst:sg:nom:m3
46 |
47 |
48 | dwa
49 | dwanum:pl:nom:m3:congr
50 |
51 |
52 | trzy
53 | trzynum:pl:nom:m3:congr
54 |
55 |
56 |
57 | .
58 | .interp
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/tests/data/small/00132482.ann.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Gdzie
8 | gdzieadv
9 |
10 |
11 | kupiła
12 | kupićpraet:sg:f:perf
13 |
14 |
15 |
16 | ś
17 | byćaglt:sg:sec:imperf:nwok
18 |
19 |
20 | łańcuszek
21 | łańcuszeksubst:sg:acc:m3
22 |
23 |
24 |
25 | ?
26 | ?interp
27 |
28 |
29 |
30 |
31 | :
32 | :)interj
33 |
34 |
35 |
36 | )
37 | )blank
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/tests/data/small/gold-task-c.txt:
--------------------------------------------------------------------------------
1 | Moje niefortunne pudła przygnębiły mnie do reszty. Zawsze miałem pretensje, że jestem dobrym myśliwym, a od dzieciństwa nie rozstawałem się ze strzelbą, a tu wśród obcych zblamowałem się jak nigdy w życiu. Jakże inaczej strzelałem cietrzewie i pardwy z moich "hollandów", które pozostawiłem na wieczną zgubę w Petersburgu. Poczciwy Staś Sierakowski pośpieszył mi z pomocą, by wyjaśnić moje niepowodzenia. - Pokaż mi strzelbę - poprosił, a gdy podałem mu mojego mauzera, spytał ze śmiechem: - Gdzieś to świństwo wykopał? - Ano w Gdańsku - odrzekłem zawstydzony. - Chyba byłeś ślepy, kupując taką szkaradę. Z czego strzelałeś przed wojną? - Miałem hollandy - odrzekłem. - Jedyna rada - rzekł w końcu Staś po oględzinach mojej broni. - Każ sobie skrócić szyję na dobrych kilka centymetrów, albo jeszcze lepiej rzuć to świństwo do pieca, a co się nie spali - na śmietnik.
2 | Przestrzeń dzielącą je od kolejnego skłonu schodów pokonało, kolebiąc się na boki, rozkołysanym kaczym chodem. Najdziwniejsze jednak było to, co nastąpiło potem. Jego wspinanie się na stopień. Mianowicie najpierw przed nim stanęło, niemal doń przywarło. Samo zresztą było niewiele od niego wyższe. A potem z olbrzymim wysiłkiem zaczęło się nań wspinać, a kiedy betonowa krawędź była już w połowie jego wysokości, ostrożnie się pochylając powoli przeważyło ciężar ciała na poziomą płaszczyznę stopnia. Jakby nie mogło się zginać, jakby kręgosłup miało całkiem zesztywniały. W końcu udało się. Z lekkim stukotem opadło na brzuch. Leżąc tak, wydało z siebie właśnie to jedyne w swoim rodzaju cichutkie jęknięcie. Osiągnąwszy tę fazę wspinaczki, przeszło po chwili do następnego etapu. Ciągle leżąc, zaczęło się czołgać dalej, aż do chwili kiedy środek ciężkości, w ogóle całe ciało, całkowicie znalazło się na stopniu.
--------------------------------------------------------------------------------
/tests/download_model.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | cd ..
3 | mkdir -p model_data
4 | cd model_data
5 |
6 | if [ ! -f "weights.hdf5" ]; then
7 | wget "https://github.com/kwrobel-nlp/krnnt/releases/download/poleval/reanalyze_150epochs_train1.0.zip"
8 | unzip reanalyze_150epochs_train1.0.zip
9 | mv lemmatisation_reana150_1.0.pkl lemmatisation.pkl
10 | mv weights_reana150_1.0.hdf5 weights.hdf5
11 | fi
--------------------------------------------------------------------------------
/tests/test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #TODO pytest-shell
4 |
5 | # The version of the Morfeusz dictionary may influence results.
6 |
7 | MACA_CONFIG=morfeusz2-nkjp
8 |
9 | cd ..
10 |
11 | python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle
12 | echo $?
13 | diff /tmp/nkjp.spickle tests/data/reference/nkjp1m-1.2.spickle
14 |
15 | python3 reanalyze.py --maca_config $MACA_CONFIG /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle
16 | echo $?
17 | diff /tmp/nkjp-reanalyzed.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.spickle
18 |
19 | python3 shuffle.py /tmp/nkjp-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle
20 | echo $?
21 | diff /tmp/nkjp-reanalyzed.shuf.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle
22 |
23 | rm /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues
24 | CUDA_VISIBLE_DEVICES="" PYTHONHASHSEED=0 python3 krnnt_train.py --maca_config $MACA_CONFIG /tmp/nkjp-reanalyzed.shuf.spickle -e 2 --reproducible --hash test
25 | echo $?
26 | h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5
27 | h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final
28 | diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl
29 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2
30 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData
31 | diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues
32 |
33 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces
34 | echo $?
35 | diff /tmp/out.xces tests/data/reference/out.xces
36 |
37 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o plain > /tmp/out.plain
38 | echo $?
39 | diff /tmp/out.plain tests/data/reference/out.plain
40 |
41 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conll > /tmp/out.conll
42 | echo $?
43 | diff /tmp/out.conll tests/data/reference/out.conll
44 |
45 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conllu > /tmp/out.conllu
46 | echo $?
47 | diff /tmp/out.conllu tests/data/reference/out.conllu
48 |
49 | echo "Lubię placki." | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o jsonl > /tmp/out.jsonl
50 | echo $?
51 | diff /tmp/out.jsonl tests/data/reference/out.jsonl
52 |
--------------------------------------------------------------------------------
/tests/test_aglt.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | from krnnt.aglt import rewrite_praet, remove_aglt, rule1, rule3, rule1b
4 |
5 | paragraph = [
6 | [
7 | {'token': 'Zrobił', 'sep': 'newline', 'tag': 'praet:sg:m1:perf',
8 | 'lemmas': ['zrobić'], 'start': 0, 'end': 6},
9 | {'token': 'by', 'sep': 'none', 'tag': 'qub', 'lemmas': ['by'],
10 | 'start': 6, 'end': 8},
11 | {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok',
12 | 'lemmas': ['być'], 'start': 8, 'end': 9},
13 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
14 | 'lemmas': ['to'], 'start': 10, 'end': 12},
15 | {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'],
16 | 'start': 12, 'end': 13}
17 | ],
18 | [
19 | {'token': 'Czy', 'sep': 'space', 'tag': 'qub', 'lemmas': ['czy'],
20 | 'start': 14, 'end': 17},
21 | {'token': 'by', 'sep': 'space', 'tag': 'qub', 'lemmas': ['by'],
22 | 'start': 18, 'end': 20},
23 | {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok',
24 | 'lemmas': ['być'], 'start': 20, 'end': 21},
25 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
26 | 'lemmas': ['to'], 'start': 22, 'end': 24},
27 | {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
28 | 'lemmas': ['zrobić'], 'start': 25, 'end': 31},
29 | {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'],
30 | 'start': 31, 'end': 32}
31 | ],
32 | [
33 | {'token': 'Zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
34 | 'lemmas': ['zrobić'], 'start': 33, 'end': 39},
35 | {'token': 'em', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:wok',
36 | 'lemmas': ['być'], 'start': 39, 'end': 41},
37 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
38 | 'lemmas': ['to'], 'start': 42, 'end': 44},
39 | {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'],
40 | 'start': 44, 'end': 45}
41 | ],
42 | [
43 | {'token': 'Aby', 'sep': 'space', 'tag': 'comp', 'lemmas': ['aby'],
44 | 'start': 46, 'end': 49},
45 | {'token': 'm', 'sep': 'none', 'tag': 'aglt:sg:pri:imperf:nwok',
46 | 'lemmas': ['być'], 'start': 49, 'end': 50},
47 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
48 | 'lemmas': ['to'], 'start': 51, 'end': 53},
49 | {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
50 | 'lemmas': ['zrobić'], 'start': 54, 'end': 60},
51 | {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'],
52 | 'start': 60, 'end': 61}
53 | ],
54 | [
55 | {'token': 'Zrobił', 'sep': 'newline', 'tag': 'praet:sg:m1:perf',
56 | 'lemmas': ['zrobić'], 'start': 0, 'end': 6},
57 | {'token': 'by', 'sep': 'none', 'tag': 'qub', 'lemmas': ['by'],
58 | 'start': 6, 'end': 8},
59 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
60 | 'lemmas': ['to'], 'start': 9, 'end': 11},
61 | {'token': '.', 'sep': 'none', 'prob': 1.0, 'tag': 'interp', 'lemmas': ['.'],
62 | 'start': 11, 'end': 12}
63 | ],
64 | [
65 | {'token': 'Czy', 'sep': 'space', 'tag': 'qub', 'lemmas': ['czy'],
66 | 'start': 14, 'end': 17},
67 | {'token': 'by', 'sep': 'space', 'tag': 'qub', 'lemmas': ['by'],
68 | 'start': 18, 'end': 20},
69 | {'token': 'to', 'sep': 'space', 'tag': 'subst:sg:acc:n',
70 | 'lemmas': ['to'], 'start': 21, 'end': 23},
71 | {'token': 'zrobił', 'sep': 'space', 'tag': 'praet:sg:m1:perf',
72 | 'lemmas': ['zrobić'], 'start': 24, 'end': 30},
73 | {'token': '?', 'sep': 'none', 'tag': 'interp', 'lemmas': ['?'],
74 | 'start': 30, 'end': 31}
75 | ]
76 | ]
77 |
78 |
79 | def test_rewrite_praet():
80 | sentence1 = copy.deepcopy(paragraph[2])
81 |
82 | rewrite_praet(sentence1[1], sentence1[0])
83 | assert sentence1[0]['tag'] == 'praet:sg:m1:pri:perf'
84 |
85 |
86 | def test_rewrite_cond():
87 | sentence1 = copy.deepcopy(paragraph[0])
88 | rewrite_praet(sentence1[2], sentence1[0], sentence1[1])
89 | assert sentence1[0]['tag'] == 'cond:sg:m1:pri:perf'
90 |
91 | def test_rewrite_cond2():
92 | sentence1 = copy.deepcopy(paragraph[4])
93 | rewrite_praet(None, sentence1[0], sentence1[1])
94 | assert sentence1[0]['tag'] == 'cond:sg:m1:ter:perf'
95 |
96 | def test_rule1_cond():
97 | sentence1 = copy.deepcopy(paragraph[0])
98 |
99 | remove_aglt(sentence1, [rule1])
100 | print(sentence1)
101 | assert sentence1[0]['tag'] == 'cond:sg:m1:pri:perf'
102 | assert sentence1[1]['token'] != 'by'
103 | assert sentence1[2]['token'] != 'm'
104 | assert sentence1[0]['token'] == 'Zrobiłbym'
105 | assert sentence1[0]['end'] == 9
106 |
107 |
108 | def test_rule1_praet():
109 | sentence1 = copy.deepcopy(paragraph[2])
110 |
111 | remove_aglt(sentence1, [rule1])
112 | print(sentence1)
113 | assert sentence1[0]['tag'] == 'praet:sg:m1:pri:perf'
114 | assert sentence1[1]['token'] != 'm'
115 | assert sentence1[0]['token'] == 'Zrobiłem'
116 | assert sentence1[0]['end'] == 41
117 |
118 | def test_rule3_1():
119 | sentence1 = copy.deepcopy(paragraph[1])
120 |
121 | print(sentence1)
122 | remove_aglt(sentence1, [rule1, rule3])
123 | print(sentence1)
124 | assert sentence1[3]['tag'] == 'cond:sg:m1:pri:perf'
125 | assert sentence1[1]['token'] == 'bym'
126 | assert sentence1[1]['end'] == 21
127 |
128 | def test_rule3_2():
129 | sentence1 = copy.deepcopy(paragraph[3])
130 |
131 | remove_aglt(sentence1, [rule1, rule3])
132 | print(sentence1)
133 | assert sentence1[2]['tag'] == 'praet:sg:m1:pri:perf'
134 | assert sentence1[0]['token'] == 'Abym'
135 | assert sentence1[0]['end'] == 50
136 |
137 | def test_rule3_3():
138 | sentence1 = copy.deepcopy(paragraph[4])
139 |
140 | remove_aglt(sentence1, [rule1b, rule3])
141 | print(sentence1)
142 | assert sentence1[0]['tag'] == 'cond:sg:m1:ter:perf'
143 | assert sentence1[0]['token'] == 'Zrobiłby'
144 | assert sentence1[0]['end'] == 8
145 | assert sentence1[1]['token'] != 'by'
146 |
147 | def test_rule3_4():
148 | sentence1 = copy.deepcopy(paragraph[5])
149 |
150 | remove_aglt(sentence1, [rule1b, rule3])
151 | print(sentence1)
152 | assert sentence1[3]['tag'] == 'cond:sg:m1:ter:perf'
153 | assert sentence1[3]['token'] == 'zrobił'
154 |
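155 | # Summary of the rules exercised above: rule1/rule1b glue aglt (and 'by') tokens
156 | # back onto the preceding praet form, rewriting praet to cond where 'by' is
157 | # involved, while rule3 attaches a floating 'by'/'m' to a preceding qub or comp.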
--------------------------------------------------------------------------------
/tests/test_analyzers.py:
--------------------------------------------------------------------------------
1 | from krnnt.analyzers import MacaAnalyzer
2 | from krnnt.structure import Form
3 |
4 | reference_maca_output = \
5 | '''Lubię newline
6 | lubić fin:sg:pri:imperf
7 | pociągi space
8 | pociąg subst:pl:nom:m3
9 | pociąg subst:pl:acc:m3
10 | pociąg subst:pl:voc:m3
11 | . none
12 | . interp'''
13 |
14 | paragraph_raw = 'Lubię pociągi.'
15 |
16 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
17 | MACA_CONFIG2 = 'morfeusz2-nkjp'
18 |
19 | def test_maca():
20 | try:
21 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
22 | results = maca_analyzer._maca(paragraph_raw)
23 | results = list(results)
24 |     except Exception:
25 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
26 | results = maca_analyzer._maca(paragraph_raw)
27 | results = list(results)
28 |
29 | assert len(results) == 1
30 | assert results[0] == reference_maca_output
31 |
32 | def test_maca_process():
33 | try:
34 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
35 | results = maca_analyzer._maca_process(paragraph_raw)
36 | results = list(results)
37 |     except Exception:
38 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
39 | results = maca_analyzer._maca_process(paragraph_raw)
40 | results = list(results)
41 |
42 | assert len(results) == 1
43 | assert results[0] == reference_maca_output
44 |
45 | def test_maca_wrapper():
46 | try:
47 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
48 | results = maca_analyzer._maca_wrapper(paragraph_raw)
49 | results = list(results)
50 |     except Exception:
51 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
52 | results = maca_analyzer._maca_wrapper(paragraph_raw)
53 | results = list(results)
54 |
55 | assert len(results) == 1
56 | assert results[0] == reference_maca_output
57 |
58 | def test_parse():
59 | maca_analyzer = MacaAnalyzer('')
60 | maca_analyzer.text = paragraph_raw
61 | maca_analyzer.last_offset = 0
62 | result = maca_analyzer._parse(reference_maca_output)
63 |
64 | reference = [
65 | ('Lubię', 'newline',
66 | [('lubić', 'fin:sg:pri:imperf')],0,5),
67 | ('pociągi', 'space',
68 | [('pociąg', 'subst:pl:nom:m3'),
69 | ('pociąg', 'subst:pl:acc:m3'),
70 | ('pociąg', 'subst:pl:voc:m3')],6,13),
71 | ('.', 'none',
72 | [('.', 'interp')], 13,14)]
73 |
74 | assert result == reference
75 |
76 | def test_maca_analyzer():
77 | try:
78 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
79 | result = maca_analyzer.analyze(paragraph_raw)
80 |     except Exception:
81 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
82 | result = maca_analyzer.analyze(paragraph_raw)
83 |
84 |     assert len(result.sentences) == 1
85 | assert len(result.sentences[0].tokens) == 3
86 |
87 | assert result.sentences[0].tokens[0].form == 'Lubię'
88 | assert result.sentences[0].tokens[0].space_before == 'newline'
89 | assert len(result.sentences[0].tokens[0].interpretations) == 1
90 |
91 | assert result.sentences[0].tokens[1].form == 'pociągi'
92 | assert result.sentences[0].tokens[1].space_before == 'space'
93 | assert len(result.sentences[0].tokens[1].interpretations) == 3
94 |
95 | assert result.sentences[0].tokens[2].form == '.'
96 | assert result.sentences[0].tokens[2].space_before == 'none'
97 | assert len(result.sentences[0].tokens[2].interpretations) == 1
98 |
99 | assert result.sentences[0].tokens[1].interpretations[0] == Form('pociąg', 'subst:pl:nom:m3')
100 | assert result.sentences[0].tokens[1].interpretations[1] == Form('pociąg', 'subst:pl:acc:m3')
101 | assert result.sentences[0].tokens[1].interpretations[2] == Form('pociąg', 'subst:pl:voc:m3')
102 |
103 |
104 | def test_maca_analyzer_lemmas():
105 | paragraph_raw='Ala ma kota.'
106 | try:
107 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
108 | result = maca_analyzer.analyze(paragraph_raw)
109 |     except Exception:
110 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
111 | result = maca_analyzer.analyze(paragraph_raw)
112 |
113 |     lemmas = [form.lemma for form in result.sentences[0].tokens[2].interpretations]
114 | assert 'kot' in lemmas
115 | assert 'kot:s1' not in lemmas
116 | assert 'kot:s2' not in lemmas
117 |
118 |
--------------------------------------------------------------------------------
/tests/test_blank.py:
--------------------------------------------------------------------------------
1 | import copy
2 |
3 | from krnnt.aglt import rewrite_praet, remove_aglt, rule1, rule3, rule1b
4 | from krnnt.blanks import remove_blanks
5 |
6 | sentence = [
7 |
8 | {'token': '200', 'sep': 'newline', 'tag': 'num:pl:nom:m2:rec',
9 | 'lemmas': ['200'], 'start': 0, 'end': 3},
10 | {'token': '.', 'sep': 'none', 'tag': 'blank', 'lemmas': ['.'],
11 | 'start': 3, 'end': 4},
12 | {'token': '000', 'sep': 'none', 'tag': 'blank',
13 | 'lemmas': ['000'], 'start': 4, 'end': 7},
14 | {'token': 'zł', 'sep': 'space', 'tag': 'brev:npun',
15 | 'lemmas': ['złoty'], 'start': 8, 'end': 10}
16 | ]
17 |
18 |
19 | def test_remove_blanks():
20 | sentence1 = copy.deepcopy(sentence)
21 | remove_blanks(sentence1)
22 | print(sentence1)
23 |
24 |     assert len(sentence1) == 2
25 |
26 |
27 | assert sentence1[0]['tag'] == 'num:pl:nom:m2:rec'
28 | assert sentence1[0]['token'] == '200.000'
29 | assert sentence1[0]['start'] == 0
30 | assert sentence1[0]['end'] == 7
31 |
32 | assert sentence1[1] == sentence[-1]
--------------------------------------------------------------------------------
/tests/test_features.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from krnnt.features import FeaturePreprocessor, TagsPreprocessorCython, TagsPreprocessor, create_token_features
4 |
5 |
6 | @pytest.fixture
7 | def token():
8 | return 'asd'
9 |
10 |
11 | def test_nic(token):
12 | assert ["NIC"] == FeaturePreprocessor.nic(token)
13 |
14 |
15 | def test_interps():
16 | assert ["."] == FeaturePreprocessor.interps('.', {'tags': ['interp']})
17 | assert [] == FeaturePreprocessor.interps('.', {'tags': ['subst']})
18 | assert [] == FeaturePreprocessor.interps(':)', {'tags': ['interp']})
19 |
20 |
21 | def test_prefix1():
22 | assert ["P0k"] == FeaturePreprocessor.prefix1('kot')
23 | assert ["P0??"] == FeaturePreprocessor.prefix1('©kot')
24 | assert ["P0k"] == FeaturePreprocessor.prefix1('KOT')
25 |
26 |
27 | def test_prefix2():
28 | assert ["P1o"] == FeaturePreprocessor.prefix2('kot')
29 | assert ["P1xx"] == FeaturePreprocessor.prefix2('k')
30 |
31 |
32 | def test_prefix3():
33 | assert ["P2t"] == FeaturePreprocessor.prefix3('kot')
34 |
35 |
36 | def test_suffix1():
37 | assert ["S1t"] == FeaturePreprocessor.suffix1('kot')
38 | assert ["S1??"] == FeaturePreprocessor.suffix1('kot©')
39 |
40 |
41 | def test_suffix2():
42 | assert ["S2o"] == FeaturePreprocessor.suffix2('kot')
43 | assert ["S2xx"] == FeaturePreprocessor.suffix2('k')
44 |
45 |
46 | def test_suffix3():
47 | assert ["S3k"] == FeaturePreprocessor.suffix3('kot')
48 |
49 |
50 | def test_qubliki():
51 | assert [] == FeaturePreprocessor.qubliki('kot')
52 | assert ['ale'] == FeaturePreprocessor.qubliki('ale')
53 | assert ['ale'] == FeaturePreprocessor.qubliki('Ale')
54 |
55 |
56 | @pytest.mark.parametrize('token, expected', [('wrobel', 'l'),
57 | ('Wrobel', 'ul'),
58 | ('WROBEL', 'u'),
59 | ('2019', 'd'),
60 | ('Wrobel2019', 'uld'),
61 | ('Wrobel2019:)', 'uldx')])
62 | def test_shape(token, expected):
63 | features = FeaturePreprocessor.shape(token)
64 | assert features[0] == expected
65 | assert len(features) == 1
66 |
67 |
68 | @pytest.mark.parametrize('tags, expected', [
69 | (['fin:sg:ter:imperf', 'subst:sg:nom:f'], ['1fin:ter', '2fin:sg:imperf', '1subst:nom',
70 | '2subst:sg:f']),
71 | (['adjp:dat'], ['1adjp:dat', '2adjp']),
72 | (['interp'], ['1interp', '2interp']),
73 | ([''], ['1', '2']),
74 | ([], [])])
75 | def test_tags4(tags, expected):
76 | assert TagsPreprocessor.create_tags4_without_guesser(tags) == expected
77 | assert TagsPreprocessorCython.create_tags4_without_guesser(tags) == expected
78 |
79 |
80 | @pytest.mark.parametrize('tags, expected', [
81 | (['fin:sg:ter:imperf', 'subst:sg:nom:f'], ['sg', 'sg:nom:f', 'nom']),
82 | (['adjp:dat'], ['dat']),
83 | (['interp'], []),
84 | ([''], []),
85 | ([], [])])
86 | def test_tags5(tags, expected):
87 | assert TagsPreprocessor.create_tags5_without_guesser(tags) == expected
88 | assert TagsPreprocessorCython.create_tags5_without_guesser(tags) == expected
89 |
90 |
91 | def test_create_token_features(benchmark):
92 | token = 'obejmie'
93 | tags = ['subst:sg:loc:m3', 'subst:sg:voc:m3', 'subst:sg:dat:f', 'subst:sg:loc:f',
94 | 'fin:sg:ter:perf']
95 | space_before = ['space_before']
96 | features=['l', 'P0o', 'P1b', 'P2e', 'S1e', 'S2i', 'S3m', '1subst:loc', '2subst:sg:m3',
97 | '1subst:voc', '1subst:dat', '2subst:sg:f', '1fin:ter', '2fin:sg:perf', 'sg:loc:m3', 'loc',
98 | 'sg:voc:m3', 'voc', 'sg:dat:f', 'dat', 'sg:loc:f', 'sg', 'space_before']
99 |
100 | result_features = create_token_features(token, tags, space_before)
101 | assert result_features == features
102 |
--------------------------------------------------------------------------------
/tests/test_morfeusz.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from krnnt.analyzers import MacaAnalyzer
4 | from krnnt.new import get_morfeusz, analyze_tokenized, analyze_token
5 | from krnnt.structure import Form
6 |
7 | reference_maca_output = \
8 | '''Lubię newline
9 | lubić fin:sg:pri:imperf
10 | pociągi space
11 | pociąg subst:pl:nom:m3
12 | pociąg subst:pl:acc:m3
13 | pociąg subst:pl:voc:m3
14 | . none
15 | . interp'''
16 |
17 | paragraph_raw = 'Lubię pociągi.'
18 |
19 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
20 | MACA_CONFIG2 = 'morfeusz2-nkjp'
21 |
22 | def test_maca():
23 | try:
24 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
25 | results = maca_analyzer._maca(paragraph_raw)
26 | results = list(results)
27 |     except Exception:
28 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
29 | results = maca_analyzer._maca(paragraph_raw)
30 | results = list(results)
31 |
32 | assert len(results) == 1
33 | assert results[0] == reference_maca_output
34 |
35 |
36 | def test_maca_analyzer(rootdir):
37 | try:
38 | maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
39 | result = maca_analyzer.analyze(paragraph_raw)
40 |     except Exception:
41 | maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
42 | result = maca_analyzer.analyze(paragraph_raw)
43 |
44 | lines = []
45 | for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')):
46 | line = line.strip()
47 | if not line: continue
48 | lines.append(line)
49 |
50 | morfeusz = get_morfeusz()
51 |
52 |
53 |
54 | for line in lines:
55 | paragraph = maca_analyzer.analyze(line)
56 | for sentence in paragraph:
57 | for token in sentence:
58 |
59 |                 maca_tags = [(form.lemma, form.tags) for form in token.interpretations]
60 |                 morfeusz_tags = analyze_token(morfeusz, token.form)
61 |                 maca_tags = set(maca_tags)
62 |                 morfeusz_tags = set(morfeusz_tags)
63 |                 if maca_tags != morfeusz_tags:
64 | print(token)
65 | print(sorted(maca_tags-morfeusz_tags))
66 | print(sorted(morfeusz_tags-maca_tags))
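67 | # Note: test_maca_analyzer only prints Maca/Morfeusz disagreements for manual
68 | # inspection; it makes no assertions, so differing analyses never fail the test.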
--------------------------------------------------------------------------------
/tests/test_parallel_api_speed.py:
--------------------------------------------------------------------------------
1 | import concurrent.futures
2 | import os
3 |
4 | import pytest
5 | import requests
6 |
7 |
8 | def test_api(rootdir):
9 | url = 'http://localhost:9003'
10 |
11 | for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')):
12 |         line = line.strip()
13 | if not line: continue
14 |
15 |         tag(url, line)
16 |
17 | def tag(url, data):
18 | payload = data.encode('utf-8')
19 | r = requests.post(url, data=payload)
20 | return r
21 |
22 | def chunk(l, batch_size):
23 | batch = []
24 | for element in l:
25 | batch.append(element)
26 | if len(batch) == batch_size:
27 | yield batch
28 | batch = []
29 | if batch:
30 | yield batch
31 |
32 | @pytest.mark.slow
33 | @pytest.mark.parametrize('chunk_size', [100000, 10000, 1000, 100, 10, 4, 2, 1])
34 | def test_parallel_api(rootdir, chunk_size):
35 | print(rootdir, chunk_size)
36 |
37 |     lines = []
38 | for line in open(os.path.join(rootdir, 'data/full/test-raw.txt')):
39 | line = line.strip()
40 | if not line: continue
41 | lines.append(line)
42 |
43 | batches = list(chunk(lines, chunk_size))
44 | print(len(batches))
45 |
46 | with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
47 | future_to_url = {executor.submit(tag, 'http://localhost:9003', "\n\n".join(batch)): "\n\n".join(batch) for batch in batches}
48 | for future in concurrent.futures.as_completed(future_to_url):
49 |             r = future.result()
50 | # print(r.text)
51 |
52 | @pytest.mark.slow
53 | @pytest.mark.parametrize('chunk_size', [100000,10,1])
54 | def test_parallel_api_maca(rootdir, chunk_size):
55 |     lines = []
56 | for line in open(os.path.join(rootdir, 'data/full/train-raw.txt')):
57 | line = line.strip()
58 | if not line: continue
59 | lines.append(line)
60 |
61 | batches = list(chunk(lines, chunk_size))
62 |
63 | with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
64 | future_to_url = {executor.submit(tag, 'http://localhost:9003/maca/', "\n\n".join(batch)): "\n\n".join(batch) for batch in batches}
65 | for future in concurrent.futures.as_completed(future_to_url):
66 |             r = future.result()
67 |
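68 | # chunk() groups an iterable into fixed-size batches, e.g.
69 | # list(chunk([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]].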
--------------------------------------------------------------------------------
/tests/test_process_xces.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from krnnt.aligner import align_paragraphs
4 | from krnnt.analyzers import MacaAnalyzer
5 | from krnnt.readers import read_xces
6 |
7 | #TODO parametrize?
8 |
9 |
10 |
11 | def test_different_xces_formats(rootdir):
12 | data = {
13 | os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7],
14 | os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6],
15 | os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12],
16 | os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25],
17 | os.path.join(rootdir, 'data/small/00130846.xml'): [25],
18 | os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2],
19 | os.path.join(rootdir, 'data/small/00132482.xml'): [2]
20 | }
21 |
22 |     for path, paragraph_lengths in data.items():
23 |         assert paragraph_lengths == [len(paragraph.sentences) for paragraph in read_xces(path)]
24 | for paragraph in read_xces(path):
25 | print(paragraph.text())
26 |
27 | for sentence in paragraph:
28 | for token in sentence:
29 | print(token)
30 | print()
31 |
32 | def test_reanalyze(rootdir):
33 | data = {
34 | os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7],
35 | os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6],
36 | os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12],
37 | os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25],
38 | os.path.join(rootdir, 'data/small/00130846.xml'): [25],
39 | os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2],
40 | os.path.join(rootdir, 'data/small/00132482.xml'): [2]
41 | }
42 |
43 |     for path, paragraph_lengths in data.items():
44 |         # assert paragraph_lengths == [len(paragraph.sentences) for paragraph in read_xces(path)]
45 | maca_analyzer = MacaAnalyzer('morfeusz2-nkjp')
46 | for paragraph in read_xces(path):
47 | paragraph_raw = paragraph.text()
48 |
49 | paragraph_reanalyzed = maca_analyzer.analyze(paragraph_raw)
50 |
51 | print('Number of sentences by Maca vs gold', len(paragraph_reanalyzed.sentences), len(paragraph.sentences))
52 |
53 | paragraph_reanalyzed = align_paragraphs(paragraph_reanalyzed, paragraph)
54 | for sentence in paragraph_reanalyzed:
55 | for token in sentence:
56 | print(token)
57 | print()
58 |
--------------------------------------------------------------------------------
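The reanalysis round-trip exercised by test_reanalyze, condensed to its essentials; the XCES path and the morfeusz2-nkjp Maca config are the same assumptions the test makes:

```python
# Gold paragraphs are rendered back to raw text, re-tokenised by Maca,
# and aligned token-by-token with the gold segmentation.
from krnnt.aligner import align_paragraphs
from krnnt.analyzers import MacaAnalyzer
from krnnt.readers import read_xces

maca_analyzer = MacaAnalyzer('morfeusz2-nkjp')
for paragraph in read_xces('tests/data/small/train-gold.xml'):
    reanalyzed = maca_analyzer.analyze(paragraph.text())
    aligned = align_paragraphs(reanalyzed, paragraph)
    print(len(aligned.sentences), len(paragraph.sentences))
```
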
/tests/test_speed.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | MACA_CONFIG=morfeusz2-nkjp
4 |
5 |
6 | time cat tests/data/full/test-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces
7 | #12s
8 |
9 | time cat tests/data/full/train-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces
10 | #7m16s
11 |
12 | #one thread
13 | time cat tests/data/full/test-raw.txt | CUDA_VISIBLE_DEVICES="" python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces
14 | #22s
15 |
16 | #GPU 1050GTX
17 | #train
18 | #5m12s
19 |
20 | #time maca-analyse -c morfeusz2-nkjp < tests/data/full/train-raw.txt > /dev/null
21 | #35s
22 |
23 | #time maca-analyse -c morfeusz2-nkjp < tests/data/full/test-raw.txt > /dev/null
24 | #0.9s
25 |
26 | #maca per line test-raw.txt
27 | #45s
28 |
29 | # it parallelizes anyway
30 | # rows below: NwMt presumably = N server workers, M threads; pool = client pool size
31 | # test-raw.txt API 1w1t GPU 44s
32 | # test-raw.txt API 1w2t GPU 44s
33 | # test-raw.txt API 2w1t GPU 44s
34 |
35 | # test-raw.txt API 1w1t CPU 43s
36 | # test-raw.txt API 1w2t CPU 43s
37 | # test-raw.txt API 2w1t CPU 42s
38 |
39 | # pool=2 test-raw.txt API 1w1t CPU 29s
40 | # pool=2 test-raw.txt API 1w2t CPU 28s
41 | # pool=2 test-raw.txt API 2w1t CPU 25s
42 |
43 | # pool=2 test-raw.txt API 1w1t GPU 21s
44 | # pool=2 test-raw.txt API 1w2t GPU 30s
45 | # pool=2 test-raw.txt API 2w1t GPU 23s
46 |
47 | # pool=10 test-raw.txt API 1w1t CPU 20s
48 | # pool=10 test-raw.txt API 1w2t CPU 20s
49 | # pool=10 test-raw.txt API 2w1t CPU 17s
50 | # pool=10 test-raw.txt API 4w1t CPU 15s
51 | # pool=10 test-raw.txt API 4w2t CPU 16s
52 | # pool=10 test-raw.txt API 8w1t CPU 16s
53 | # pool=100 test-raw.txt API 10w1t CPU 14s
54 | # pool=100 test-raw.txt API 20w1t CPU 14s
55 |
56 | # pool=10 test-raw.txt API 1w1t GPU 21s
57 | # pool=10 test-raw.txt API 1w2t GPU 21s
58 | # pool=10 test-raw.txt API 2w1t GPU 14s
59 | # pool=10 test-raw.txt API 4w1t GPU OOM
--------------------------------------------------------------------------------
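The pool=N rows above appear to vary the client-side request pool; a rough Python harness for reproducing one such measurement (server on localhost:9003 assumed, as in test_api.py; time_api is a name introduced here):

```python
import concurrent.futures
import time

import requests

def time_api(lines, pool_size, url='http://localhost:9003'):
    # POST every line concurrently and return the wall-clock time taken.
    start = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
        futures = [executor.submit(requests.post, url, data=line.encode('utf-8'))
                   for line in lines]
        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raise any request error
    return time.time() - start

lines = [l.strip() for l in open('tests/data/full/test-raw.txt') if l.strip()]
print(time_api(lines, pool_size=10))
```
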
/tests/test_structure.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from krnnt.readers import read_xces
4 |
5 |
6 | def test_paragraph_text(rootdir):
7 | data = {
8 | os.path.join(rootdir, 'data/small/nkjp1m-1.2-xces.xml'): [8, 7],
9 | os.path.join(rootdir, 'data/small/train-gold.xml'): [10, 8, 6],
10 | os.path.join(rootdir, 'data/small/gold-task-c.xml'): [12, 12],
11 | os.path.join(rootdir, 'data/small/00130846.ann.xml'): [25],
12 | os.path.join(rootdir, 'data/small/00130846.xml'): [25],
13 | os.path.join(rootdir, 'data/small/00132482.ann.xml'): [2],
14 | os.path.join(rootdir, 'data/small/00132482.xml'): [2]
15 | }
16 |
17 |     for path, paragraph_lengths in data.items():
18 | print(path)
19 | for paragraph in read_xces(path):
20 | paragraph_raw = ''
21 | for sentence_gold in paragraph:
22 | paragraph_raw += sentence_gold.text()
23 |             paragraph_raw = paragraph_raw[1:]  # each sentence.text() starts with a separator; drop the leading one
24 | assert paragraph_raw == paragraph.text()
25 |
--------------------------------------------------------------------------------
/tests/test_system.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
4 | def test_download_model(bash, rootdir):
5 | commands = [
6 | 'cd %s' % rootdir,
7 | './download_model.sh'
8 | ]
9 |
10 | with bash() as s:
11 | for command in commands:
12 | s.run_script_inline([command])
13 |
14 |
15 | def test_process_xces(bash, rootdir):
16 | commands = [
17 | 'cd %s' % rootdir,
18 | 'cd ..',
19 | 'python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle',
20 | 'diff /tmp/nkjp.spickle tests/data/reference/nkjp1m-1.2.spickle']
21 |
22 | for command in commands:
23 | bash.run_script_inline([command])
24 |
25 |
26 | @pytest.mark.xfail(reason="version of morfeusz dictionary may influence results")
27 | def test_reanalyze(bash, rootdir):
28 | commands = [
29 | 'cd %s' % rootdir,
30 | 'cd ..',
31 | 'python3 reanalyze.py --maca_config $MACA_CONFIG /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle',
32 | 'diff /tmp/nkjp-reanalyzed.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.spickle'
33 | ]
34 |
35 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
36 | for command in commands:
37 | print(command)
38 | s.run_script_inline([command])
39 |
40 |
41 | def test_shuffle(bash, rootdir):
42 | commands = [
43 | 'cd %s' % rootdir,
44 | 'cd ..',
45 | 'python3 shuffle.py tests/data/reference/nkjp1m-1.2-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle',
46 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle'
47 | ]
48 |
49 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
50 | for command in commands:
51 | s.run_script_inline([command])
52 |
53 |
54 | def test_preprocess(bash, rootdir):
55 | commands = [
56 | 'cd %s' % rootdir,
57 | 'cd ..',
58 | 'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData',
59 | 'python3 preprocess_data.py /tmp/nkjp-reanalyzed.shuf.spickle /tmp/nkjp-reanalyzed.shuf.spickle.preprocess',
60 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle.preprocess tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess',
61 | ]
62 |
63 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
64 | for command in commands:
65 | s.run_script_inline([command])
66 |
67 |
68 | def test_create_dict(bash, rootdir):
69 | commands = [
70 | 'cd %s' % rootdir,
71 | 'cd ..',
72 | 'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData',
73 | 'python3 create_dict.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict',
74 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict',
75 | ]
76 |
77 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp'}) as s:
78 | for command in commands:
79 | s.run_script_inline([command])
80 |
81 |
82 | @pytest.mark.slow
83 | def test_train2(bash, rootdir):
84 | commands = [
85 | 'cd %s' % rootdir,
86 | 'cd ..',
87 | 'python3 train.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict --maca_config $MACA_CONFIG -e 2 --reproducible --hash test',
88 | 'h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5.new',
89 | 'h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final.new',
90 | ]
91 |
92 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
93 | for command in commands:
94 | s.run_script_inline([command])
95 |
96 |
97 | @pytest.mark.slow
98 | def test_train_lemmatization(bash, rootdir):
99 | commands = [
100 | 'cd %s' % rootdir,
101 | 'cd ..',
102 | 'python3 train_lemmatization.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess --reproducible --hash test',
103 | 'diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl',
104 | ]
105 |
106 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
107 | for command in commands:
108 | s.run_script_inline([command])
109 |
110 |
111 | def test_join_dicts(bash, rootdir):
112 | commands = [
113 | 'cd %s' % rootdir,
114 | 'cd ..',
115 | 'python3 join_dicts.py /tmp/joined_dicts.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict --reproducible',
116 | 'diff /tmp/joined_dicts.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.preprocess.dict',
117 | ]
118 |
119 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
120 | for command in commands:
121 | s.run_script_inline([command])
122 |
123 |
124 | def test_split_data(bash, rootdir):
125 | commands = [
126 | 'cd %s' % rootdir,
127 | 'cd ..',
128 | 'python3 split_data.py tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 0.2',
129 | 'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1',
130 | 'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.part2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2',
131 | ]
132 |
133 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
134 | for command in commands:
135 | s.run_script_inline([command])
136 |
137 |
138 | def test_join_data(bash, rootdir):
139 | commands = [
140 | 'cd %s' % rootdir,
141 | 'cd ..',
142 | 'python3 join_data.py /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.joined tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part1 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle.part2',
143 | 'diff /tmp/nkjp1m-1.2-reanalyzed.shuf.spickle.joined tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle',
144 | ]
145 |
146 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
147 | for command in commands:
148 | s.run_script_inline([command])
149 |
150 |
151 | @pytest.mark.slow
152 | def test_train(bash, rootdir):
153 | commands = [
154 | 'cd %s' % rootdir,
155 | 'cd ..',
156 | 'rm -f /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues',
157 | 'python3 krnnt_train.py --maca_config $MACA_CONFIG /tmp/nkjp-reanalyzed.shuf.spickle -e 2 --reproducible --hash test',
158 |
159 | 'h5diff weight_test.hdf5 tests/data/reference/weight_test.hdf5',
160 | 'h5diff weight_test.hdf5.final tests/data/reference/weight_test.hdf5.final',
161 | 'diff lemmatisation_test.pkl tests/data/reference/lemmatisation_test.pkl',
162 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2 tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2',
163 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData',
164 | 'diff /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues',
165 | ]
166 |
167 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
168 | for command in commands:
169 | s.run_script_inline([command])
170 |
171 |
172 | def test_run_xces(bash, rootdir):
173 | commands = [
174 | 'cd %s' % rootdir,
175 | 'cd ..',
176 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces < tests/data/reference/in_raw.txt',
177 | 'diff /tmp/out.xces tests/data/reference/out.xces'
178 | ]
179 |
180 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
181 | for command in commands:
182 | s.run_script_inline([command])
183 |
184 |
185 | def test_run_xces_from_training(bash, rootdir):
186 | commands = [
187 | 'cd %s' % rootdir,
188 | 'cd ..',
189 | 'python3 krnnt_run.py weight_test.hdf5.final lemmatisation_test.pkl /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces > /tmp/out.xces < tests/data/reference/in_raw.txt',
190 | 'diff /tmp/out.xces tests/data/reference/out.xces'
191 | ]
192 |
193 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
194 | for command in commands:
195 | s.run_script_inline([command])
196 |
197 |
198 | def test_run_plain(bash, rootdir):
199 | commands = [
200 | 'cd %s' % rootdir,
201 | 'cd ..',
202 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o plain > /tmp/out.plain < tests/data/reference/in_raw.txt',
203 | 'diff /tmp/out.plain tests/data/reference/out.plain'
204 | ]
205 |
206 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
207 | for command in commands:
208 | s.run_script_inline([command])
209 |
210 |
211 | def test_run_conll(bash, rootdir):
212 | commands = [
213 | 'cd %s' % rootdir,
214 | 'cd ..',
215 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conll > /tmp/out.conll < tests/data/reference/in_raw.txt',
216 | 'diff /tmp/out.conll tests/data/reference/out.conll'
217 | ]
218 |
219 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
220 | for command in commands:
221 | s.run_script_inline([command])
222 |
223 |
224 | def test_run_conllu(bash, rootdir):
225 | commands = [
226 | 'cd %s' % rootdir,
227 | 'cd ..',
228 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o conllu > /tmp/out.conllu < tests/data/reference/in_raw.txt',
229 | 'diff /tmp/out.conllu tests/data/reference/out.conllu'
230 | ]
231 |
232 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
233 | for command in commands:
234 | s.run_script_inline([command])
235 |
236 |
237 | def test_run_jsonl(bash, rootdir):
238 | commands = [
239 | 'cd %s' % rootdir,
240 | 'cd ..',
241 |
242 | 'python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o jsonl > /tmp/out.jsonl < tests/data/reference/in_raw.txt',
243 | 'diff /tmp/out.jsonl tests/data/reference/out.jsonl'
244 | ]
245 |
246 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': ''}) as s:
247 | for command in commands:
248 | s.run_script_inline([command])
249 |
250 |
251 | @pytest.mark.xfail(reason="non-deterministic lemmatisation?")
252 | def test_run_evaluation(bash, rootdir):
253 | commands = [
254 | 'cd %s' % rootdir,
255 | 'cd ..',
256 | 'cat tests/data/small/gold-task-c.txt | python3 krnnt_run.py tests/data/reference/weight_test.hdf5.final tests/data/reference/lemmatisation_test.pkl tests/data/reference/nkjp1m-1.2-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces',
257 | 'python2 tagger-eval.py tests/data/small/gold-task-c.xml /tmp/out.xces > /tmp/out_evaluation.txt',
258 |         'diff /tmp/out_evaluation.txt tests/data/reference/gold-task-c_evaluation.txt'
259 | ]
260 |
261 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
262 | for command in commands:
263 | s.run_script_inline([command])
264 |
265 |
266 | @pytest.mark.xfail(reason="non-deterministic lemmatisation?")
267 | def test_run_evaluation_from_training(bash, rootdir):
268 | commands = [
269 | 'cd %s' % rootdir,
270 | 'cd ..',
271 | 'cat tests/data/small/gold-task-c.txt | python3 krnnt_run.py weight_test.hdf5.final lemmatisation_test.pkl /tmp/nkjp-reanalyzed.shuf.spickle_FormatData2_PreprocessData_UniqueFeaturesValues --maca_config $MACA_CONFIG -o xces --reproducible > /tmp/out.xces',
272 | 'python2 tagger-eval.py tests/data/small/gold-task-c.xml /tmp/out.xces > /tmp/out_evaluation.txt',
273 |         'diff /tmp/out_evaluation.txt tests/data/reference/gold-task-c_evaluation.txt'
274 | ]
275 |
276 | with bash(envvars={'MACA_CONFIG': 'morfeusz2-nkjp', 'CUDA_VISIBLE_DEVICES': '', 'PYTHONHASHSEED': '0'}) as s:
277 | for command in commands:
278 | s.run_script_inline([command])
279 |
--------------------------------------------------------------------------------
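The system tests above fix the order of the training pipeline; the same sequence as one linear sketch (identical scripts and paths, run from the repository root with Maca installed):

```python
# Each step feeds the next; check=True stops on the first failure.
import subprocess

steps = [
    'python3 process_xces.py tests/data/small/nkjp1m-1.2-xces.xml /tmp/nkjp.spickle',
    'python3 reanalyze.py --maca_config morfeusz2-nkjp /tmp/nkjp.spickle /tmp/nkjp-reanalyzed.spickle',
    'python3 shuffle.py /tmp/nkjp-reanalyzed.spickle /tmp/nkjp-reanalyzed.shuf.spickle',
    'python3 preprocess_data.py /tmp/nkjp-reanalyzed.shuf.spickle /tmp/nkjp-reanalyzed.shuf.spickle.preprocess',
    'python3 create_dict.py /tmp/nkjp-reanalyzed.shuf.spickle.preprocess /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict',
    'python3 train.py /tmp/nkjp-reanalyzed.shuf.spickle.preprocess /tmp/nkjp-reanalyzed.shuf.spickle.preprocess.dict --maca_config morfeusz2-nkjp -e 2 --hash test',
]
for step in steps:
    subprocess.run(step, shell=True, check=True)
```
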
/tests/test_system_server.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def test_download_model(bash, rootdir):
5 | commands = [
6 | 'cd %s' % rootdir,
7 | './download_model.sh'
8 | ]
9 |
10 | with bash() as s:
11 | for command in commands:
12 | s.run_script_inline([command])
13 |
14 | #TODO: run server: python3 krnnt_serve.py model_data/ --maca_config morfeusz2-nkjp
15 |
16 | def test_post_raw(bash, rootdir):
17 | commands = [
18 | 'cd %s' % rootdir,
19 | 'cd ..',
20 | 'curl -X POST "http://localhost:9003" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt',
21 | 'diff /tmp/out.txt tests/data/server/out_raw.plain'
22 | ]
23 |
24 | with bash() as s:
25 | for command in commands:
26 | s.run_script_inline([command])
27 |
28 | def test_post_raw_jsonl(bash, rootdir):
29 | commands = [
30 | 'cd %s' % rootdir,
31 | 'cd ..',
32 | 'curl -X POST "http://localhost:9003/?output_format=jsonl&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt',
33 | 'diff /tmp/out.txt tests/data/server/out_raw.jsonl'
34 | ]
35 |
36 | with bash() as s:
37 | for command in commands:
38 | s.run_script_inline([command])
39 |
40 | def test_post_raw_conll(bash, rootdir):
41 | commands = [
42 | 'cd %s' % rootdir,
43 | 'cd ..',
44 | 'curl -X POST "http://localhost:9003/?output_format=conll&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt',
45 | 'diff /tmp/out.txt tests/data/server/out_raw.conll'
46 | ]
47 |
48 | with bash() as s:
49 | for command in commands:
50 | s.run_script_inline([command])
51 |
52 | def test_post_raw_conllu(bash, rootdir):
53 | commands = [
54 | 'cd %s' % rootdir,
55 | 'cd ..',
56 | 'curl -X POST "http://localhost:9003/?output_format=conllu&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt',
57 | 'diff /tmp/out.txt tests/data/server/out_raw.conllu'
58 | ]
59 |
60 | with bash() as s:
61 | for command in commands:
62 | s.run_script_inline([command])
63 |
64 | def test_post_raw_xces(bash, rootdir):
65 | commands = [
66 | 'cd %s' % rootdir,
67 | 'cd ..',
68 | 'curl -X POST "http://localhost:9003/?output_format=xces&input_format=lines" --data-binary @tests/data/server/in_raw.txt > /tmp/out.txt',
69 | 'diff /tmp/out.txt tests/data/server/out_raw.xces'
70 | ]
71 |
72 | with bash() as s:
73 | for command in commands:
74 | s.run_script_inline([command])
75 |
76 | def test_post_form(bash, rootdir):
77 | commands = [
78 | 'cd %s' % rootdir,
79 | 'cd ..',
80 | 'curl -X POST "http://localhost:9003" --data-binary "text=Lubię placki. Ala ma kota.\n\nRaz dwa trzy." > /tmp/out.txt'
81 | ]
82 |
83 | with bash() as s:
84 | for command in commands:
85 | s.run_script_inline([command])
86 |
87 | generated = open('/tmp/out.txt').read()
88 | reference = open(os.path.join(rootdir,'data/server/out_raw.plain')).read()
89 |
90 | assert reference in generated
91 |
92 | def test_post_tokenized_json(bash, rootdir):
93 | commands = [
94 | 'cd %s' % rootdir,
95 | 'cd ..',
96 | 'curl -X POST -H "Content-Type: application/json" "http://localhost:9003" -d @tests/data/server/in_tokenized.json > /tmp/out.txt',
97 | 'diff -B /tmp/out.txt tests/data/server/out_raw.plain'
98 | ]
99 |
100 | with bash() as s:
101 | for command in commands:
102 | s.run_script_inline([command])
103 |
104 | def test_post_tokenized_compact_json(bash, rootdir):
105 | commands = [
106 | 'cd %s' % rootdir,
107 | 'cd ..',
108 | 'curl -X POST -H "Content-Type: application/json" "http://localhost:9003" -d @tests/data/server/in_tokenized_compact.json > /tmp/out.txt',
109 | 'diff -B /tmp/out.txt tests/data/server/out_raw.plain'
110 | ]
111 |
112 | with bash() as s:
113 | for command in commands:
114 | s.run_script_inline([command])
115 |
116 | def test_post_raw_poleval(bash, rootdir):
117 | commands = [
118 | 'cd %s' % rootdir,
119 | 'cd ..',
120 | 'curl -X POST "http://localhost:9003" --data-binary @tests/data/full/test-raw.txt > /tmp/out.txt'
121 | ]
122 |
123 | with bash() as s:
124 | for command in commands:
125 | s.run_script_inline([command])
--------------------------------------------------------------------------------
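requests-based equivalents of the curl calls above, for clients driving the server from Python; the same running krnnt_serve.py instance on localhost:9003 is assumed:

```python
import json

import requests

raw = open('tests/data/server/in_raw.txt', 'rb').read()

# Default: raw text in, plain format out.
print(requests.post('http://localhost:9003', data=raw).text)

# Explicit formats via query parameters, as in the curl tests.
print(requests.post('http://localhost:9003/',
                    params={'output_format': 'conllu', 'input_format': 'lines'},
                    data=raw).text)

# Pre-tokenised JSON input.
payload = json.load(open('tests/data/server/in_tokenized.json'))
print(requests.post('http://localhost:9003', json=payload).text)
```
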
/tests/test_tagset.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from krnnt.analyzers import MacaAnalyzer
3 | from krnnt.new import get_morfeusz, analyze_token
4 |
5 | MACA_CONFIG1 = 'morfeusz-nkjp-official'
6 | MACA_CONFIG2 = 'morfeusz2-nkjp'
7 |
8 | @pytest.fixture
9 | def maca():
10 |     try:
11 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG1)
12 |         list(maca_analyzer._maca("test"))
13 |     except Exception:  # config unavailable, fall back to the morfeusz2 one
14 |         maca_analyzer = MacaAnalyzer(MACA_CONFIG2)
15 |         list(maca_analyzer._maca("test"))
16 |
17 | return maca_analyzer
18 |
19 | test_data = [
20 | ('IV', '', 'num:::'),
21 | ('IV', '', 'romandig'),
22 | ('1', '', 'dig'),
23 | ('prostu', 'adjp', 'adjp:gen'),
24 | (':)', '', 'emo'),
25 | ('godzien', 'adjc', ''),
26 | ('oślep', 'burk', 'frag'),
27 | ('obojga', 'numcol:pl:gen:m1:rec', ''),
28 | ('dwoje', 'numcol:pl:acc:m1:rec', ''),
29 | ('czworo', 'numcol:pl:nom:m1:rec', ''),
30 | ('hej', 'interj', ''),
31 | ('jeszcze', 'qub', 'part'),
32 | ('czterem', 'num:pl:dat:m1:congr', ''),
33 | ('czym', 'conj', 'comp'),
34 | ('niedaleko', 'prep:gen', ''),
35 | ('doprawdy', 'qub', 'adv'),
36 | ('jak', 'qub', 'adv'),
37 | ('pół', '', 'numcomp'),
38 | ('pół', '', 'num:comp'),
39 | ('pół', 'num:pl:acc:n:rec', ''),
40 | ('słowa', 'subst:pl:acc:n', 'subst:sg:gen:n:ncol'),
41 | ('rozklepywało', '', 'praet:sg:n1:ter:imperf'),
42 | ('bardzo', 'adv:pos', 'adv'),
43 | ('bardziej', 'adv:com', ''),
44 | ('znacząco', 'adv:pos', 'pacta'),
45 | ('my', '', 'ppron12:pl:nom:_:pri'),
46 | ('sobie', 'siebie:dat', ''),
47 | ('zł', 'brev:npun', 'brev'),
48 | ]
49 |
50 | @pytest.mark.parametrize('form, exist, not_exist', test_data)
51 | @pytest.mark.xfail
52 | def test_maca(maca, form, exist, not_exist):
53 |     paragraph = maca.analyze(form)
54 |     sentence = paragraph.sentences[0]
55 |     token = sentence.tokens[0]
56 | tags = [form.tags for form in token.interpretations]
57 | print(tags)
58 | if exist:
59 | assert exist in tags
60 | if not_exist:
61 | assert not_exist not in tags
62 |
63 | @pytest.mark.parametrize('form, exist, not_exist', test_data)
64 | @pytest.mark.xfail
65 | def test_morfeusz(maca, form, exist, not_exist):
66 | morfeusz = get_morfeusz()
67 |     tags = [tag for _form, tag in analyze_token(morfeusz, form)]  # avoid shadowing the 'form' parameter
68 | print(tags)
69 | if exist:
70 | assert exist in tags
71 | if not_exist:
72 | assert not_exist not in tags
--------------------------------------------------------------------------------
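For reference, the two call patterns the parametrised tests compare, each reduced to a single form (Maca and Morfeusz must be installed; the returned tag inventory depends on the dictionary version, which is why both tests are marked xfail):

```python
from krnnt.analyzers import MacaAnalyzer
from krnnt.new import get_morfeusz, analyze_token

# Morfeusz directly: (form, tag) pairs for one token.
morfeusz = get_morfeusz()
print([tag for _form, tag in analyze_token(morfeusz, 'placki')])

# Maca: tokenisation plus analysis, as in the maca fixture above.
maca = MacaAnalyzer('morfeusz2-nkjp')
token = maca.analyze('placki').sentences[0].tokens[0]
print([form.tags for form in token.interpretations])
```
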
/tests/test_writers.py:
--------------------------------------------------------------------------------
1 | from krnnt.writers import results_to_conll_str, results_to_conllu_str, results_to_txt_str, results_to_plain_str, \
2 | results_to_xces_str
3 |
4 | results = [[[{'token': 'Lubię', 'sep': 'newline', 'prob': 0.37375012, 'tag': 'adj:pl:nom:m1:pos', 'lemmas': ['Lubię'],
5 | 'start': 0, 'end': 5},
6 | {'token': 'placki', 'sep': 'space', 'prob': 0.38550463, 'tag': 'subst:pl:nom:m1', 'lemmas': ['placki'],
7 | 'start': 6, 'end': 12},
8 | {'token': '.', 'sep': 'none', 'prob': 0.99999726, 'tag': 'interp', 'lemmas': ['.'], 'start': 12,
9 | 'end': 13}], [
10 | {'token': 'Ala', 'sep': 'space', 'prob': 0.9995969, 'tag': 'subst:sg:nom:f', 'lemmas': ['Ala'],
11 | 'start': 14, 'end': 17},
12 | {'token': 'ma', 'sep': 'space', 'prob': 0.6605565, 'tag': 'subst:sg:nom:f', 'lemmas': ['ma'],
13 | 'start': 18, 'end': 20},
14 | {'token': 'kota', 'sep': 'space', 'prob': 0.93132496, 'tag': 'subst:sg:nom:f', 'lemmas': ['kota'],
15 | 'start': 21, 'end': 25},
16 | {'token': '.', 'sep': 'none', 'prob': 0.9999993, 'tag': 'interp', 'lemmas': ['.'], 'start': 25,
17 | 'end': 26}]], [[
18 | {'token': 'Raz', 'sep': 'space', 'prob': 0.23650545, 'tag': 'subst:sg:nom:f', 'lemmas': ['Raz'],
19 | 'start': 27, 'end': 30},
20 | {'token': 'dwa', 'sep': 'space', 'prob': 0.581044, 'tag': 'adj:pl:acc:f:pos', 'lemmas': ['dwa'],
21 | 'start': 31, 'end': 34},
22 | {'token': 'trzy', 'sep': 'space', 'prob': 0.71970826, 'tag': 'subst:pl:acc:f', 'lemmas': ['trzy'],
23 | 'start': 35, 'end': 39},
24 | {'token': '.', 'sep': 'none', 'prob': 0.99999905, 'tag': 'interp', 'lemmas': ['.'], 'start': 39,
25 | 'end': 40}]]]
26 |
27 |
28 | def test_conll():
29 |     reference = \
30 | """Lubię Lubię 1 adj:pl:nom:m1:pos 0 5
31 | placki placki 1 subst:pl:nom:m1 6 12
32 | . . 0 interp 12 13
33 |
34 | Ala Ala 1 subst:sg:nom:f 14 17
35 | ma ma 1 subst:sg:nom:f 18 20
36 | kota kota 1 subst:sg:nom:f 21 25
37 | . . 0 interp 25 26
38 |
39 |
40 | Raz Raz 1 subst:sg:nom:f 27 30
41 | dwa dwa 1 adj:pl:acc:f:pos 31 34
42 | trzy trzy 1 subst:pl:acc:f 35 39
43 | . . 0 interp 39 40
44 |
45 | """
46 | output = results_to_conll_str(results)
47 | assert output == reference
48 |
49 | def test_conllu():
50 |     reference = \
51 | """1 Lubię Lubię _ adj:pl:nom:m1:pos _ _ _ _ _
52 | 2 placki placki _ subst:pl:nom:m1 _ _ _ _ _
53 | 3 . . _ interp _ _ _ _ _
54 |
55 | 1 Ala Ala _ subst:sg:nom:f _ _ _ _ _
56 | 2 ma ma _ subst:sg:nom:f _ _ _ _ _
57 | 3 kota kota _ subst:sg:nom:f _ _ _ _ _
58 | 4 . . _ interp _ _ _ _ _
59 |
60 |
61 | 1 Raz Raz _ subst:sg:nom:f _ _ _ _ _
62 | 2 dwa dwa _ adj:pl:acc:f:pos _ _ _ _ _
63 | 3 trzy trzy _ subst:pl:acc:f _ _ _ _ _
64 | 4 . . _ interp _ _ _ _ _
65 |
66 | """
67 | output = results_to_conllu_str(results)
68 | assert output == reference
69 |
70 | def test_txt():
71 |     reference = \
72 | """Lubię placki.
73 | Ala ma kota.
74 |
75 | Raz dwa trzy.
76 |
77 | """
78 | output = results_to_txt_str(results)
79 |
80 | assert output == reference
81 |
82 | def test_plain():
83 |     reference = \
84 | """Lubię newline
85 | Lubię adj:pl:nom:m1:pos disamb
86 | placki space
87 | placki subst:pl:nom:m1 disamb
88 | . none
89 | . interp disamb
90 |
91 | Ala space
92 | Ala subst:sg:nom:f disamb
93 | ma space
94 | ma subst:sg:nom:f disamb
95 | kota space
96 | kota subst:sg:nom:f disamb
97 | . none
98 | . interp disamb
99 |
100 |
101 | Raz space
102 | Raz subst:sg:nom:f disamb
103 | dwa space
104 | dwa adj:pl:acc:f:pos disamb
105 | trzy space
106 | trzy subst:pl:acc:f disamb
107 | . none
108 | . interp disamb
109 |
110 | """
111 | output = results_to_plain_str(results)
112 | assert output == reference
113 |
114 | def test_xces():
115 |     reference = \
116 | """<?xml version="1.0" encoding="UTF-8"?>
117 | <!DOCTYPE cesAna SYSTEM "xcesAnaIPI.dtd">
118 | <cesAna xmlns:xlink="http://www.w3.org/1999/xlink" version="1.0" type="lex disamb">
119 | <chunkList>
120 |  <chunk type="p">
121 |   <chunk type="s">
122 |    <tok>
123 |     <orth>Lubię</orth>
124 |     <lex disamb="1"><base>Lubię</base><ctag>adj:pl:nom:m1:pos</ctag></lex>
125 |    </tok>
126 |    <tok>
127 |     <orth>placki</orth>
128 |     <lex disamb="1"><base>placki</base><ctag>subst:pl:nom:m1</ctag></lex>
129 |    </tok>
130 |    <ns/>
131 |    <tok>
132 |     <orth>.</orth>
133 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
134 |    </tok>
135 |   </chunk>
136 |   <chunk type="s">
137 |    <tok>
138 |     <orth>Ala</orth>
139 |     <lex disamb="1"><base>Ala</base><ctag>subst:sg:nom:f</ctag></lex>
140 |    </tok>
141 |    <tok>
142 |     <orth>ma</orth>
143 |     <lex disamb="1"><base>ma</base><ctag>subst:sg:nom:f</ctag></lex>
144 |    </tok>
145 |    <tok>
146 |     <orth>kota</orth>
147 |     <lex disamb="1"><base>kota</base><ctag>subst:sg:nom:f</ctag></lex>
148 |    </tok>
149 |    <ns/>
150 |    <tok>
151 |     <orth>.</orth>
152 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
153 |    </tok>
154 |   </chunk>
155 |  </chunk>
156 |  <chunk type="p">
157 |   <chunk type="s">
158 |    <tok>
159 |     <orth>Raz</orth>
160 |     <lex disamb="1"><base>Raz</base><ctag>subst:sg:nom:f</ctag></lex>
161 |    </tok>
162 |    <tok>
163 |     <orth>dwa</orth>
164 |     <lex disamb="1"><base>dwa</base><ctag>adj:pl:acc:f:pos</ctag></lex>
165 |    </tok>
166 |    <tok>
167 |     <orth>trzy</orth>
168 |     <lex disamb="1"><base>trzy</base><ctag>subst:pl:acc:f</ctag></lex>
169 |    </tok>
170 |    <ns/>
171 |    <tok>
172 |     <orth>.</orth>
173 |     <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
174 |    </tok>
175 |   </chunk>
176 |  </chunk>
177 | </chunkList>
178 | </cesAna>"""
179 |
180 | output = results_to_xces_str(results)
181 | assert output == reference
--------------------------------------------------------------------------------
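Every writer above consumes the same shape: a list of paragraphs, each a list of sentences, each a list of token dicts with token/sep/prob/tag/lemmas/start/end keys. A minimal input and one conversion:

```python
from krnnt.writers import results_to_conll_str

minimal = [[[  # one paragraph with one single-token sentence
    {'token': 'Ala', 'sep': 'newline', 'prob': 1.0, 'tag': 'subst:sg:nom:f',
     'lemmas': ['Ala'], 'start': 0, 'end': 3},
]]]
print(results_to_conll_str(minimal))  # one line per token, as in the test_conll reference
```
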
/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import logging
4 | from argparse import ArgumentParser
5 |
6 | from keras.models import load_model
7 |
8 | from krnnt.keras_models import BEST, ExperimentParameters
9 | from krnnt.new import UnalignedSimpleEvaluator
10 | from krnnt.tagger_exps import RunFolds2, KerasData, RunExperiment, KerasData2, RunExperiment2
11 |
12 | logging.basicConfig(level=logging.DEBUG)
13 |
14 | if __name__ == '__main__':
15 | parser = ArgumentParser()
16 | parser.add_argument('data_path', help='path to preprocessed data')
17 | parser.add_argument('features_dict', help='path to features dict')
18 |
19 | parser.add_argument('-p', '--preanalyzed', action='store_false',
20 | default=True, dest='reanalyzed',
21 | help='training data have not been reanalyzed')
22 | parser.add_argument('-c', '--cv', action='store_true',
23 | default=False, dest='cv',
24 | help='run 10-fold cross-validation')
25 | parser.add_argument('-t', '--train_ratio',
26 | default=1.0, dest='train_ratio', type=float,
27 | help='percentage of data for training')
28 | parser.add_argument('-d', '--dev_ratio',
29 | default=0.0, dest='dev_ratio', type=float,
30 | help='percentage of training data for development')
31 | parser.add_argument('--dev_data', default='0.0', help='dev data ratio or path to dev data')
32 | parser.add_argument('--test_data', default='0.0', help='test data ratio or path to test data')
33 | parser.add_argument('--load_model', default=None, help='path to pretrained model')
34 | parser.add_argument('-e', '--epochs',
35 | default=100, dest='epochs', type=int,
36 | help='number of epochs')
37 | parser.add_argument('--patience',
38 | default=10, dest='patience', type=int,
39 | help='patience')
40 | parser.add_argument('--maca_config',
41 | default='morfeusz2-nkjp',
42 | help='Maca config')
43 | parser.add_argument('--tensor_board',
44 | action='store_true',
45 | help='save data for TensorBoard')
46 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO
47 | parser.add_argument('--hash', action='store', default=None, dest='hash')
48 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
49 | parser.add_argument('-f', '--fold', default=None, dest='fold')
50 | args = parser.parse_args()
51 |
52 | if args.reproducible:
53 | from numpy.random import seed
54 | seed(1337)
55 | import random as rn
56 | rn.seed(1337)
57 | import tensorflow as tf
58 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
59 | inter_op_parallelism_threads=1)
60 | from keras import backend as K
61 | tf.set_random_seed(1337)
62 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
63 | K.set_session(sess)
64 |
65 | pref = {'nb_epoch': 100, 'batch_size': 256,
66 | 'internal_neurons': 256, 'feature_name': 'tags4e3', 'label_name': 'label',
67 | 'evaluator': UnalignedSimpleEvaluator, 'patience': 10,
68 | 'weight_path': 'weights.hdf5', 'samples_per_epoch': 10000, 'keras_model_class': BEST,
69 | 'corpus_path': 'data/train-reanalyzed.spickle', 'reanalyze': True, 'train_data_ratio': 0.9,
70 | 'dev_data_ratio': 0.1}
71 |
72 | pref['reanalyze'] = args.reanalyzed
73 | pref['train_data_ratio'] = float(args.train_ratio)
74 | pref['dev_data_ratio'] = float(args.dev_ratio)
75 |
76 |     pref['tensor_board'] = args.tensor_board
77 | pref['nb_epoch'] = args.epochs
78 |
79 |     pref['patience'] = args.patience
80 |     pref['dev_data'] = args.dev_data
81 |     if pref['dev_data'] == '0.0':
82 |         # no dev data to early-stop on: let training run for all epochs
83 |         pref['patience'] = pref['nb_epoch']
84 |     pref['test_data'] = args.test_data
85 |     pref['load_model'] = args.load_model
86 |
87 |     # pref['corpus_path'] = args.corpus_path
88 |     pref['maca_config'] = args.maca_config
89 | if args.hash is not None:
90 | pref['h'] = args.hash
91 | if args.fold is not None:
92 | pref['fold'] = int(args.fold)
93 |
94 | keras_model_class = pref['keras_model_class']
95 |
96 | if args.cv:
97 | logging.error('CV is not supported')
98 | # rf = RunFolds2(keras_model_class, pref)
99 | # rf.run()
100 | else:
101 | parameters = ExperimentParameters(pref)
102 |
103 | km = keras_model_class(parameters)
104 |
105 |
106 |
107 |
108 | print('Model will be saved under: %s.final' % parameters.pref['weight_path'])
109 |
110 | kd = KerasData2(args.data_path, args.features_dict, parameters)
111 | re = RunExperiment2(kd, km)
112 | re.run()
113 |
114 | print('Model is saved under: %s' % parameters.pref['weight_path'])
115 |
116 |
--------------------------------------------------------------------------------
/train_lemmatization.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from argparse import ArgumentParser
5 |
6 | from krnnt.keras_models import ExperimentParameters
7 | from krnnt.tagger_exps import KerasData2, RunLemma
8 |
9 | if __name__ == '__main__':
10 | parser = ArgumentParser(description='Train lemmatization')
11 | parser.add_argument('data_path', help='path to preprocessed data')
12 |
13 |
14 | parser.add_argument('-t', '--train_ratio',
15 | default=1.0, dest='train_ratio', type=float,
16 | help='percentage of data for training')
17 | parser.add_argument('-d', '--dev_ratio',
18 | default=0.0, dest='dev_ratio', type=float,
19 | help='percentage of training data for development')
20 | parser.add_argument('--dev_data', default='0.1', help='dev data ratio or path to dev data')
21 | parser.add_argument('--test_data', default='0.1', help='test data ratio or path to test data')
22 | parser.add_argument('-g', '--debug', action='store_true', dest='debug_mode') # TODO
23 | parser.add_argument('--hash', action='store', default=None, dest='hash')
24 | parser.add_argument('--reproducible', action='store_true', default=False, help='set seeds')
25 |
26 | args = parser.parse_args()
27 |
28 | if args.reproducible:
29 | from numpy.random import seed
30 | seed(1337)
31 | import random as rn
32 | rn.seed(1337)
33 | import tensorflow as tf
34 | session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
35 | inter_op_parallelism_threads=1)
36 | from keras import backend as K
37 | tf.set_random_seed(1337)
38 | sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
39 | K.set_session(sess)
40 |
41 | pref = {
42 | 'train_data_ratio': float(args.train_ratio),
43 | 'dev_data_ratio': float(args.dev_ratio),
44 | 'dev_data': args.dev_data,
45 | 'test_data': args.test_data
46 | }
47 |
48 | if args.hash is not None:
49 | pref['h'] = args.hash
50 |
51 |
52 | parameters = ExperimentParameters(pref)
53 |
54 | kd = KerasData2(args.data_path, None, parameters)
55 | re = RunLemma(kd)
56 | re.learn_lemma()
57 |
58 | print('Lemmatisation model is saved under: %s' % parameters.pref['lemmatisation_path'])
59 |
60 | #TODO: CV; remove the TensorFlow dependency; KerasData2 without a dictionary
--------------------------------------------------------------------------------
/voting.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import sys
3 |
4 |
5 | from krnnt.writers import results_to_xces_str
6 | from krnnt.readers import read_xces
7 |
8 | # path='/home/djstrong/projects/repos/krnnt/models/voting/'
9 | # path='/home/djstrong/projects/repos/krnnt/'
10 | # files=[path+'text-raw.'+str(i)+'.xml' for i in range(4)]
11 | # files=[path+str(i)+'b.xml' for i in range(10)]
12 |
13 | path = sys.argv[1]
14 | files = [path + str(i) + '.xml' for i in range(10)]
15 |
16 | def all_equal(iterator):
17 |     return len(set(iterator)) == 1
18 |
19 | xcess = [read_xces(file) for file in files]
20 |
21 | result = []
22 |
23 | count_all = 0
24 | count_mismatch = 0
25 |
26 | while True:
27 | try:
28 | paragraphs = [next(xces) for xces in xcess]
29 |
30 | for sentences in zip(*paragraphs):
31 | sentence = []
32 | result.append(sentence)
33 | for tokens in zip(*sentences):
34 |                 count_all += 1
35 | # print(tokens)
36 | forms = [token.gold_form for token in tokens]
37 | tags = [form.tags for form in forms]
38 |
39 |                 token_result = {'sep': 'space' if tokens[0].space_before else 'none', 'token': tokens[0].form}
40 | sentence.append(token_result)
41 |                 if not all_equal(tags):
42 | # print(tags)
43 |                     tags_count = collections.defaultdict(list)
44 | for form in forms:
45 | tags_count[form.tags].append(form)
46 | # print(tags_count)
47 |
48 | sorted_forms = sorted(tags_count.items(), key=lambda x: len(x[1]), reverse=True)
49 | # print(tokens[0].form, '\t'*(3-int(len(tokens[0].form)/8)), [(form[0], len(form[1])) for form in sorted_forms])
50 | winner = sorted_forms[0][1][0]
51 |
52 |
53 |
54 |                     token_result['tag'] = winner.tags
55 |                     token_result['lemmas'] = [winner.lemma]
56 |                     count_mismatch += 1
57 | else:
58 | # print(tokens[0].form, '\t'*(3-int(len(tokens[0].form)/8)), forms[0].tags)
59 |                     token_result['tag'] = forms[0].tags
60 |                     token_result['lemmas'] = [forms[0].lemma]
61 |
62 | # print()
63 | # print()
64 |
65 |
66 |
67 | except StopIteration:
68 | break
69 |
70 |
71 | print(results_to_xces_str(result))
72 |
73 | print(count_all, count_mismatch, file=sys.stderr)
--------------------------------------------------------------------------------
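The disagreement handling in voting.py, reduced to its core: the tag backed by the most taggers wins. A minimal sketch (plain majority with first-seen tie-breaking; the script above additionally keeps the winning form's lemma):

```python
import collections

def majority_tag(tags):
    # Most frequent tag wins; ties go to the first tag encountered.
    return collections.Counter(tags).most_common(1)[0][0]

votes = ['subst:sg:nom:f'] * 6 + ['adj:sg:nom:f:pos'] * 4
assert majority_tag(votes) == 'subst:sg:nom:f'
```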